Example #1
 def _merged_entry(grp,
                   union=UNION_FIELDS,
                   ignore=IGNORE_FIELDS,
                   raw=False):
     # TODO: consider implementing (a subset of?) onlyifnot logic:
     # {'address': 'publisher', 'lgfamily': 'lgcode', 'publisher': 'school',
     # 'journal': 'booktitle'}
     fields = {
         field: values[0][0] if field not in union else ', '.join(
             unique(vl for vl, fn, bk in values))
         for field, values in grp if field not in ignore
     }
     fields['src'] = ', '.join(
         sorted(
             set(
                 fn.partition('.bib')[0] for field, values in grp
                 for vl, fn, bk in values)))
     fields['srctrickle'] = ', '.join(
         sorted(
             set('%s#%s' % (fn.partition('.bib')[0], bk)
                 for field, values in grp for vl, fn, bk in values)))
     if raw:
         return fields
     entrytype = fields.pop('ENTRYTYPE')
     return entrytype, fields
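
For orientation, here is a minimal sketch of the grouped input that _merged_entry consumes. The shape is inferred from the comprehensions above; the field names, values, and .bib filenames are hypothetical:

# Each item pairs a field name with a list of (value, filename, bibkey)
# triples, one triple per source entry in the group (hypothetical data):
grp = [
    ('ENTRYTYPE', [('book', 'hh.bib', 'smith2001'),
                   ('book', 'iso6393.bib', 'smith-kanuri')]),
    ('title', [('A grammar of Kanuri', 'hh.bib', 'smith2001'),
               ('A Grammar of Kanuri', 'iso6393.bib', 'smith-kanuri')]),
]
# With raw=False, _merged_entry would return ('book', fields), where
# fields keeps the first value of each plain field, joins the values of
# any UNION_FIELDS, and adds src ('hh, iso6393') and
# srctrickle ('hh#smith2001, iso6393#smith-kanuri').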
Example #2
 def show_merges(self):
     with self.connect() as conn:
         cursor = conn.execute(
             'SELECT hash, refid, filename, bibkey '
             'FROM entry AS e WHERE EXISTS (SELECT 1 FROM entry '
             'WHERE hash = e.hash AND refid != e.refid) '
             'ORDER BY hash, refid DESC, filename, bibkey')
         for hash, group in group_first(cursor):
             self.print_group(conn, group)
             new = self._merged_entry(self._entrygrp(conn, hash), raw=True)
             cand = [(ri, self._merged_entry(self._entrygrp(conn, ri), raw=True))
                     for ri in unique(ri for hs, ri, fn, bk in group)]
             old = min(cand, key=lambda p: distance(new, p[1]))[0]
             print('-> %s\n' % old)
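
group_first is not shown in these excerpts. A plausible sketch, assuming it buckets the ordered cursor rows by their first column (which is what the ORDER BY clauses above would support):

from itertools import groupby
from operator import itemgetter

def group_first(rows):
    # Group consecutive rows that share the same first column, yielding
    # (first_value, list_of_rows) pairs; this relies on the caller's
    # SQL ORDER BY to make equal keys adjacent.
    for key, grouped in groupby(rows, key=itemgetter(0)):
        yield key, list(grouped)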
Example #3
 def show_splits(self):
     with self.connect() as conn:
         cursor = conn.execute(
             'SELECT refid, hash, filename, bibkey '
             'FROM entry AS e WHERE EXISTS (SELECT 1 FROM entry '
             'WHERE refid = e.refid AND hash != e.hash) '
             'ORDER BY refid, hash, filename, bibkey')
         for refid, group in group_first(cursor):
             self.print_group(conn, group)
             old = self._merged_entry(self._entrygrp(conn, refid), raw=True)
             cand = [(hs, self._merged_entry(self._entrygrp(conn, hs), raw=True))
                     for hs in unique(hs for ri, hs, fn, bk in group)]
             new = min(cand, key=lambda p: distance(old, p[1]))[0]
             print('-> %s\n' % new)
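
distance is also external to these excerpts. As a rough stand-in consistent with how it is called (two raw field dicts in, a comparable number out, smaller meaning more similar), something like the following would fit; the real pyglottolog helper may weight fields quite differently:

def distance(a, b):
    # Hypothetical stand-in: the fraction of field names on which the
    # two merged entries disagree; 0.0 means identical field dicts.
    keys = set(a) | set(b)
    if not keys:
        return 0.0
    return sum(a.get(k) != b.get(k) for k in keys) / len(keys)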
Example #4
 def _merged_entry(grp, union=UNION_FIELDS, ignore=IGNORE_FIELDS, raw=False):
     # TODO: consider implementing (a subset of?) onlyifnot logic:
     # {'address': 'publisher', 'lgfamily': 'lgcode', 'publisher': 'school', 'journal': 'booktitle'}
     fields = {field: values[0][0] if field not in union
         else ', '.join(unique(vl for vl, fn, bk in values))
         for field, values in grp if field not in ignore}
     fields['src'] = ', '.join(sorted(set(fn.partition('.bib')[0]
         for field, values in grp for vl, fn, bk in values)))
     fields['srctrickle'] = ', '.join(sorted(set('%s#%s' % (fn.partition('.bib')[0], bk)
         for field, values in grp for vl, fn, bk in values)))
     if raw:
         return fields
     entrytype = fields.pop('ENTRYTYPE')
     return entrytype, fields
Example #5
 def show_merges(self):
     with self.connect() as conn:
         cursor = conn.execute(
             'SELECT hash, refid, filename, bibkey '
             'FROM entry AS e WHERE EXISTS (SELECT 1 FROM entry '
             'WHERE hash = e.hash AND refid != e.refid) '
             'ORDER BY hash, refid DESC, filename, bibkey')
         for hash, group in group_first(cursor):
             self.print_group(conn, group)
             new = self._merged_entry(self._entrygrp(conn, hash), raw=True)
             cand = [(ri,
                      self._merged_entry(self._entrygrp(conn, ri),
                                         raw=True))
                     for ri in unique(ri for hs, ri, fn, bk in group)]
             old = min(cand, key=lambda p: distance(new, p[1]))[0]
             print('-> %s\n' % old)
Example #6
 def show_splits(self):
     with self.connect() as conn:
         cursor = conn.execute(
             'SELECT refid, hash, filename, bibkey '
             'FROM entry AS e WHERE EXISTS (SELECT 1 FROM entry '
             'WHERE refid = e.refid AND hash != e.hash) '
             'ORDER BY refid, hash, filename, bibkey')
         for refid, group in group_first(cursor):
             self.print_group(conn, group)
             old = self._merged_entry(self._entrygrp(conn, refid), raw=True)
             cand = [(hs,
                      self._merged_entry(self._entrygrp(conn, hs),
                                         raw=True))
                     for hs in unique(hs for ri, hs, fn, bk in group)]
             new = min(cand, key=lambda p: distance(old, p[1]))[0]
             print('-> %s\n' % new)
Example #7
def keyid(fields, fd, ti=2, infinity=float('inf')):
    if 'author' not in fields:
        if 'editor' not in fields:
            values = ''.join(v for f, v in bibord_iteritems(fields)
                             if f != 'glottolog_ref_id')
            return '__missingcontrib__' + reokkey.sub('_', values.lower())
        else:
            astring = fields['editor']
    else:
        astring = fields['author']

    authors = pauthor(astring)
    if len(authors) != len(astring.split(' and ')):
        print("Unparsed author in", authors)
        print("   ", astring, astring.split(' and '))
        print(fields.get('title'))

    ak = [
        undiacritic(x) for x in sorted(
            lastnamekey(a['lastname']) for a in authors)
    ]
    yk = pyear(fields.get('year', '[nd]'))[:4]
    tks = wrds(fields.get("title", "no.title"))  # takeuntil :
    # select the (leftmost) two least frequent words from the title
    types = list(unique(w for w in tks if rewrdtok.match(w)))
    tk = nsmallest(ti, types, key=lambda w: fd.get(w, infinity))
    # put them back into the title order (i.e. 'spam eggs' != 'eggs spam')
    order = {w: i for i, w in enumerate(types)}
    tk.sort(key=lambda w: order[w])
    if 'volume' in fields and all(f not in fields
                                  for f in ['journal', 'booktitle', 'series']):
        vk = roman(fields['volume'])
    else:
        vk = ''

    if 'extra_hash' in fields:
        yk = yk + fields['extra_hash']

    key = '-'.join(ak) + "_" + '-'.join(tk) + vk + yk
    return reokkey.sub("", key.lower())
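
Putting the pieces together, the key concatenates sorted author last names, the two rarest title words, an optional roman-numeral volume, and the year. For a hypothetical input (assuming pauthor and lastnamekey extract 'smith' from the author string) the result would look like:

# fields = {'author': 'Jane Smith', 'title': 'A grammar of Kanuri',
#           'year': '2001'}
# ak = ['smith'];  tk = ['grammar', 'kanuri'];  vk = '';  yk = '2001'
# key = 'smith_grammar-kanuri2001'   (before the final reokkey cleanup)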
Example #8
def keyid(fields, fd, ti=2, infinity=float('inf')):
    if 'author' not in fields:
        if 'editor' not in fields:
            values = ''.join(
                v for f, v in bibord_iteritems(fields) if f != 'glottolog_ref_id')
            return '__missingcontrib__' + reokkey.sub('_', values.lower())
        else:
            astring = fields['editor']
    else:
        astring = fields['author']

    authors = pauthor(astring)
    if len(authors) != len(astring.split(' and ')):
        print("Unparsed author in", authors)
        print("   ", astring, astring.split(' and '))
        print(fields.get('title'))

    ak = [undiacritic(x) for x in sorted(lastnamekey(a['lastname']) for a in authors)]
    yk = pyear(fields.get('year', '[nd]'))[:4]
    tks = wrds(fields.get("title", "no.title"))  # takeuntil :
    # select the (leftmost) two least frequent words from the title
    types = list(unique(w for w in tks if rewrdtok.match(w)))
    tk = nsmallest(ti, types, key=lambda w: fd.get(w, infinity))
    # put them back into the title order (i.e. 'spam eggs' != 'eggs spam')
    order = {w: i for i, w in enumerate(types)}
    tk.sort(key=lambda w: order[w])
    if 'volume' in fields and all(
            f not in fields for f in ['journal', 'booktitle', 'series']):
        vk = roman(fields['volume'])
    else:
        vk = ''

    if 'extra_hash' in fields:
        yk = yk + fields['extra_hash']

    key = '-'.join(ak) + "_" + '-'.join(tk) + vk + yk
    return reokkey.sub("", key.lower())
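
The title-key step of keyid can be run in isolation. A small worked example with a hypothetical word-frequency dict fd: nsmallest picks the two rarest title words, and the final sort restores their original title order:

from heapq import nsmallest

fd = {'a': 9000, 'grammar': 500, 'of': 10000, 'kanuri': 3}  # hypothetical
types = ['a', 'grammar', 'of', 'kanuri']  # title words, duplicates removed

tk = nsmallest(2, types, key=lambda w: fd.get(w, float('inf')))
order = {w: i for i, w in enumerate(types)}
tk.sort(key=lambda w: order[w])
assert tk == ['grammar', 'kanuri']  # rarest two, back in title order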
Example #9
def assign_ids(conn, verbose=False):
    merged_entry, entrygrp = Database._merged_entry, Database._entrygrp

    allhash, = conn.execute('SELECT NOT EXISTS (SELECT 1 FROM entry '
        'WHERE hash IS NULL)').fetchone()
    assert allhash

    print('%d entries' % conn.execute('UPDATE entry SET id = NULL, srefid = refid').rowcount)

    # resolve splits: srefid = refid only for entries from the most similar hash group
    nsplit = 0
    cursor = conn.execute('SELECT refid, hash, filename, bibkey FROM entry AS e '
        'WHERE EXISTS (SELECT 1 FROM entry WHERE refid = e.refid AND hash != e.hash) '
        'ORDER BY refid, hash, filename, bibkey')
    for refid, group in group_first(cursor):
        old = merged_entry(entrygrp(conn, refid), raw=True)
        nsplit += len(group)
        cand = [(hs, merged_entry(entrygrp(conn, hs), raw=True))
            for hs in unique(hs for ri, hs, fn, bk in group)]
        new = min(cand, key=lambda p: distance(old, p[1]))[0]
        separated = conn.execute('UPDATE entry SET srefid = NULL WHERE refid = ? AND hash != ?',
            (refid, new)).rowcount
        if verbose:
            for row in group:
                print(row)
            for ri, hs, fn, bk in group:
                print('\t%r, %r, %r, %r' % hashfields(conn, fn, bk))
            print('-> %s' % new)
            print('%d: %d separated from %s\n' % (refid, separated, new))
    print('%d splitted' % nsplit)
    
    nosplits, = conn.execute('SELECT NOT EXISTS (SELECT 1 FROM entry AS e '
        'WHERE EXISTS (SELECT 1 FROM entry WHERE srefid = e.srefid AND hash != e.hash))').fetchone()
    assert nosplits

    # resolve merges: id = srefid of the most similar srefid group
    nmerge = 0
    cursor = conn.execute('SELECT hash, srefid, filename, bibkey FROM entry AS e '
        'WHERE EXISTS (SELECT 1 FROM entry WHERE hash = e.hash AND srefid != e.srefid) '
        'ORDER BY hash, srefid DESC, filename, bibkey')
    for hash, group in group_first(cursor):
        new = merged_entry(entrygrp(conn, hash), raw=True)
        nmerge += len(group)
        cand = [(ri, merged_entry(entrygrp(conn, ri), raw=True))
            for ri in unique(ri for hs, ri, fn, bk in group)]
        old = min(cand, key=lambda p: distance(new, p[1]))[0]
        merged = conn.execute('UPDATE entry SET id = ? WHERE hash = ? AND srefid != ?',
            (old, hash, old)).rowcount
        if verbose:
            for row in group:
                print(row)
            for hs, ri, fn, bk in group:
                print('\t%r, %r, %r, %r' % hashfields(conn, fn, bk))
            print('-> %s' % old)
            print('%s: %d merged into %d\n' % (hash, merged, old))
    print('%d merged' % nmerge)

    # unchanged entries
    print('%d unchanged' % conn.execute('UPDATE entry SET id = srefid '
        'WHERE id IS NULL AND srefid IS NOT NULL').rowcount)

    nomerges, = conn.execute('SELECT NOT EXISTS (SELECT 1 FROM entry AS e '
        'WHERE EXISTS (SELECT 1 FROM entry WHERE hash = e.hash AND id != e.id))').fetchone()
    assert nomerges

    # identified
    print('%d identified (new/separated)' % conn.execute('UPDATE entry '
        'SET id = (SELECT id FROM entry AS e WHERE e.hash = entry.hash AND e.id IS NOT NULL) '
        'WHERE refid IS NULL AND id IS NULL AND EXISTS '
        '(SELECT 1 FROM entry AS e WHERE e.hash = entry.hash AND e.id IS NOT NULL)').rowcount)

    # assign new ids to hash groups of separated/new entries
    nextid, = conn.execute('SELECT coalesce(max(refid), 0) + 1 FROM entry').fetchone()
    cursor = conn.execute('SELECT hash FROM entry WHERE id IS NULL GROUP BY hash ORDER BY hash')
    print('%d new ids (new/separated)' % conn.executemany('UPDATE entry SET id = ? WHERE hash = ?',
        ((id, hash) for id, (hash,) in enumerate(cursor, nextid))).rowcount)

    assert allid(conn)
    assert onetoone(conn)

    # supersede relation
    superseded, = conn.execute('SELECT count(*) FROM entry WHERE id != srefid').fetchone()
    print('%d supersede pairs' % superseded)
Example #10
def assign_ids(conn, verbose=False):
    merged_entry, entrygrp = Database._merged_entry, Database._entrygrp

    allhash, = conn.execute('SELECT NOT EXISTS (SELECT 1 FROM entry '
                            'WHERE hash IS NULL)').fetchone()
    assert allhash

    print('%d entries' %
          conn.execute('UPDATE entry SET id = NULL, srefid = refid').rowcount)

    # resolve splits: srefid = refid only for entries from the most similar hash group
    nsplit = 0
    cursor = conn.execute(
        'SELECT refid, hash, filename, bibkey FROM entry AS e '
        'WHERE EXISTS (SELECT 1 FROM entry WHERE refid = e.refid AND hash != e.hash) '
        'ORDER BY refid, hash, filename, bibkey')
    for refid, group in group_first(cursor):
        old = merged_entry(entrygrp(conn, refid), raw=True)
        nsplit += len(group)
        cand = [(hs, merged_entry(entrygrp(conn, hs), raw=True))
                for hs in unique(hs for ri, hs, fn, bk in group)]
        new = min(cand, key=lambda p: distance(old, p[1]))[0]
        separated = conn.execute(
            'UPDATE entry SET srefid = NULL WHERE refid = ? AND hash != ?',
            (refid, new)).rowcount
        if verbose:
            for row in group:
                print(row)
            for ri, hs, fn, bk in group:
                print('\t%r, %r, %r, %r' % hashfields(conn, fn, bk))
            print('-> %s' % new)
            print('%d: %d separated from %s\n' % (refid, separated, new))
    print('%d splitted' % nsplit)

    nosplits, = conn.execute(
        'SELECT NOT EXISTS (SELECT 1 FROM entry AS e '
        'WHERE EXISTS (SELECT 1 FROM entry WHERE srefid = e.srefid AND hash != e.hash))'
    ).fetchone()
    assert nosplits

    # resolve merges: id = srefid of the most similar srefid group
    nmerge = 0
    cursor = conn.execute(
        'SELECT hash, srefid, filename, bibkey FROM entry AS e '
        'WHERE EXISTS (SELECT 1 FROM entry WHERE hash = e.hash AND srefid != e.srefid) '
        'ORDER BY hash, srefid DESC, filename, bibkey')
    for hash, group in group_first(cursor):
        new = merged_entry(entrygrp(conn, hash), raw=True)
        nmerge += len(group)
        cand = [(ri, merged_entry(entrygrp(conn, ri), raw=True))
                for ri in unique(ri for hs, ri, fn, bk in group)]
        old = min(cand, key=lambda p: distance(new, p[1]))[0]
        merged = conn.execute(
            'UPDATE entry SET id = ? WHERE hash = ? AND srefid != ?',
            (old, hash, old)).rowcount
        if verbose:
            for row in group:
                print(row)
            for hs, ri, fn, bk in group:
                print('\t%r, %r, %r, %r' % hashfields(conn, fn, bk))
            print('-> %s' % old)
            print('%s: %d merged into %d\n' % (hash, merged, old))
    print('%d merged' % nmerge)

    # unchanged entries
    print('%d unchanged' %
          conn.execute('UPDATE entry SET id = srefid '
                       'WHERE id IS NULL AND srefid IS NOT NULL').rowcount)

    nomerges, = conn.execute(
        'SELECT NOT EXISTS (SELECT 1 FROM entry AS e '
        'WHERE EXISTS (SELECT 1 FROM entry WHERE hash = e.hash AND id != e.id))'
    ).fetchone()
    assert nomerges

    # identified
    print('%d identified (new/separated)' % conn.execute(
        'UPDATE entry '
        'SET id = (SELECT id FROM entry AS e WHERE e.hash = entry.hash AND e.id IS NOT NULL) '
        'WHERE refid IS NULL AND id IS NULL AND EXISTS '
        '(SELECT 1 FROM entry AS e WHERE e.hash = entry.hash AND e.id IS NOT NULL)'
    ).rowcount)

    # assign new ids to hash groups of separated/new entries
    nextid, = conn.execute(
        'SELECT coalesce(max(refid), 0) + 1 FROM entry').fetchone()
    cursor = conn.execute(
        'SELECT hash FROM entry WHERE id IS NULL GROUP BY hash ORDER BY hash')
    print('%d new ids (new/separated)' %
          conn.executemany('UPDATE entry SET id = ? WHERE hash = ?', (
              (id, hash)
              for id, (hash, ) in enumerate(cursor, nextid))).rowcount)

    assert allid(conn)
    assert onetoone(conn)

    # supersede relation
    superseded, = conn.execute(
        'SELECT count(*) FROM entry WHERE id != srefid').fetchone()
    print('%d supersede pairs' % superseded)
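
The queries in assign_ids imply a specific column layout. A hypothetical minimal entry table consistent with them (the real Glottolog schema may carry additional columns):

import sqlite3

conn = sqlite3.connect(':memory:')
conn.execute('''CREATE TABLE entry (
    filename TEXT NOT NULL,  -- source .bib file
    bibkey   TEXT NOT NULL,  -- entry key within that file
    refid    INTEGER,        -- previously assigned id (NULL for new entries)
    hash     TEXT,           -- content-based grouping key
    srefid   INTEGER,        -- refid surviving split resolution
    id       INTEGER,        -- final id assigned by assign_ids
    PRIMARY KEY (filename, bibkey))''')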
Example #11
    def test_unique(self):
        from pyglottolog.util import unique

        l = [1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 6]
        self.assertEqual(len(set(l)), len(list(unique(l))))
Example #12
def test_unique():
    assert list(util.unique([1, 2, 1, 2, 3])) == [1, 2, 3]
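
Both tests pass for any order-preserving deduplicator, so pyglottolog.util.unique presumably behaves like this minimal sketch:

def unique(iterable):
    # Yield each item the first time it is seen, preserving order.
    seen = set()
    for item in iterable:
        if item not in seen:
            seen.add(item)
            yield item

assert list(unique([1, 2, 1, 2, 3])) == [1, 2, 3]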