def _merged_entry(grp, union=UNION_FIELDS, ignore=IGNORE_FIELDS, raw=False):
    # TODO: consider implementing (a subset of?) onlyifnot logic:
    # {'address': 'publisher', 'lgfamily': 'lgcode', 'publisher': 'school',
    #  'journal': 'booktitle'}
    # NOTE: grp is walked three times below, so it must be a re-iterable
    # sequence of (field, values) pairs, not a one-shot generator
    fields = {
        field: values[0][0] if field not in union
        else ', '.join(unique(vl for vl, fn, bk in values))
        for field, values in grp if field not in ignore}
    fields['src'] = ', '.join(sorted(set(
        fn.partition('.bib')[0]
        for field, values in grp for vl, fn, bk in values)))
    fields['srctrickle'] = ', '.join(sorted(set(
        '%s#%s' % (fn.partition('.bib')[0], bk)
        for field, values in grp for vl, fn, bk in values)))
    if raw:
        return fields
    entrytype = fields.pop('ENTRYTYPE')
    return entrytype, fields
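
# Example: _merged_entry takes a sequence of (field, values) pairs, where each
# values list holds (value, filename, bibkey) triples from the grouped entries.
# A minimal sketch -- the sample data and the explicit union/ignore arguments
# are hypothetical, chosen so the example does not depend on the real
# UNION_FIELDS/IGNORE_FIELDS constants:
grp = [
    ('ENTRYTYPE', [('book', 'hh.bib', 'hh:42'), ('book', 'iso6393.bib', 'iso:7')]),
    ('title', [('A Grammar of Spam', 'hh.bib', 'hh:42'),
               ('Grammar of Spam', 'iso6393.bib', 'iso:7')]),
    ('lgcode', [('spa', 'hh.bib', 'hh:42'), ('spam1234', 'iso6393.bib', 'iso:7')]),
]

entrytype, fields = Database._merged_entry(grp, union={'lgcode'}, ignore=set())
assert entrytype == 'book'
assert fields['title'] == 'A Grammar of Spam'             # first value wins
assert fields['lgcode'] == 'spa, spam1234'                # union field: values joined
assert fields['src'] == 'hh, iso6393'                     # '.bib' stripped, sorted
assert fields['srctrickle'] == 'hh#hh:42, iso6393#iso:7'  # filename#bibkey provenance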
def show_merges(self):
    # hash groups whose members came from more than one refid: for each,
    # pick the refid whose merged fields are closest to the new entry
    with self.connect() as conn:
        cursor = conn.execute(
            'SELECT hash, refid, filename, bibkey '
            'FROM entry AS e WHERE EXISTS (SELECT 1 FROM entry '
            'WHERE hash = e.hash AND refid != e.refid) '
            'ORDER BY hash, refid DESC, filename, bibkey')
        for hash, group in group_first(cursor):
            self.print_group(conn, group)
            new = self._merged_entry(self._entrygrp(conn, hash), raw=True)
            cand = [(ri, self._merged_entry(self._entrygrp(conn, ri), raw=True))
                    for ri in unique(ri for hs, ri, fn, bk in group)]
            old = min(cand, key=lambda p: distance(new, p[1]))[0]
            print('-> %s\n' % old)
def show_splits(self):
    # refid groups whose members hashed into more than one group: for each,
    # pick the hash whose merged fields are closest to the old entry
    with self.connect() as conn:
        cursor = conn.execute(
            'SELECT refid, hash, filename, bibkey '
            'FROM entry AS e WHERE EXISTS (SELECT 1 FROM entry '
            'WHERE refid = e.refid AND hash != e.hash) '
            'ORDER BY refid, hash, filename, bibkey')
        for refid, group in group_first(cursor):
            self.print_group(conn, group)
            old = self._merged_entry(self._entrygrp(conn, refid), raw=True)
            cand = [(hs, self._merged_entry(self._entrygrp(conn, hs), raw=True))
                    for hs in unique(hs for ri, hs, fn, bk in group)]
            new = min(cand, key=lambda p: distance(old, p[1]))[0]
            print('-> %s\n' % new)
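
# show_merges()/show_splits() rank candidate groups with a distance() helper
# that is not shown in this section. A plausible sketch, assuming it compares
# two raw merged-field dicts and returns a dissimilarity in [0.0, 1.0]
# (the weights and field choice here are illustrative, not the project's
# actual implementation):
import difflib


def distance(left, right, weight={'author': 3, 'year': 3, 'title': 3}):
    """Dissimilarity of two bibtex field dicts: 0.0 identical, 1.0 unrelated."""
    if not (left or right):
        return 0.0
    keys = left.keys() & right.keys()
    if not keys:
        return 1.0
    weights = {k: weight.get(k, 1) for k in keys}
    ratios = (w * difflib.SequenceMatcher(None, left[k], right[k]).ratio()
              for k, w in weights.items())
    return 1 - sum(ratios) / sum(weights.values())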
def keyid(fields, fd, ti=2, infinity=float('inf')):
    if 'author' not in fields:
        if 'editor' not in fields:
            values = ''.join(
                v for f, v in bibord_iteritems(fields)
                if f != 'glottolog_ref_id')
            return '__missingcontrib__' + reokkey.sub('_', values.lower())
        else:
            astring = fields['editor']
    else:
        astring = fields['author']

    authors = pauthor(astring)
    if len(authors) != len(astring.split(' and ')):
        print("Unparsed author in", authors)
        print("   ", astring, astring.split(' and '))
        print(fields.get('title'))

    ak = [undiacritic(x) for x in sorted(
        lastnamekey(a['lastname']) for a in authors)]
    yk = pyear(fields.get('year', '[nd]'))[:4]
    tks = wrds(fields.get("title", "no.title"))  # takeuntil :
    # select the (leftmost) two least frequent words from the title
    types = list(unique(w for w in tks if rewrdtok.match(w)))
    tk = nsmallest(ti, types, key=lambda w: fd.get(w, infinity))
    # put them back into the title order (i.e. 'spam eggs' != 'eggs spam')
    order = {w: i for i, w in enumerate(types)}
    tk.sort(key=lambda w: order[w])
    if 'volume' in fields and all(
            f not in fields for f in ['journal', 'booktitle', 'series']):
        vk = roman(fields['volume'])
    else:
        vk = ''

    if 'extra_hash' in fields:
        yk = yk + fields['extra_hash']

    key = '-'.join(ak) + "_" + '-'.join(tk) + vk + yk
    return reokkey.sub("", key.lower())
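
# The title-key step above is easy to misread: nsmallest() returns the ti
# rarest words ordered by frequency, so they must be re-sorted back into
# title order. Isolated here with a made-up frequency dict fd:
from heapq import nsmallest

fd = {'a': 9000, 'grammar': 1500, 'of': 8000, 'spam': 40, 'and': 7000, 'eggs': 7}
types = ['a', 'grammar', 'of', 'spam', 'and', 'eggs']  # deduplicated title words

tk = nsmallest(2, types, key=lambda w: fd.get(w, float('inf')))
assert tk == ['eggs', 'spam']  # rarest first -- title order lost

order = {w: i for i, w in enumerate(types)}
tk.sort(key=lambda w: order[w])
assert tk == ['spam', 'eggs']  # restored, so 'spam eggs' != 'eggs spam'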
def assign_ids(conn, verbose=False):
    merged_entry, entrygrp = Database._merged_entry, Database._entrygrp

    allhash, = conn.execute('SELECT NOT EXISTS (SELECT 1 FROM entry '
                            'WHERE hash IS NULL)').fetchone()
    assert allhash

    print('%d entries' % conn.execute(
        'UPDATE entry SET id = NULL, srefid = refid').rowcount)

    # resolve splits: srefid = refid only for entries from the most similar hash group
    nsplit = 0
    cursor = conn.execute(
        'SELECT refid, hash, filename, bibkey FROM entry AS e '
        'WHERE EXISTS (SELECT 1 FROM entry WHERE refid = e.refid AND hash != e.hash) '
        'ORDER BY refid, hash, filename, bibkey')
    for refid, group in group_first(cursor):
        old = merged_entry(entrygrp(conn, refid), raw=True)
        nsplit += len(group)
        cand = [(hs, merged_entry(entrygrp(conn, hs), raw=True))
                for hs in unique(hs for ri, hs, fn, bk in group)]
        new = min(cand, key=lambda p: distance(old, p[1]))[0]
        separated = conn.execute(
            'UPDATE entry SET srefid = NULL WHERE refid = ? AND hash != ?',
            (refid, new)).rowcount
        if verbose:
            for row in group:
                print(row)
            for ri, hs, fn, bk in group:
                print('\t%r, %r, %r, %r' % hashfields(conn, fn, bk))
            print('-> %s' % new)
            print('%d: %d separated from %s\n' % (refid, separated, new))
    print('%d split' % nsplit)

    nosplits, = conn.execute(
        'SELECT NOT EXISTS (SELECT 1 FROM entry AS e '
        'WHERE EXISTS (SELECT 1 FROM entry '
        'WHERE srefid = e.srefid AND hash != e.hash))').fetchone()
    assert nosplits

    # resolve merges: id = srefid of the most similar srefid group
    nmerge = 0
    cursor = conn.execute(
        'SELECT hash, srefid, filename, bibkey FROM entry AS e '
        'WHERE EXISTS (SELECT 1 FROM entry WHERE hash = e.hash AND srefid != e.srefid) '
        'ORDER BY hash, srefid DESC, filename, bibkey')
    for hash, group in group_first(cursor):
        new = merged_entry(entrygrp(conn, hash), raw=True)
        nmerge += len(group)
        cand = [(ri, merged_entry(entrygrp(conn, ri), raw=True))
                for ri in unique(ri for hs, ri, fn, bk in group)]
        old = min(cand, key=lambda p: distance(new, p[1]))[0]
        merged = conn.execute(
            'UPDATE entry SET id = ? WHERE hash = ? AND srefid != ?',
            (old, hash, old)).rowcount
        if verbose:
            for row in group:
                print(row)
            for hs, ri, fn, bk in group:
                print('\t%r, %r, %r, %r' % hashfields(conn, fn, bk))
            print('-> %s' % old)
            print('%s: %d merged into %d\n' % (hash, merged, old))
    print('%d merged' % nmerge)

    # unchanged entries
    print('%d unchanged' % conn.execute(
        'UPDATE entry SET id = srefid '
        'WHERE id IS NULL AND srefid IS NOT NULL').rowcount)

    nomerges, = conn.execute(
        'SELECT NOT EXISTS (SELECT 1 FROM entry AS e '
        'WHERE EXISTS (SELECT 1 FROM entry '
        'WHERE hash = e.hash AND id != e.id))').fetchone()
    assert nomerges

    # identified
    print('%d identified (new/separated)' % conn.execute(
        'UPDATE entry '
        'SET id = (SELECT id FROM entry AS e WHERE e.hash = entry.hash AND e.id IS NOT NULL) '
        'WHERE refid IS NULL AND id IS NULL AND EXISTS '
        '(SELECT 1 FROM entry AS e WHERE e.hash = entry.hash AND e.id IS NOT NULL)').rowcount)

    # assign new ids to hash groups of separated/new entries
    nextid, = conn.execute(
        'SELECT coalesce(max(refid), 0) + 1 FROM entry').fetchone()
    cursor = conn.execute(
        'SELECT hash FROM entry WHERE id IS NULL GROUP BY hash ORDER BY hash')
    print('%d new ids (new/separated)' % conn.executemany(
        'UPDATE entry SET id = ? WHERE hash = ?',
        ((id, hash) for id, (hash,) in enumerate(cursor, nextid))).rowcount)

    assert allid(conn)
    assert onetoone(conn)

    # supersede relation
    superseded, = conn.execute(
        'SELECT count(*) FROM entry WHERE id != srefid').fetchone()
    print('%d supersede pairs' % superseded)
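
# assign_ids() ends by asserting allid(conn) and onetoone(conn), which are
# defined elsewhere. A sketch of the contracts they plausibly check, written
# against the same schema (an assumption, not the project's actual helpers):
def allid(conn):
    """True if every entry received an id."""
    value, = conn.execute('SELECT NOT EXISTS '
                          '(SELECT 1 FROM entry WHERE id IS NULL)').fetchone()
    return value


def onetoone(conn):
    """True if hash and id determine each other one-to-one."""
    value, = conn.execute(
        'SELECT NOT EXISTS (SELECT 1 FROM entry AS e WHERE '
        'EXISTS (SELECT 1 FROM entry WHERE hash = e.hash AND id != e.id) '
        'OR EXISTS (SELECT 1 FROM entry WHERE id = e.id AND hash != e.hash))'
    ).fetchone()
    return value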
def test_unique():
    assert list(util.unique([1, 2, 1, 2, 3])) == [1, 2, 3]
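
# The test pins down util.unique as an order-preserving deduplicator. A
# minimal generator with that behavior (a sketch assuming hashable items,
# not necessarily the actual util implementation):
def unique(iterable):
    seen = set()
    for item in iterable:
        if item not in seen:  # keep only the first occurrence
            seen.add(item)
            yield item


assert list(unique([1, 2, 1, 2, 3])) == [1, 2, 3]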