def report(fn, r):
    statuses = (C.blue('PASS'), C.red('FAIL'), C.yellow('UNEX'))
    special = ('', '- no crossref found!', '- illegal crossref')
    # non-verbose mode by default
    if verbose or r != 0:
        print('[ {} ] {} {}'.format(statuses[r], fn, special[r]))
    return r
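# The report()/checkon()/checkreport() helpers in these scripts share one
# return-code convention, visible in their statuses tuples: 0 means PASS
# (nothing to change), 1 means FAIL, and 2 means the third label (FIXD, MADE,
# UNEX, ...), i.e. something was changed or is unexpected. Main loops tally
# the codes in a dict such as cx = {0: 0, 1: 0, 2: 0}.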
def checkreport(m, o):
    statuses = (C.blue('PASS'), C.red('FAIL'), C.yellow('FIXD'))
    r = checkon(m, o)
    # non-verbose mode by default
    if verbose or r != 0:
        print('[ {} ] {}'.format(statuses[r], o.filename))
    return r
def checkon(fn, o):
    if not os.path.exists(fn) or os.path.isdir(fn):
        fn = fn + '.json'
    if not os.path.exists(fn):
        # if it still does not exist, let us create a minimal one
        f = open(fn, 'w', encoding='utf-8')
        f.write('{{\n\t"title": "{name}",\n\t"type": "proceedings",\n\t"year": {year}\n}}'.format(
            name=lastSlash(fn)[:-5].replace('-', ' '),
            year=findYear(lastSlash(fn))))
        f.close()
        print('[ {} ] {}'.format(C.yellow('MADE'), fn))
        return 2
    f = open(fn, 'r', encoding='utf-8')
    lines = f.readlines()[1:-1]
    f.close()
    for line in lines:
        if line.find('"year"') > -1 and findYear(line) > 3000:
            os.remove(fn)
            print('[ {} ] {}'.format(C.red('KILL'), fn))
            return 1
    flines = sorted([strictstrip(s) for s in lines])
    plines = sorted([strictstrip(s) for s in o.getJSON().split('\n')[1:-1]])
    if flines != plines:
        f1 = [line for line in flines if line not in plines]
        f2 = [line for line in plines if line not in flines]
        print('∆:', f1, '\nvs', f2)
    if flines == plines:
        return 0
    else:
        return 1
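# A worked example of the fallback stub above, for a hypothetical filename:
# with fn = 'corpus/SLE-2015.json', checkon() would write
#   {
#       "title": "SLE 2015",
#       "type": "proceedings",
#       "year": 2015
#   }
# assuming findYear() extracts the four-digit year from the name.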
def checkreport(fn, o):
    statuses = (C.blue('PASS'), C.red('FAIL'), C.yellow('FIXD'))
    r = checkon(fn, o)
    # non-verbose mode by default
    if verbose or r != 0:
        print('[ {} ] {}'.format(statuses[r], fn))
    return r
def checkreport(fn, o):
    statuses = (C.blue('PASS'), C.red('FAIL'), C.yellow('WARN'))
    r, msg = checkon(fn, o)
    # non-verbose mode by default
    if verbose or r != 0:
        print('[ {} ] {}: {}'.format(statuses[r], fn, msg))
    return r
def checkreport(fn, o):
    statuses = (C.blue('PASS'), C.red('FAIL'), C.yellow('FIXD'))
    if isinstance(o, int):
        r = o
    else:
        r = checkon(fn, o)
    # non-verbose mode by default
    if verbose or r != 0:
        print('[ {} ] {}'.format(statuses[r], fn))
    return r
def checkreport(fn, o, br):
    statuses = (C.blue('PASS'), C.red('FAIL'), C.yellow('FIXD'))
    if br:
        r = checkbrand(fn, br)
    else:
        r = checkon(fn, o)
    # non-verbose mode by default
    if verbose or r != 0:
        print('[ {} ] {}'.format(statuses[r], fn))
    return r
def checkon(m, o):
    # if no common model found, we failed
    if not m:
        return 1
    if 'type' in m.keys() and m['type'] in ('inproceedings', 'article'):
        m['type'] = 'proceedings'
    if 'type' in m.keys() and m['type'] == 'incollection':
        m['type'] = 'book'
    if 'crossref' in m.keys():
        del m['crossref']
    if 'booktitle' in m.keys():
        m['title'] = m['booktitle']
        del m['booktitle']
    if 'booktitleshort' in m.keys():
        # TODO: ???
        del m['booktitleshort']
    r = 0
    n = {}
    for k in m.keys():
        if o.get(k) == m[k]:
            if verbose:
                print(C.blue('Confirmed: '), k, 'as', m[k])
        else:
            if verbose:
                print(C.red('Conflicted: '), k, 'as', m[k], 'vs', o.get(k))
            v = heurichoose(k, m[k], o.json[k]) if k in o.json.keys() else m[k]
            if verbose:
                print(C.yellow('Settled for:'), v)
            n[k] = v
            r = 2
    if r == 0:
        return r
    if r == 2 and not n:
        # nothing to fix?!
        return 0
    if not os.path.exists(o.filename):
        return 0
    if os.path.isdir(o.filename):
        fn = o.filename + '.json'
    else:
        fn = o.filename
    if os.path.exists(fn):
        f = open(fn, 'r', encoding='utf-8')
        lines = f.read()
        f.close()
        if lines != o.getJSON():
            # strange, should be equal (run all normalisers first!)
            return 1
    for k in n.keys():
        o.json[k] = n[k]
    f = open(fn, 'w', encoding='utf-8')
    f.write(o.getJSON())
    f.close()
    return 2
    title = tagdef['namefull'] if 'namefull' in tagdef.keys() else tagdef['name']
    subt = ('<br/><em>' + tagdef['namelong'] + '</em>') if 'namelong' in tagdef.keys() else ''
    links = '<strong>{}</strong>{}<hr/>'.format(title, subt) + '\n'.join(sorted(links))
    dl = '<dl class="toc">' + '\n'.join(lst) + '</dl>'
    # hack to get from tags to papers
    dl = dl.replace('href="', 'href="../')
    f.write(tagHTML.format(
        title=key + ' tag',
        etag=escape(key),
        tag=key,
        above='',
        boxlinks=links,
        listname='{} papers'.format(len(lst)),
        dl=dl))
    f.close()
print('Tag pages:', C.yellow('{}'.format(len(ts))), C.blue('generated'))
# tag index
f = open(outputdir + '/tag/index.html', 'w', encoding='utf-8')
keyz = [q for q in ts.keys() if len(ts[q]) > 2]
keyz.sort(key=lambda t: len(ts[t]), reverse=True)
lst = ['<li>#<a href="{}.html">{}</a> ({})</li>'.format(escape(t), t, len(ts[t])) for t in keyz]
ul = '<ul class="tri mul">' + '\n'.join(lst) + '</ul>'
CX = sum([len(ts[t]) for t in ts.keys()])
f.write(taglistHTML.format(
    title='All known tags',
    listname='{} tags known from {} markings'.format(len(ts), CX),
    ul=ul))
f.close()
print('Tag index:', C.blue('created'))
# untagged papers
f = open(outputdir + '/tag/untagged.html', 'w', encoding='utf-8')
def checkon(fn, o):
    if not os.path.exists(fn) or os.path.isdir(fn):
        fn = fn + '.json'
    f = open(fn, 'r', encoding='utf-8')
    lines = f.readlines()[1:-1]
    f.close()
    flines = json2lines(lines)
    plines = sorted(json2lines(o.getJSON().split('\n')))
    # "url" values that come from DBLP are useless
    if 'url' in o.json.keys():
        o.json['url'] = [link.replace('https://', 'http://')
                         for link in listify(o.json['url'])
                         if not link.startswith('db/conf/')
                         and not link.startswith('db/series/')
                         and not link.startswith('db/books/')
                         and not link.startswith('db/journals/')]
        if not o.json['url']:
            del o.json['url']
        elif len(o.json['url']) == 1:
            o.json['url'] = o.json['url'][0]
    if 'ee' in o.json.keys() and 'doi' not in o.json.keys():
        if isinstance(o.json['ee'], list):
            if verbose:
                print(C.red('Manylink:'), o.json['ee'])
        newee = []
        for onelink in listify(o.json['ee']):
            if onelink.startswith('http://dx.doi.org/'):
                o.json['doi'] = onelink[18:]
            elif onelink.startswith('http://doi.acm.org/'):
                o.json['doi'] = onelink[19:]
            elif onelink.startswith('http://doi.ieeecomputersociety.org/'):
                o.json['doi'] = onelink[35:]
            elif onelink.startswith('http://dl.acm.org/citation.cfm?id='):
                o.json['acmid'] = onelink[34:]
            elif onelink.startswith('http://portal.acm.org/citation.cfm?id='):
                o.json['acmid'] = onelink[38:]
            elif onelink.startswith('http://ieeexplore.ieee.org/xpl/freeabs_all.jsp?arnumber=') \
                    or onelink.startswith('http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber='):
                o.json['ieeearid'] = onelink.split('=')[-1]
            elif onelink.startswith('http://ieeexplore.ieee.org/xpls/abs_all.jsp?isnumber=') \
                    and onelink.find('arnumber') > -1:
                o.json['ieeearid'] = onelink.split('arnumber=')[-1].split('&')[0]
            elif onelink.startswith('http://ieeexplore.ieee.org/xpl/mostRecentIssue.jsp?punumber='):
                o.json['ieeepuid'] = onelink.split('=')[-1]
            elif onelink.startswith('http://ieeexplore.ieee.org/xpl/tocresult.jsp?isnumber='):
                o.json['ieeeisid'] = onelink.split('=')[-1]
            elif onelink.startswith('http://eceasst.cs.tu-berlin.de/index.php/eceasst/article/view/'):
                newee.append('http://journal.ub.tu-berlin.de/eceasst/article/view/'
                             + onelink.split('/')[-1])
            elif onelink.endswith('.pdf') and \
                    (onelink.startswith('http://computer.org/proceedings/')
                     or onelink.startswith('http://csdl.computer.org/')):
                # Bad:  http://computer.org/proceedings/icsm/1189/11890007.pdf
                # Bad:  http://csdl.computer.org/comp/proceedings/date/2003/1870/02/187020040.pdf
                # Good: http://www.computer.org/csdl/proceedings/icsm/2001/1189/00/11890004.pdf
                if onelink.startswith('http://csdl'):
                    cname, _, cid, mid, pid = onelink.split('/')[5:10]
                else:
                    cname, cid, pid = onelink.split('/')[4:7]
                    # heuristic
                    if pid.startswith(cid):
                        mid = pid[len(cid):len(cid) + 2]
                    else:
                        mid = '00'
                newee.append('http://www.computer.org/csdl/proceedings/{}/{}/{}/{}/{}'.format(
                    cname, o.get('year'), cid, mid, pid))
            else:
                if onelink.find('ieee') > -1:
                    print(C.purple('IEEE'), onelink)
                if verbose:
                    print(C.yellow('Missed opportunity:'), onelink)
                # nothing matches => preserve
                newee.append(onelink)
        if len(newee) == 0:
            del o.json['ee']
        elif len(newee) == 1:
            o.json['ee'] = newee[0]
        else:
            o.json['ee'] = newee
    # post-processing normalisation
    if 'acmid' in o.json.keys() and not isinstance(o.json['acmid'], int) and o.json['acmid'].isdigit():
        o.json['acmid'] = int(o.json['acmid'])
    if 'eventuri' in o.json.keys():
        o.json['eventurl'] = o.json['eventuri']
        del o.json['eventuri']
    if 'eventurl' in o.json.keys() and o.json['eventurl'].startswith('https://'):
        o.json['eventurl'] = o.json['eventurl'].replace('https://', 'http://')
    nlines = sorted(json2lines(o.getJSON().split('\n')))
    if flines != plines:
        return 1
    elif plines != nlines:
        f = open(fn, 'w', encoding='utf-8')
        f.write(o.getJSON())
        f.close()
        return 2
    else:
        return 0
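# Illustrative inputs for the "ee" classifier above (values are made up):
#   'http://dx.doi.org/10.1145/1234567'         => o.json['doi']      = '10.1145/1234567'
#   'http://dl.acm.org/citation.cfm?id=7654321' => o.json['acmid']    = '7654321'
#   'http://ieeexplore.ieee.org/xpl/tocresult.jsp?isnumber=4242'
#                                               => o.json['ieeeisid'] = '4242'
# Anything unrecognised is preserved in o.json['ee'] verbatim.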
def report(s, r):
    statuses = (C.blue('PASS'), C.red('FAIL'), C.yellow('FIXD'))
    # non-verbose mode by default
    if verbose or r != 0:
        print('[ {} ] {}'.format(statuses[r], s))
    return r


if __name__ == "__main__":
    verbose = sys.argv[-1] == '-v'
    # All known contributors
    cx = {0: 0, 1: 0, 2: 0}
    people = {}
    for fn in glob.glob(ienputdir + '/people/*.json'):
        p = parseJSON(fn)
        if p['name'] in people.keys():
            cx[report(C.red('duplicate') + ' ' + C.yellow(p), 1)] += 1
            continue
        people[p['name']] = p
    print('{}: {} venues, {} papers written by {} people\n{}'.format(
        C.purple('BibSLEIGH'),
        C.red(len(sleigh.venues)),
        C.red(sleigh.numOfPapers()),
        C.red(len(people)),
        C.purple('='*42)))
    # traverse ALL the papers!
    for v in sleigh.venues:
        for c in v.getConfs():
            for p in c.papers:
                if 'author' in p.json.keys():
                    for a in listify(p.json['author']):
                        if a in people.keys():
def report(one, two):
    print('[ {} ] {}'.format(one, two))


def checkreport(fn, o):
    statuses = (C.blue('PASS'), C.red('FAIL'), C.yellow('FIXD'))
    r = checkon(fn, o)
    # non-verbose mode by default
    if verbose or r != 0:
        report(statuses[r], fn)
    return r


if __name__ == "__main__":
    if len(sys.argv) > 1:
        verbose = sys.argv[1] == '-v'
    print('{}: {} venues, {} papers\n{}'.format(
        C.purple('BibSLEIGH'),
        C.red(len(sleigh.venues)),
        C.red(sleigh.numOfPapers()),
        C.purple('='*42)))
    cx = {0: 0, 1: 0, 2: 0}
    for v in sleigh.venues:
        for c in v.getConfs():
            cx[checkreport(c.filename, c)] += 1
            for p in c.papers:
                cx[checkreport(p.filename, p)] += 1
    print('{} files checked, {} ok, {} fixed, {} failed'.format(
        C.bold(cx[0] + cx[1] + cx[2]),
        C.blue(cx[0]),
        C.yellow(cx[2]),
        C.red(cx[1])))
    paperAuths = paperAuths[:-1]
    paperAuths.extend(auths)
paperLnk = li.get('id')
hope = li.find_all('a')
if hope and hope[0].get('href').endswith('.pdf'):
    paperPdf = urlstart + hope[0].get('href')
else:
    paperPdf = ''
paperEntry = {'type': 'inproceedings', 'series': 'CEUR Workshop Proceedings',
              'publisher': 'CEUR-WS.org', 'year': volYear, 'booktitle': volTitles[-1],
              'editor': volEds, 'volume': volNr.split('-')[-1], 'title': paperTitle,
              'author': paperAuths, 'pages': paperPages, 'venue': volVenue}
if paperPdf:
    paperEntry['openpdf'] = paperPdf
if paperLnk:
    paperEntry['url'] = urlstart + '#' + paperLnk
paperFilename = lastSlash(outputdir) + '-' + paperAuths[0].split(' ')[-1]
for a in paperAuths[1:]:
    paperFilename += a.split(' ')[-1][0]
if paperFilename in done:
    paperFilename += 'a'
    while paperFilename in done:
        paperFilename = paperFilename[:-1] + chr(ord(paperFilename[-1]) + 1)
# print(jsonify(paperEntry), '-->', outputdir+'/'+paperFilename+'.json')
f = open(outputdir + '/' + paperFilename + '.json', 'w', encoding='utf-8')
f.write(jsonify(paperEntry))
f.close()
cx += 1
done.append(paperFilename)
print(C.red(volVenue), '-', C.yellow(volTitles[-1]), '-', C.blue(cx), 'papers.')
    # allstems += x.getBareStems()
    # siblings = {stem: allstems.count(stem) for stem in allstems if stem != k and ifApproved(stem)}
    # NB: the following code is faster:
    siblings = Counter()
    for x in stems[k]:
        siblings.update([s for s in x.getBareStems() if s != k and ifApproved(s)])
    box = '<code>Used together with:</code><hr/>' + \
        '\n<br/>'.join(['<span class="tag"><a href="{0}.html">{0}</a></span> ({1})'.format(*sn)
                        for sn in siblings.most_common(5)])
    f.write(wordHTML.format(
        stem=k,
        inthebox=box,
        listname='{} papers'.format(len(lst)),
        dl='<dl class="toc">' + '\n'.join(lst).replace('href="', 'href="../') + '</dl>'))
    f.close()
print('Word pages:', C.yellow('{}'.format(len(stems))), C.blue('generated'))
# stem index
f = open(outputdir + '/words.html', 'w', encoding='utf-8')
keyz = [k for k in stems.keys() if len(stems[k]) > 100 and ifApproved(k)]
keyz.sort(key=lambda t: -len(t), reverse=True)
lst = ['<li><a href="word/{}.html">{}</a> ({})</li>'.format(
    escape(t), t, len(stems[t])) for t in keyz]
ul = '<ul class="tri">' + '\n'.join(lst) + '</ul>'
CX = sum([len(stems[t]) for t in stems.keys()])
f.write(wordlistHTML.format(
    title='All known stems',
    listname='{} stems known and {} shown from {} notable words'.format(len(stems), len(keyz), CX),
    ul=ul))
f.close()
print('Stem index:', C.blue('created'))
print('{}\nDone with {} venues, {} papers, {} tags.'.format(
    C.purple('='*42)))
bundles = {}
for b in glob.glob(ienputdir + '/bundles/*.json'):
    purename = b.split('/')[-1][:-5]
    bun = json.load(open(b, 'r'))
    prevcx = pcx
    # processSortedRel() is evaluated first and advances the global pcx counter,
    # so pcx - prevcx is the number of papers in this bundle
    uberlist = '<h2>{1} papers</h2>{0}'.format(processSortedRel(bun['contents']), pcx - prevcx)
    f = open(outputdir + '/bundle/' + purename + '.html', 'w')
    f.write(bunHTML.format(
        title=purename + ' bundle',
        bundle=bun['name'],
        ebundle=escape(purename),
        dl=uberlist.replace('href="', 'href="../').replace('../mailto', 'mailto')))
    f.close()
    bundles[purename] = pcx - prevcx
print('Bundle pages:', C.yellow('{}'.format(len(bundles))), C.blue('generated'))
# now for the index
f = open(outputdir + '/bundle/index.html', 'w')
lst = ['<li><a href="{}.html">{}</a> ({})</li>'.format(
    escape(b), b, bundles[b]) for b in sorted(bundles.keys())]
ul = '<ul class="tri">' + '\n'.join(lst) + '</ul>'
f.write(bunListHTML.format(
    title='All specified bundles',
    listname='{} bundles known with {} papers'.format(len(bundles), sum(bundles.values())),
    ul=ul))
f.close()
print('Bundle index:', C.blue('created'))
print('{}\nDone with {} venues, {} papers.'.format(
    C.purple('='*42),
        + ' \n'.join(['<span class="tag"><a href="../word/{0}.html">{0}</a></span> ({1})'.format(S, stems[S])
                      for S in stemkeys[:10]])
    boxlinks += adds
    # combine boxlinks
    if boxlinks:
        boxlinks = '<div class="tbox">' + boxlinks + '</div>'
    f.write(personHTML.format(
        title=k,
        gender=gender,
        boxlinks=boxlinks,
        eperson=escape(k),
        person=persondef['name'],
        namedlists=dls))
    f.close()
print('Person pages:', C.yellow('{}'.format(len(ps))), C.blue('generated'))
# person index
# keyz = [k for k in ps.keys() if len(ts[k]) > 2]
# keyz = sorted(keyz, key=lambda t: len(ts[t]), reverse=True)
keyz = ps  # sorted(ps.keys())
letters = [chr(x) for x in range(ord('a'), ord('z') + 1)]
indices = {x: [] for x in letters}
for t in keyz:
    ws = t.split('_')
    i = -1
    if ws[i] == 'Jr':
        i -= 1
    letter = ws[i][0].lower()
    if not letter.isalpha():
        print(C.red('ERROR') + ':', 'wrong name', t)
        letter = ws[i - 1][0].lower()
    C.red(sleigh.numOfPapers()),
    C.purple('='*42)))
# read the CSV
f = open('scrap-committees/scraped-by-grammarware.csv', 'r', encoding='utf-8')
# CBSE;2001;Heinz;Schmidt;;Organising Committee
for line in f.readlines():
    vs = line.strip().split(';')
    if len(vs) == 0:
        continue
    v = vs[0] + '-' + vs[1]
    n = vs[2] + ' ' + vs[3]
    # normalise!
    if n in renameto.keys():
        print('[', C.yellow('ALIA'), ']', 'Treating', n, 'as', renameto[n])
        n = renameto[n]
    # sex is ignored, mostly absent anyway
    r = vs[5]
    if v not in roles.keys():
        roles[v] = []
    # NB: the next line uses lists for the sake of JSON compatibility
    roles[v].append([n, r])
f.close()
print('Metadata on {} editions loaded with {} role assignments'.format(
    C.red(len(roles)), C.red(sum([len(roles[k]) for k in roles.keys()]))))
# now add
cx = {0: 0, 1: 0, 2: 0}
for v in sleigh.venues:
    for c in v.getConfs():
        cx[checkreport(c.filename, c)] += 1
        for p in c.papers:
def report(fn1, fn2, r):
    statuses = (C.blue(' PASS '), C.red(' FAIL '), C.yellow('RENAME'))
    # non-verbose mode by default
    if verbose or r != 0:
        print('[ {} ] {} → {}'.format(statuses[r], fn1, fn2))
    return r
        report('{}: “{}” == “{}”?'.format(surname, variants[0], variants[1]), 2)
        # print
        pvariants = ['“{}”'.format(v) for v in variants]
        report('{}: {}'.format(surname, ' vs '.join(pvariants)), 0)
# write back if changed
for k in people.keys():
    p = people[k]
    if p['FILE']:
        if os.path.exists(p['FILE']):
            cur = parseJSON(p['FILE'])
            if cur == p:
                cx[0] += 1
                if verbose:
                    print('[', C.green('FIXD'), ']', p['name'])
                continue
            print('[', C.yellow('FIXD'), ']', p['name'])
            cx[2] += 1
            f = open(p['FILE'], 'w', encoding='utf-8')
            del p['FILE']
            f.write(jsonify(p))
            f.close()
        else:
            print('How can that be?')
print('{} people checked, {} ok, {} fixed, {} failed'.format(
    C.bold(cx[0] + cx[1] + cx[2]),
    C.blue(cx[0]),
    C.yellow(cx[2]),
    C.red(cx[1])))
def report(s, r):
    statuses = (C.blue('PASS'), C.red('FAIL'), C.yellow('FIXD'))
    # non-verbose mode by default
    if verbose or r != 0:
        print('[ {} ] {}'.format(statuses[r], simpleLatin(s)))
    return r
def checkon(fn, o):
    if 'dblpkey' not in o.json.keys():
        print('[ {} ] {}'.format(C.red('DONT'), 'DBLP key not found on the entry'))
        return 1
    mykey = o.get('dblpkey')
    # for the rare case of multiple dblpkeys (can happen as a DBLP error
    # or when the same proceedings span multiple volumes)
    if isinstance(mykey, list):
        mykey = mykey[0]
    if mykey not in procs.keys():
        print('[ {} ] {}'.format(C.red('DONT'), 'DBLP key not found in the dump'))
        return 1
    title = procs[mykey]
    if title.endswith('.'):
        title = title[:-1]
    ws = title.replace(' - ', ', ').replace(' (', ', ').split(', ')
    country = findOneIn(knownCountries, ws)
    state = findOneIn(usaStateNames, ws)
    found = False
    if country:
        town = ws[ws.index(country) - 1]
        state = '?'
        # what if "town" is a USA state? (full)
        if country == 'USA' and town in usaStateNames:
            state = town
            town = ws[ws.index(town) - 1]
        # what if "town" is a USA state? (abbreviated)
        if country == 'USA' and town in usaStateAB:
            state = usaStateNames[usaStateAB.index(town)]
            town = ws[ws.index(town) - 1]
        # what if "town" is a Canadian state? (full)
        if country == 'Canada' and town in canStateNames:
            state = town
            town = ws[ws.index(town) - 1]
        # what if "town" is a Canadian state? (abbreviated)
        if country == 'Canada' and town in canStateAB:
            state = canStateNames[canStateAB.index(town)]
            town = ws[ws.index(town) - 1]
        # the same can happen in the UK
        if country in ('UK', 'United Kingdom') and town in ('Scotland', 'Scottland'):
            state = town
            town = ws[ws.index(town) - 1]
        # Georgia the country vs Georgia the state
        if country == 'Georgia' and town == 'Atlanta':
            state = country
            country = 'USA'
        # near Something
        if town.startswith('near '):
            town = ws[ws.index(town) - 1]
        # Luxembourg, Luxembourg
        if country == 'Luxembourg':
            town = 'Luxembourg'
        # Saint-Malo / St. Malo
        if country == 'France' and town == 'St. Malo':
            town = 'Saint-Malo'
        # Florence / Firenze
        if country == 'Italy' and town.find('Firenze') > -1:
            town = 'Florence'
        found = True
    elif state:
        country = 'USA'
        town = ws[ws.index(state) - 1]
        found = True
    else:
        # desperate times
        for sol in desperateSolutions.keys():
            if sol in ws:
                town, state, country = desperateSolutions[sol]
                found = True
    # normalise
    if country in countryMap.keys():
        country = countryMap[country]
    if country == 'United Kingdom' and state == '?':
        if town.endswith('London') or town in ('Birmingham', 'York',
                'Coventry', 'Nottingham', 'Lancaster', 'Oxford', 'Manchester',
                'Southampton', 'Norwich', 'Leicester', 'Canterbury'):
            state = 'England'
        elif town in ('Edinburgh', 'Glasgow'):
            state = 'Scotland'
    # report
    if 'address' in o.json.keys():
        print('[ {} ] {}'.format(C.blue('OLDA'), o.get('address')))
    if 'location' in o.json.keys():
        print('[ {} ] {}'.format(C.blue('OLDL'), o.get('location')))
    if found:
        # print('[ {} ] {}'.format(C.blue('KNOW'), country))
        print('[ {} ] {}'.format(C.blue('AD||'), title))
        print('[ {} ] {:30} || {:30} || {:20}'.format(C.blue('AD->'),
            C.yellow(town), C.yellow(state), C.yellow(country)))
        # TODO: perhaps later we can act more aggressively
        newaddr = [town, '' if state == '?' else state, country]
        if 'address' not in o.json.keys() or newaddr != o.json['address']:
            o.json['address'] = newaddr
            f = open(o.json['FILE'], 'w', encoding='utf-8')
            f.write(o.getJSON())
            f.close()
            return 2
        # nothing changed
        return 0
    print('[ {} ] {}'.format(C.yellow('AD??'), title))
    return 1
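# A worked example of the location heuristic above (the title is made up):
#   title = 'Proceedings of the Example Conference, Portland, Oregon, USA'
#   ws    = ['Proceedings of the Example Conference', 'Portland', 'Oregon', 'USA']
# findOneIn(knownCountries, ws) yields 'USA'; the word before it, 'Oregon',
# is a full US state name, so state = 'Oregon', town = 'Portland', and the
# entry gets o.json['address'] = ['Portland', 'Oregon', 'USA'].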
def main():
    print('{}: {} venues, {} papers\n{}'.format(
        C.purple('BibSLEIGH'),
        C.red(len(sleigh.venues)),
        C.red(sleigh.numOfPapers()),
        C.purple('=' * 42)))
    # generate the index
    f = open(outputdir + '/index.html', 'w', encoding='utf-8')
    f.write(sleigh.getPage())
    f.close()
    # generate all individual pages
    for v in sleigh.venues:
        r = C.blue(v.getKey())
        f = open(outputdir + '/' + v.getKey() + '.html', 'w', encoding='utf-8')
        f.write(v.getPage())
        f.close()
        if v.brands:
            r += '{' + '+'.join([C.blue(b.getKey()) for b in v.brands]) + '}'
            for b in v.brands:
                f = open(outputdir + '/' + b.getKey() + '.brand.html', 'w', encoding='utf-8')
                f.write(b.getPage())
                f.close()
        r += ' => '
        for c in v.getConfs():
            f = open(outputdir + '/' + c.getKey() + '.html', 'w', encoding='utf-8')
            f.write(c.getPage())
            f.close()
            for p in c.papers:
                f = open(outputdir + '/' + p.getKey() + '.html', 'w', encoding='utf-8')
                f.write(p.getPage())
                f.close()
            purekey = c.getKey().replace(v.getKey(), '').replace('-', ' ').strip()
            r += '{} [{}], '.format(purekey, C.yellow(len(c.papers)))
        print(r)
    # generate the icon lineup
    icons = []
    linked = []
    pngs = [lastSlash(png).split('.')[0] for png in glob.glob(outputdir + '/stuff/*.png')]
    pngs = [png for png in pngs
            if not (png.startswith('a-')
                    or png.startswith('p-')
                    or png.startswith('ico-')
                    or png in ('cc-by', 'xhtml', 'css', 'open-knowledge', 'edit'))]
    for brand in glob.glob(outputdir + '/*.brand.html'):
        pure = lastSlash(brand).split('.')[0]
        img = pure.lower().replace(' ', '')
        if img in pngs:
            pic = '<div class="wider"><a href="{0}.brand.html"><img class="abc" src="{1}" alt="{0}"/></a><span>{0}</span></div>'.format(
                pure, 'stuff/' + img + '.png')
            pngs.remove(img)
            icons.append(pic)
        else:
            # print('No image for', pure)
            pass
    corner = {'ada': 'TRI-Ada', 'comparch': 'CompArch', 'floc': 'FLoC', 'bibsleigh': 'index'}
    for pure in pngs:
        venueCandidate = corner[pure] if pure in corner else pure.upper()
        canlink = sorted(glob.glob(outputdir + '/' + venueCandidate + '*.html'), key=len)
        if canlink:
            pic = '<div class="wider"><a href="{0}"><img class="abc" src="stuff/{1}.png" alt="{2}"/></a><span>{2}</span></div>'.format(
                canlink[0].split('/')[-1], pure, venueCandidate)
        elif pure == 'twitter':
            pic = '<div class="wider"><a href="https://about.twitter.com/company/brand-assets"><img class="abc" src="stuff/twitter.png" alt="Twitter"/></a><span>Twitter</span></div>'
        elif pure == 'email':
            pic = '<div class="wider"><a href="mailto:[email protected]"><img class="abc" src="stuff/email.png" alt="e-mail"/></a><span>email</span></div>'
        else:
            print('Lonely', pure)
            pic = '<img class="abc" src="stuff/{0}.png" alt="{0}"/>'.format(pure)
        icons.append(pic)
    # find last year of each venue
    # for ven in glob.glob(corpusdir + '/*'):
    #     venname = lastSlash(ven)
    #     newstuff += '<strong><a href="http://dblp.uni-trier.de/db/conf/{}/">{} {}</a></strong>, '.format(venname.lower(), venname, nextYear(ven))
    #     print(lastSlash(ven), ':', lastYear(ven))
    # write the "more info" file
    f = open(outputdir + '/about.html', 'w', encoding='utf-8')
    f.write(aboutHTML.format(
        len(icons),
        '<div class="minibar">' + '\n'.join(sorted(icons)) + '</div>'))
    f.close()
    # generate the DBLP sync page
    cell_by_conf_by_year = {}
    Ys = [2020, 2019, 2018, 2017, 2016, 2015, 2014, 2013, 2012, 2011, 2010, 2009]
    dblplinks = {}
    with open(ienputdir + '/meta/dblpguide.sync', 'r') as f:
        for line in f:
            if not line or line.startswith('#'):
                continue
            words = line.split('|')
            if len(words) != 3:
                print('- Metaline {} skipped!'.format(words))
                continue
            name = words[0].strip()
            dome = words[1].strip()
            dblp = words[2].strip()
            cell_by_conf_by_year[name] = {}
            dblplinks[name] = dblp
            for y in Ys:
                cell_by_conf_by_year[name][y] = '(no)'
            v = sleigh.getVenue(dome)
            if v:
                for yy in Ys:
                    y = v.getYear(yy)
                    if y:
                        ckey = '{}-{}'.format(name, yy)
                        c = y.getConf(ckey)
                        if c:
                            cell_by_conf_by_year[name][yy] = c.getIconItem2('', '')
                        else:
                            # print('- Conference {} of year {} in venue {} not found in the corpus'.format(ckey, yy, name))
                            for alt in 'v1', 'p1', 'c1', '1', 'J':
                                ckey = '{}-{}-{}'.format(name, alt, yy)
                                c = y.getConf(ckey)
                                if c:
                                    cell_by_conf_by_year[name][yy] = c.getIconItem2('', '')
                                    break
                    # else:
                    #     print('- Year {} in venue {} not found in the corpus among {}'.format(yy, name, [z.year for z in v.years]))
            # else:
            #     print('- Venue {} not found in the corpus'.format(name))
    table = '<table>'
    table += '<tr><td></td>'
    for y in Ys:
        table += '<th>{}</th>\n'.format(y)
    table += '</tr>'
    for name in sorted(cell_by_conf_by_year.keys()):
        table += '<tr><th><a href="{}.brand.html">[@]</a> <a href="{}">{}</a></th>'.format(
            name, dblplinks[name], name)
        for y in Ys:
            table += '<td>{}</td>\n'.format(cell_by_conf_by_year[name][y])
        table += '</tr>'
    table += '</table>'
    with open(outputdir + '/sync.html', 'w', encoding='utf-8') as f:
        f.write(syncHTML.format(table))
    print('{}\nDone with {} venues, {} papers.'.format(
        C.purple('=' * 42),
        C.red(len(sleigh.venues)),
        C.red(sleigh.numOfPapers())))
def checkon(fn, o):
    if not os.path.exists(fn) or os.path.isdir(fn):
        fn = fn + '.json'
    f = open(fn, 'r', encoding='utf-8')
    lines = f.readlines()[1:-1]
    f.close()
    flines = json2lines(lines)
    plines = sorted(json2lines(o.getJSON().split('\n')))
    # bad variants
    for bad in unfoldName:
        for key in wheretolook:
            if o.get(key) == bad:
                o.json[key] = unfoldName[bad]
    # contractions
    for short in short2long:
        for key in wheretolook:
            if o.get(key) == short:
                o.json[key] = short2long[short]
            if o.get(key) == short2long[short]:
                o.json[key + 'short'] = short
    # a heuristic contraction for conference names
    if o.get('type') == 'inproceedings' \
            and 'booktitleshort' not in o.json.keys() \
            and 'booktitle' in o.up().json.keys() \
            and len(o.get('booktitle')) > len(o.up().get('booktitle')):
        o.json['booktitleshort'] = o.up().get('booktitle')
    # a heuristic expansion of conference names
    # if o.get('type') == 'proceedings' \
    #         and 'booktitleshort' not in o.json.keys() \
    #         and 'booktitle' in o.up().json.keys() \
    #         and len(o.get('booktitle')) > len(o.up().get('booktitle')):
    #     o.json['booktitleshort'] = o.up().get('booktitle')
    # remove faulty series: journal wins
    if 'series' in o.json and 'journal' in o.json and o.get('series') == o.get('journal'):
        del o.json['series']
    # drop legacy *short keys when the long version is no longer present
    for key in [k for k in o.json.keys() if k.endswith('short') and k[:-5] not in o.json.keys()]:
        del o.json[key]
    # Springer name change
    if o.get('publisher').find('Springer') > -1 and 'year' in o.json.keys():
        if int(o.get('year')) < 2002:
            o.json['publisher'] = 'Springer-Verlag'
            o.json['publishershort'] = 'Springer'
        else:
            o.json['publisher'] = 'Springer International Publishing'
            o.json['publishershort'] = 'Springer'
    for key in wheretolook:
        if key not in o.json:
            continue
        val = o.get(key)
        # ends with a dot
        if val.endswith('.'):
            o.json[key] = o.json[key][:-1]
            continue
        # suspiciousness
        if val.find('.') > -1:
            problem = True
            for ok in ('. Volume', 'CEUR-WS.org', 'icml.cc', 'JMLR.org', 'Vol. ', '. Part',
                       ' Inc. ', 'WG2.8'):
                if val.find(ok) > -1:
                    problem = False
                    break
            if problem:
                report(C.yellow('LOOK'), key + ' of ' + o.getKey() + ' is “' + o.get(key) + '”')
        # superfluousness
        if key + 'short' in o.json.keys() and val == o.get(key + 'short'):
            del o.json[key + 'short']
    nlines = sorted(json2lines(o.getJSON().split('\n')))
    if flines != plines:
        return 1
    elif plines != nlines:
        f = open(fn, 'w', encoding='utf-8')
        f.write(o.getJSON())
        f.close()
        return 2
    else:
        return 0
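# Examples of the normalisations above (values are illustrative):
#   publisher containing 'Springer', year 1999 => 'Springer-Verlag'
#   publisher containing 'Springer', year 2015 => 'Springer International Publishing'
#   (both also set publishershort = 'Springer')
#   a title like 'Proc. of Something.' gets its trailing dot stripped now;
#   the remaining inner dot is only flagged as LOOK on a later run, because
#   of the `continue` after the strip.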
# flatten conferences for easy lookup
knownConfs = []
for v in sleigh.venues:
    for c in v.getConfs():
        knownConfs.append(c.getKey())
# print(knownConfs)
print(C.purple('BibSLEIGH flattened to {} entities'.format(len(knownConfs))))
# compressed error output
dunno = []
# Conference;Year;First Name;Last Name;Sex;Role
for line in csv:
    name = (line[2] + ' ' + line[3]).strip()
    if name in established.keys():
        name = established[name]
    if name in renameto.keys():
        print('[', C.yellow('ALIA'), ']', 'Treating', name, 'as', renameto[name])
        established[name] = renameto[name]
        name = renameto[name]
    if name not in peoplekeys:
        # not really needed, but just for the sake of wider applicability in the future
        ndl = nomidnames(nodiaLatin(name)).lower()
        f = None
        for k in peoplekeys:
            if nomidnames(nodiaLatin(k)).lower() == ndl:
                f = k
                break
        if not f:
            if name not in dunno:
                print('[', C.red('PERS'), ']', 'Unacquainted with', name)
                dunno.append(name)
            continue
if __name__ == "__main__": verbose = sys.argv[-1] == '-v' peoplez = glob.glob(ienputdir + '/people/*.json') print('{}: {} venues, {} papers by {} people\n{}'.format(\ C.purple('BibSLEIGH'), C.red(len(sleigh.venues)), C.red(sleigh.numOfPapers()), C.red(len(peoplez)), C.purple('='*42))) cx = {0: 0, 1: 0, 2: 0} # stem ALL the papers! for v in sleigh.venues: for c in v.getConfs(): for p in c.papers: cx[checkreport(p.filename, p, None)] += 1 for b in v.getBrands(): cx[checkreport(b.filename, None, b)] += 1 # write all stems listOfStems = sorted(filter(ifApproved, ALLSTEMS), key=lambda w: two(len(w)) + w) f = open(ienputdir + '/stems.json', 'w', encoding='utf-8') f.write('[\n\t"' + '",\n\t"'.join(listOfStems) + '"\n]') f.close() print(C.red(len(ALLSTEMS)), 'stems found.') print('{} files checked, {} ok, {} fixed, {} failed'.format(\ C.bold(cx[0] + cx[1] + cx[2]), C.blue(cx[0]), C.yellow(cx[2]), C.red(cx[1])))