def checkreport(fn, o):
    """Check one entry with ``checkon`` and report the outcome.

    The integer returned by ``checkon(fn, o)`` selects the status label
    (0 -> PASS, 1 -> FAIL, 2 -> FIXD) and is returned unchanged.
    """
    labels = (C.blue('PASS'), C.red('FAIL'), C.yellow('FIXD'))
    code = checkon(fn, o)
    # Quiet by default: a clean pass is only mentioned in verbose mode.
    if not verbose and code == 0:
        return code
    report(labels[code], fn)
    return code
def checkreport(fn, o):
    """Check one entry and print its status together with a message.

    ``checkon(fn, o)`` is expected to return a ``(code, message)`` pair; the
    code selects the label (0 -> PASS, 1 -> FAIL, 2 -> WARN) and is returned.
    """
    labels = (C.blue('PASS'), C.red('FAIL'), C.yellow('WARN'))
    code, msg = checkon(fn, o)
    # Quiet by default: passes are printed only when verbose is on.
    if not verbose and code == 0:
        return code
    line = '[ {} ] {}: {}'.format(labels[code], fn, msg)
    print(line)
    return code
def report(fn, r):
    """Print the status of ``fn`` for result code ``r`` and return ``r``.

    Codes 1 and 2 carry an extra remark about the crossref; code 0 is only
    printed in verbose mode.
    """
    labels = (C.blue('PASS'), C.red('FAIL'), C.yellow('UNEX'))
    remarks = ('', '- no crossref found!', '- illegal crossref')
    # Quiet by default: nothing is printed for a pass unless verbose is on.
    if not verbose and r == 0:
        return r
    line = '[ {} ] {} {}'.format(labels[r], fn, remarks[r])
    print(line)
    return r
def checkreport(m, o):
    """Run ``checkon(m, o)`` and print the entry's filename with its status.

    Returns the code produced by ``checkon`` (0 -> PASS, 1 -> FAIL,
    2 -> FIXD).
    """
    labels = (C.blue('PASS'), C.red('FAIL'), C.yellow('FIXD'))
    code = checkon(m, o)
    # Quiet by default: passes are printed only when verbose is on.
    if not verbose and code == 0:
        return code
    line = '[ {} ] {}'.format(labels[code], o.filename)
    print(line)
    return code
def checkreport(fn, o):
    """Run ``checkon(fn, o)`` and print ``fn`` with the resulting status.

    Returns the code produced by ``checkon`` (0 -> PASS, 1 -> FAIL,
    2 -> FIXD).
    """
    labels = (C.blue('PASS'), C.red('FAIL'), C.yellow('FIXD'))
    code = checkon(fn, o)
    # Quiet by default: passes are printed only when verbose is on.
    if not verbose and code == 0:
        return code
    line = '[ {} ] {}'.format(labels[code], fn)
    print(line)
    return code
def checkreport(fn, o, br):
    """Check either a brand or a regular entry and report the outcome.

    When ``br`` is truthy it is checked with ``checkbrand(fn, br)``;
    otherwise ``o`` is checked with ``checkon(fn, o)``. The resulting code
    (0 -> PASS, 1 -> FAIL, 2 -> FIXD) is returned.
    """
    labels = (C.blue('PASS'), C.red('FAIL'), C.yellow('FIXD'))
    code = checkbrand(fn, br) if br else checkon(fn, o)
    # Quiet by default: passes are printed only when verbose is on.
    if not verbose and code == 0:
        return code
    line = '[ {} ] {}'.format(labels[code], fn)
    print(line)
    return code
def checkreport(fn, o):
    """Report a status for ``fn``, computing it first if necessary.

    If ``o`` is already an ``int`` it is taken as the result code; otherwise
    the code comes from ``checkon(fn, o)``. The code (0 -> PASS, 1 -> FAIL,
    2 -> FIXD) is returned.
    """
    labels = (C.blue('PASS'), C.red('FAIL'), C.yellow('FIXD'))
    code = o if isinstance(o, int) else checkon(fn, o)
    # Quiet by default: passes are printed only when verbose is on.
    if not verbose and code == 0:
        return code
    line = '[ {} ] {}'.format(labels[code], fn)
    print(line)
    return code
def checkon(m, o):
    """Reconcile entry ``o`` with the common model ``m`` and save any fixes.

    ``m`` is normalised in place before the comparison: paper-level types
    are mapped to ``proceedings``/``book``, ``crossref`` and
    ``booktitleshort`` are dropped, and ``booktitle`` becomes ``title``.
    Every key of ``m`` whose value differs from ``o.get(key)`` is settled
    (via ``heurichoose`` when ``o.json`` already has the key, otherwise the
    model value wins) and written back to the entry's JSON file.

    Returns:
        0 if nothing had to change (or ``o.filename`` does not exist),
        1 if there is no model or the file on disk differs from
          ``o.getJSON()``,
        2 if the entry was updated and rewritten.
    """
    # if no common model found, we failed
    if not m:
        return 1
    if m.get('type') in ('inproceedings', 'article'):
        m['type'] = 'proceedings'
    if m.get('type') == 'incollection':
        m['type'] = 'book'
    if 'crossref' in m:
        del m['crossref']
    if 'booktitle' in m:
        m['title'] = m['booktitle']
        del m['booktitle']
    if 'booktitleshort' in m:
        # TODO: ???
        del m['booktitleshort']
    fixes = {}
    for k in m.keys():
        if o.get(k) == m[k]:
            if verbose:
                print(C.blue('Confirmed: '), k, 'as', m[k])
            continue
        if verbose:
            print(C.red('Conflicted: '), k, 'as', m[k], 'vs', o.get(k))
        v = heurichoose(k, m[k], o.json[k]) if k in o.json else m[k]
        if verbose:
            print(C.yellow('Settled for:'), v)
        fixes[k] = v
    if not fixes:
        # every key was confirmed, nothing to fix
        return 0
    if not os.path.exists(o.filename):
        return 0
    fn = o.filename + '.json' if os.path.isdir(o.filename) else o.filename
    if os.path.exists(fn):
        with open(fn, 'r', encoding='utf-8') as f:
            stored = f.read()
        if stored != o.getJSON():
            # strange, should be equal (run all normalisers first!)
            return 1
    for k in fixes:
        o.json[k] = fixes[k]
    # the context manager closes the file even if serialisation fails
    with open(fn, 'w', encoding='utf-8') as f:
        f.write(o.getJSON())
    return 2
def checkon(m, o):
    """Reconcile entry ``o`` with the common model ``m`` and save any fixes.

    ``m`` is normalised in place before the comparison: paper-level types
    are mapped to ``proceedings``/``book``, ``crossref`` and
    ``booktitleshort`` are dropped, and ``booktitle`` becomes ``title``.
    Every key of ``m`` whose value differs from ``o.get(key)`` is settled
    (via ``heurichoose`` when ``o.json`` already has the key, otherwise the
    model value wins) and written back to the entry's JSON file.

    Returns:
        0 if nothing had to change (or ``o.filename`` does not exist),
        1 if there is no model or the file on disk differs from
          ``o.getJSON()``,
        2 if the entry was updated and rewritten.
    """
    # if no common model found, we failed
    if not m:
        return 1
    if m.get('type') in ('inproceedings', 'article'):
        m['type'] = 'proceedings'
    if m.get('type') == 'incollection':
        m['type'] = 'book'
    if 'crossref' in m:
        del m['crossref']
    if 'booktitle' in m:
        m['title'] = m['booktitle']
        del m['booktitle']
    if 'booktitleshort' in m:
        # TODO: ???
        del m['booktitleshort']
    settled = {}
    for k in m.keys():
        if o.get(k) == m[k]:
            if verbose:
                print(C.blue('Confirmed: '), k, 'as', m[k])
            continue
        if verbose:
            print(C.red('Conflicted: '), k, 'as', m[k], 'vs', o.get(k))
        v = heurichoose(k, m[k], o.json[k]) if k in o.json else m[k]
        if verbose:
            print(C.yellow('Settled for:'), v)
        settled[k] = v
    if not settled:
        # every key was confirmed, nothing to fix
        return 0
    if not os.path.exists(o.filename):
        return 0
    fn = o.filename + '.json' if os.path.isdir(o.filename) else o.filename
    # Explicit UTF-8 on both read and write, so the comparison and the saved
    # file do not depend on the platform's default encoding.
    if os.path.exists(fn):
        with open(fn, 'r', encoding='utf-8') as f:
            stored = f.read()
        if stored != o.getJSON():
            # strange, should be equal (run all normalisers first!)
            return 1
    for k in settled:
        o.json[k] = settled[k]
    with open(fn, 'w', encoding='utf-8') as f:
        f.write(o.getJSON())
    return 2
else: paperPdf = '' paperEntry = {'type': 'inproceedings', 'series': 'CEUR Workshop Proceedings',\ 'publisher': 'CEUR-WS.org', 'year': volYear, 'booktitle': volTitles[-1],\ 'editor': volEds, 'volume': volNr.split('-')[-1], 'title': paperTitle,\ 'author': paperAuths, 'pages': paperPages, 'venue': volVenue} if paperPdf: paperEntry['openpdf'] = paperPdf if paperLnk: paperEntry['url'] = urlstart + '#' + paperLnk paperFilename = lastSlash(outputdir) + '-' + paperAuths[0].split( ' ')[-1] for a in paperAuths[1:]: print(a) paperFilename += a.split(' ')[-1][0] if paperFilename in done: paperFilename += 'a' while paperFilename in done: paperFilename = paperFilename[:-1] + chr( ord(paperFilename[-1]) + 1) # print(jsonify(paperEntry), '-->', outputdir+'/'+paperFilename+'.json') f = open(outputdir + '/' + paperFilename + '.json', 'w', encoding='utf-8') f.write(jsonify(paperEntry)) f.close() cx += 1 done.append(paperFilename) print(C.red(volVenue), '-', C.yellow(volTitles[-1]), '-', C.blue(cx), 'papers.')
def checkon(fn, o):
    """Derive a ``[town, state, country]`` address for ``o`` from its title.

    The proceedings title is looked up in ``procs`` under the entry's
    ``dblpkey`` (the first one if there are several), split into
    comma-separated chunks, and searched for a known country, a USA state
    or one of the ``desperateSolutions``. ``fn`` is not used.

    Returns:
        0 if an address was found and equals the stored ``address``,
        1 if the DBLP key is missing/unknown or no place was recognised,
        2 if ``address`` was updated and the entry written to
          ``o.json['FILE']``.
    """
    if 'dblpkey' not in o.json:
        print('[ {} ] {}'.format(C.red('DONT'), 'DBLP key not found on the entry'))
        return 1
    mykey = o.get('dblpkey')
    # for the rare case of multiple dblpkeys
    # (can happen as a DBLP error or when same proceedings span over multiple volumes)
    if isinstance(mykey, list):
        mykey = mykey[0]
    if mykey not in procs:
        print('[ {} ] {}'.format(C.red('DONT'), 'DBLP key not found in the dump'))
        return 1
    title = procs[mykey]
    if title.endswith('.'):
        title = title[:-1]
    ws = title.replace(' - ', ', ').replace(' (', ', ').split(', ')
    country = findOneIn(knownCountries, ws)
    state = findOneIn(usaStateNames, ws)
    found = False
    if country:
        # By default the town is the chunk right before the country. The
        # checks below run in sequence on purpose: each may move ``town``
        # one chunk to the left, and later checks see the new value.
        town = ws[ws.index(country) - 1]
        state = '?'
        # what if "town" is an USA state? (full)
        if country == 'USA' and town in usaStateNames:
            state = town
            town = ws[ws.index(town) - 1]
        # what if "town" is an USA state? (abbreviated)
        if country == 'USA' and town in usaStateAB:
            state = usaStateNames[usaStateAB.index(town)]
            town = ws[ws.index(town) - 1]
        # what if "town" is a Canadian state? (full)
        if country == 'Canada' and town in canStateNames:
            state = town
            town = ws[ws.index(town) - 1]
        # what if "town" is a Canadian state? (abbreviated)
        if country == 'Canada' and town in canStateAB:
            state = canStateNames[canStateAB.index(town)]
            town = ws[ws.index(town) - 1]
        # the same can happen in the UK
        if country in ('UK', 'United Kingdom') and town in ('Scotland', 'Scottland'):
            state = town
            town = ws[ws.index(town) - 1]
        # Georgia the country vs Georgia the state
        if country == 'Georgia' and town == 'Atlanta':
            state = country
            country = 'USA'
        # near Something
        if town.startswith('near '):
            town = ws[ws.index(town) - 1]
        # Luxembourg, Luxembourg
        if country == 'Luxembourg':
            town = 'Luxembourg'
        # Saint-Malo / St. Malo
        if country == 'France' and town == 'St. Malo':
            town = 'Saint-Malo'
        # Florence / Firenze
        if country == 'Italy' and town.find('Firenze') > -1:
            town = 'Florence'
        found = True
    elif state:
        country = 'USA'
        town = ws[ws.index(state) - 1]
        found = True
    else:
        # desperate times (no break: the last matching solution wins)
        for sol in desperateSolutions.keys():
            if sol in ws:
                town, state, country = desperateSolutions[sol]
                found = True
    # normalise
    if country in countryMap:
        country = countryMap[country]
    if country == 'United Kingdom' and state == '?':
        if town.endswith('London') or town in (
                'Birmingham', 'York', 'Coventry', 'Nottingham', 'Lancaster',
                'Oxford', 'Manchester', 'Southampton', 'Norwich', 'Leicester',
                'Canterbury'):
            state = 'England'
        elif town in ('Edinburgh', 'Glasgow'):
            state = 'Scotland'
    # report
    if 'address' in o.json:
        print('[ {} ] {}'.format(C.blue('OLDA'), o.get('address')))
    if 'location' in o.json:
        print('[ {} ] {}'.format(C.blue('OLDL'), o.get('location')))
    if not found:
        print('[ {} ] {}'.format(C.yellow('AD??'), title))
        return 1
    print('[ {} ] {}'.format(C.blue('AD||'), title))
    print('[ {} ] {:30} || {:30} || {:20}'.format(
        C.blue('AD->'), C.yellow(town), C.yellow(state), C.yellow(country)))
    # TODO: perhaps later we can act more aggressively
    newaddr = [town, '' if state == '?' else state, country]
    if 'address' in o.json and newaddr == o.json['address']:
        # nothing changed
        return 0
    o.json['address'] = newaddr
    # Explicit UTF-8 keeps non-ASCII place names independent of the platform
    # default; the context manager closes the file even if writing fails.
    with open(o.json['FILE'], 'w', encoding='utf-8') as f:
        f.write(o.getJSON())
    return 2
# allstems += x.getBareStems() # siblings = {stem:allstems.count(stem) for stem in allstems if stem != k and ifApproved(stem)} # NB: the following code is faster: siblings = Counter() for x in stems[k]: siblings.update([s for s in x.getBareStems() if s != k and ifApproved(s)]) box = '<code>Used together with:</code><hr/>' + \ '\n<br/>'.join(['<span class="tag"><a href="{0}.html">{0}</a></span> ({1})'.format(\ *sn) for sn in siblings.most_common(5)]) f.write(wordHTML.format(\ stem=k, inthebox=box, listname='{} papers'.format(len(lst)), dl='<dl class="toc">' + '\n'.join(lst).replace('href="', 'href="../') + '</dl>')) f.close() print('Word pages:', C.yellow('{}'.format(len(stems))), C.blue('generated')) # stem index f = open(outputdir+'/words.html', 'w', encoding='utf-8') keyz = [k for k in stems.keys() if len(stems[k]) > 100 and ifApproved(k)] keyz.sort(key=lambda t: -len(t), reverse=True) lst = ['<li><a href="word/{}.html">{}</a>$ ({})</li>'.format(\ escape(t), t, len(stems[t])) for t in keyz] ul = '<ul class="tri">' + '\n'.join(lst) + '</ul>' CX = sum([len(stems[t]) for t in stems.keys()]) f.write(wordlistHTML.format(\ title='All known stems', listname='{} stems known and {} shown from {} notable words'.format(len(stems), len(keyz), CX), ul=ul)) f.close() print('Stem index:', C.blue('created')) print('{}\nDone with {} venues, {} papers, {} tags.'.format(\
# allstems += x.getBareStems() # siblings = {stem:allstems.count(stem) for stem in allstems if stem != k and ifApproved(stem)} # NB: the following code is faster: siblings = Counter() for x in stems[k]: siblings.update([s for s in x.getBareStems() if s != k and ifApproved(s)]) box = '<code>Used together with:</code><hr/>' + \ '\n<br/>'.join(['<span class="tag"><a href="{0}.html">{0}</a></span> ({1})'.format(\ *sn) for sn in siblings.most_common(5)]) f.write(wordHTML.format(\ stem=k, inthebox=box, listname='{} papers'.format(len(lst)), dl='<dl class="toc">' + '\n'.join(lst).replace('href="', 'href="../') + '</dl>')) f.close() print('Word pages:', C.yellow('{}'.format(len(stems))), C.blue('generated')) # stem index f = open(outputdir+'/words.html', 'w') keyz = [k for k in stems.keys() if len(stems[k]) > 100 and ifApproved(k)] keyz.sort(key=lambda t: -len(t), reverse=True) lst = ['<li><a href="word/{}.html">{}</a>$ ({})</li>'.format(\ escape(t), t, len(stems[t])) for t in keyz] ul = '<ul class="tri">' + '\n'.join(lst) + '</ul>' CX = sum([len(stems[t]) for t in stems.keys()]) f.write(wordlistHTML.format(\ title='All known stems', listname='{} stems known and {} shown from {} notable words'.format(len(stems), len(keyz), CX), ul=ul)) f.close() print('Stem index:', C.blue('created')) print('{}\nDone with {} venues, {} papers, {} tags.'.format(\
cx[1] += 1 return dblpLatin(s)+':' ws = s.split(' ') i = -1 if ws[i] in ('Jr', 'Jr.'): i -= 1 sur = dblpLatin(' '.join(ws[i:])) rest = dblpLatin(' '.join(ws[:i])).replace(' ', '_') for c in ".'-": rest = rest.replace(c, '=') return sur+':'+rest if __name__ == "__main__": verbose = sys.argv[-1] == '-v' if not os.path.exists('_renameto.json'): print('Run', C.blue('refine-aliases.py'), 'to build the aliasing/renaming relation and cache it.') sys.exit(1) # aka = parseJSON(ienputdir + '/aliases.json') dis = parseJSON(ienputdir + '/disambig.json') renameto = parseJSON('_renameto.json') # Data from the conferenceMetrics repo csv = [] f = open('../conferenceMetrics/data/SE-conf-roles.csv', 'r') for line in f.readlines(): # Conference;Year;First Name;Last Name;Sex;Role csv.append(line.strip().split(';')) f.close() f = open('scrap-committees/scraped-by-grammarware.csv', 'r') for line in f.readlines(): csv.append(line.strip().split(';')) f.close()
return r if __name__ == "__main__": verbose = sys.argv[-1] == '-v' print('{}: {} venues, {} papers\n{}'.format(\ C.purple('BibSLEIGH'), C.green(len(sleigh.venues)), C.green(sleigh.numOfPapers()), C.purple('='*42))) aka = parseJSON(ienputdir + '/aliases.json') CX = sum([len(aka[a]) for a in aka]) # self-adaptation heuristic: # if a manual rule does the same as the other heuristic, it’s dumb for a in sorted(aka.keys()): if len(aka[a]) == 1 and aka[a][0] in (nodiaLatin(a), simpleLatin(a)): print('[ {} ]'.format(C.blue('DUMB')), simpleLatin(a), 'aliasing was unnecessary manual work') elif len(aka[a]) == 2 and (aka[a] == [nodiaLatin(a), simpleLatin(a)] \ or aka[a] == [simpleLatin(a), nodiaLatin(a)]): print('[ {} ]'.format(C.blue('DUMB')), simpleLatin(a), 'aliasing was a lot of unnecessary manual work') elif nodiaLatin(a) in aka[a] or simpleLatin(a) in aka[a]: print('[ {} ]'.format(C.blue('DUMB')), simpleLatin(a), 'aliasing contains some unnecessary manual work') # auto-aliasing heuristic: # for each author with diacritics, its non-diacritic twin is considered harmful people = set() for v in sleigh.venues: for c in v.getConfs(): if 'editor' in c.json: people.update(listify(c.json['editor'])) for p in c.papers: if 'author' in p.json: people.update(listify(p.json['author']))
title = tagdef['namefull'] if 'namefull' in tagdef.keys() else tagdef['name'] subt = ('<br/><em>'+tagdef['namelong']+'</em>') if 'namelong' in tagdef.keys() else '' links = '<strong>{}</strong>{}<hr/>'.format(title, subt) + '\n'.join(sorted(links)) dl = '<dl class="toc">' + '\n'.join(lst) + '</dl>' # hack to get from tags to papers dl = dl.replace('href="', 'href="../') f.write(tagHTML.format(\ title=key+' tag', etag=escape(key), tag=key, above='', boxlinks=links, listname='{} papers'.format(len(lst)), dl=dl)) f.close() print('Tag pages:', C.yellow('{}'.format(len(ts))), C.blue('generated')) # tag index f = open(outputdir+'/tag/index.html', 'w') keyz = [q for q in ts.keys() if len(ts[q]) > 2] keyz.sort(key=lambda t: len(ts[t]), reverse=True) lst = ['<li>#<a href="{}.html">{}</a> ({})</li>'.format(escape(t), t, len(ts[t])) for t in keyz] ul = '<ul class="tri mul">' + '\n'.join(lst) + '</ul>' CX = sum([len(ts[t]) for t in ts.keys()]) f.write(taglistHTML.format(\ title='All known tags', listname='{} tags known from {} markings'.format(len(ts), CX), ul=ul)) f.close() print('Tag index:', C.blue('created')) # untagged papers f = open(outputdir+'/tag/untagged.html', 'w')
def checkon(fn, o):
    """Derive a ``[town, state, country]`` address for ``o`` from its title.

    The proceedings title is looked up in ``procs`` under the entry's
    ``dblpkey`` (the first one if there are several), split into
    comma-separated chunks, and searched for a known country, a USA state
    or one of the ``desperateSolutions``. ``fn`` is not used.

    Returns:
        0 if an address was found and equals the stored ``address``,
        1 if the DBLP key is missing/unknown or no place was recognised,
        2 if ``address`` was updated and the entry written to
          ``o.json['FILE']``.
    """
    if 'dblpkey' not in o.json:
        print('[ {} ] {}'.format(C.red('DONT'), 'DBLP key not found on the entry'))
        return 1
    mykey = o.get('dblpkey')
    # for the rare case of multiple dblpkeys
    # (can happen as a DBLP error or when same proceedings span over multiple volumes)
    if isinstance(mykey, list):
        mykey = mykey[0]
    if mykey not in procs:
        print('[ {} ] {}'.format(C.red('DONT'), 'DBLP key not found in the dump'))
        return 1
    title = procs[mykey]
    if title.endswith('.'):
        title = title[:-1]
    ws = title.replace(' - ', ', ').replace(' (', ', ').split(', ')
    country = findOneIn(knownCountries, ws)
    state = findOneIn(usaStateNames, ws)
    found = False
    if country:
        # By default the town is the chunk right before the country. The
        # checks below run in sequence on purpose: each may move ``town``
        # one chunk to the left, and later checks see the new value.
        town = ws[ws.index(country) - 1]
        state = '?'
        # what if "town" is an USA state? (full)
        if country == 'USA' and town in usaStateNames:
            state = town
            town = ws[ws.index(town) - 1]
        # what if "town" is an USA state? (abbreviated)
        if country == 'USA' and town in usaStateAB:
            state = usaStateNames[usaStateAB.index(town)]
            town = ws[ws.index(town) - 1]
        # what if "town" is a Canadian state? (full)
        if country == 'Canada' and town in canStateNames:
            state = town
            town = ws[ws.index(town) - 1]
        # what if "town" is a Canadian state? (abbreviated)
        if country == 'Canada' and town in canStateAB:
            state = canStateNames[canStateAB.index(town)]
            town = ws[ws.index(town) - 1]
        # the same can happen in the UK
        if country in ('UK', 'United Kingdom') and town in ('Scotland', 'Scottland'):
            state = town
            town = ws[ws.index(town) - 1]
        # Georgia the country vs Georgia the state
        if country == 'Georgia' and town == 'Atlanta':
            state = country
            country = 'USA'
        # near Something
        if town.startswith('near '):
            town = ws[ws.index(town) - 1]
        # Luxembourg, Luxembourg
        if country == 'Luxembourg':
            town = 'Luxembourg'
        # Saint-Malo / St. Malo
        if country == 'France' and town == 'St. Malo':
            town = 'Saint-Malo'
        # Florence / Firenze
        if country == 'Italy' and town.find('Firenze') > -1:
            town = 'Florence'
        found = True
    elif state:
        country = 'USA'
        town = ws[ws.index(state) - 1]
        found = True
    else:
        # desperate times (no break: the last matching solution wins)
        for sol in desperateSolutions.keys():
            if sol in ws:
                town, state, country = desperateSolutions[sol]
                found = True
    # normalise
    if country in countryMap:
        country = countryMap[country]
    if country == 'United Kingdom' and state == '?':
        if town.endswith('London') or town in (
                'Birmingham', 'York', 'Coventry', 'Nottingham', 'Lancaster',
                'Oxford', 'Manchester', 'Southampton', 'Norwich', 'Leicester',
                'Canterbury'):
            state = 'England'
        elif town in ('Edinburgh', 'Glasgow'):
            state = 'Scotland'
    # report
    if 'address' in o.json:
        print('[ {} ] {}'.format(C.blue('OLDA'), o.get('address')))
    if 'location' in o.json:
        print('[ {} ] {}'.format(C.blue('OLDL'), o.get('location')))
    if not found:
        print('[ {} ] {}'.format(C.yellow('AD??'), title))
        return 1
    print('[ {} ] {}'.format(C.blue('AD||'), title))
    print('[ {} ] {:30} || {:30} || {:20}'.format(
        C.blue('AD->'), C.yellow(town), C.yellow(state), C.yellow(country)))
    # TODO: perhaps later we can act more aggressively
    newaddr = [town, '' if state == '?' else state, country]
    if 'address' in o.json and newaddr == o.json['address']:
        # nothing changed
        return 0
    o.json['address'] = newaddr
    # the context manager closes the file even if serialisation fails
    with open(o.json['FILE'], 'w', encoding='utf-8') as f:
        f.write(o.getJSON())
    return 2
def report(one, two):
    """Print ``one`` as a bracketed tag followed by ``two``."""
    line = '[ {} ] {}'.format(one, two)
    print(line)


def checkreport(fn, o):
    """Check one entry with ``checkon`` and report the outcome.

    The integer returned by ``checkon(fn, o)`` selects the status label
    (0 -> PASS, 1 -> FAIL, 2 -> FIXD) and is returned unchanged.
    """
    labels = (C.blue('PASS'), C.red('FAIL'), C.yellow('FIXD'))
    code = checkon(fn, o)
    # Quiet by default: a clean pass is only mentioned in verbose mode.
    if not verbose and code == 0:
        return code
    report(labels[code], fn)
    return code


if __name__ == "__main__":
    # The first command-line argument, when present, decides verbosity.
    if len(sys.argv) > 1:
        verbose = sys.argv[1] == '-v'
    print('{}: {} venues, {} papers\n{}'.format(
        C.purple('BibSLEIGH'),
        C.red(len(sleigh.venues)),
        C.red(sleigh.numOfPapers()),
        C.purple('=' * 42)))
    # Tally of result codes: 0 = ok, 1 = failed, 2 = fixed.
    cx = {0: 0, 1: 0, 2: 0}
    for v in sleigh.venues:
        for c in v.getConfs():
            # each conference first, then every paper it contains
            cx[checkreport(c.filename, c)] += 1
            for p in c.papers:
                cx[checkreport(p.filename, p)] += 1
    print('{} files checked, {} ok, {} fixed, {} failed'.format(
        C.bold(cx[0] + cx[1] + cx[2]),
        C.blue(cx[0]),
        C.yellow(cx[2]),
        C.red(cx[1])))
def _writePage(path, text):
    """Write ``text`` to ``path`` as UTF-8, closing the file even on error."""
    with open(path, 'w', encoding='utf-8') as f:
        f.write(text)


def main():
    """Generate the static HTML site into ``outputdir``.

    Writes the index, one page per venue, brand, conference and paper, the
    "about" page with the icon line-up, and the DBLP sync table built from
    ``meta/dblpguide.sync``. Progress is printed to stdout.
    """
    print('{}: {} venues, {} papers\n{}'.format(
        C.purple('BibSLEIGH'), C.red(len(sleigh.venues)),
        C.red(sleigh.numOfPapers()), C.purple('=' * 42)))
    # generate the index
    _writePage(outputdir + '/index.html', sleigh.getPage())
    # generate all individual pages
    for v in sleigh.venues:
        r = C.blue(v.getKey())
        _writePage(outputdir + '/' + v.getKey() + '.html', v.getPage())
        if v.brands:
            r += '{' + '+'.join([C.blue(b.getKey()) for b in v.brands]) + '}'
            for b in v.brands:
                _writePage(outputdir + '/' + b.getKey() + '.brand.html', b.getPage())
        r += ' => '
        for c in v.getConfs():
            _writePage(outputdir + '/' + c.getKey() + '.html', c.getPage())
            for p in c.papers:
                _writePage(outputdir + '/' + p.getKey() + '.html', p.getPage())
            purekey = c.getKey().replace(v.getKey(), '').replace('-', ' ').strip()
            r += '{} [{}], '.format(purekey, C.yellow(len(c.papers)))
        print(r)
    # generate the icon lineup
    icons = []
    pngs = [lastSlash(png).split('.')[0]
            for png in glob.glob(outputdir + '/stuff/*.png')]
    # drop pictures whose name marks them as not being venue logos
    pngs = [png for png in pngs
            if not (png.startswith(('a-', 'p-', 'ico-'))
                    or png in ('cc-by', 'xhtml', 'css', 'open-knowledge', 'edit'))]
    # first pass: every brand page that has a picture of the same name
    for brand in glob.glob(outputdir + '/*.brand.html'):
        pure = lastSlash(brand).split('.')[0]
        img = pure.lower().replace(' ', '')
        if img in pngs:
            pic = '<div class="wider"><a href="{0}.brand.html"><img class="abc" src="{1}" alt="{0}"/></a><span>{0}</span></div>'.format(
                pure, 'stuff/' + img + '.png')
            pngs.remove(img)
            icons.append(pic)
    # second pass: the remaining pictures are linked to the shortest page
    # name starting with the venue candidate, or handled as special cases
    corner = {
        'ada': 'TRI-Ada',
        'comparch': 'CompArch',
        'floc': 'FLoC',
        'bibsleigh': 'index'
    }
    for pure in pngs:
        venueCandidate = corner[pure] if pure in corner else pure.upper()
        canlink = sorted(glob.glob(outputdir + '/' + venueCandidate + '*.html'), key=len)
        if canlink:
            pic = '<div class="wider"><a href="{0}"><img class="abc" src="stuff/{1}.png" alt="{2}"/></a><span>{2}</span></div>'.format(
                canlink[0].split('/')[-1], pure, venueCandidate)
        elif pure == 'twitter':
            pic = '<div class="wider"><a href="https://about.twitter.com/company/brand-assets"><img class="abc" src="stuff/twitter.png" alt="Twitter"/></a><span>Twitter</span></div>'
        elif pure == 'email':
            # NOTE(review): the address below looks like a placeholder left
            # by e-mail obfuscation - confirm the intended mailto target.
            pic = '<div class="wider"><a href="mailto:[email protected]"><img class="abc" src="stuff/email.png" alt="e-mail"/></a><span>email</span></div>'
        else:
            print('Lonely', pure)
            pic = '<img class="abc" src="stuff/{0}.png" alt="{0}"/>'.format(pure)
        icons.append(pic)
    # write "more info" file
    _writePage(
        outputdir + '/about.html',
        aboutHTML.format(
            len(icons),
            '<div class="minibar">' + '\n'.join(sorted(icons)) + '</div>'))
    # generate the DBLP sync page
    cell_by_conf_by_year = {}
    Ys = [
        2020, 2019, 2018, 2017, 2016, 2015, 2014, 2013, 2012, 2011, 2010, 2009
    ]
    dblplinks = {}
    with open(ienputdir + '/meta/dblpguide.sync', 'r', encoding='utf-8') as f:
        for line in f:
            # Lines read from a file keep their newline, so blank lines must
            # be detected after stripping; comments start with '#'.
            if not line.strip() or line.startswith('#'):
                continue
            words = line.split('|')
            if len(words) != 3:
                print('- Metaline {} skipped!'.format(words))
                continue
            name, dome, dblp = (w.strip() for w in words)
            cell_by_conf_by_year[name] = dict.fromkeys(Ys, '(no)')
            dblplinks[name] = dblp
            v = sleigh.getVenue(dome)
            if not v:
                continue
            for yy in Ys:
                y = v.getYear(yy)
                if not y:
                    continue
                # try the plain key first, then the known alternatives
                ckeys = ['{}-{}'.format(name, yy)]
                ckeys += ['{}-{}-{}'.format(name, alt, yy)
                          for alt in ('v1', 'p1', 'c1', '1', 'J')]
                for ckey in ckeys:
                    c = y.getConf(ckey)
                    if c:
                        cell_by_conf_by_year[name][yy] = c.getIconItem2('', '')
                        break
    rows = ['<table>', '<tr><td></td>']
    rows.extend('<th>{}</th>\n'.format(y) for y in Ys)
    rows.append('</tr>')
    for name in sorted(cell_by_conf_by_year.keys()):
        rows.append('<tr><th><a href="{}.brand.html">[@]</a> <a href="{}">{}</a></th>'.format(
            name, dblplinks[name], name))
        rows.extend('<td>{}</td>\n'.format(cell_by_conf_by_year[name][y]) for y in Ys)
        rows.append('</tr>')
    rows.append('</table>')
    _writePage(outputdir + '/sync.html', syncHTML.format(''.join(rows)))
    print('{}\nDone with {} venues, {} papers.'.format(
        C.purple('=' * 42), C.red(len(sleigh.venues)),
        C.red(sleigh.numOfPapers())))
def report(s, r):
    """Print ``simpleLatin(s)`` with the status for code ``r``; return ``r``.

    Codes map to labels as 0 -> PASS, 1 -> FAIL, 2 -> FIXD.
    """
    labels = (C.blue('PASS'), C.red('FAIL'), C.yellow('FIXD'))
    # Quiet by default: passes are printed only when verbose is on.
    if not verbose and r == 0:
        return r
    line = '[ {} ] {}'.format(labels[r], simpleLatin(s))
    print(line)
    return r
C.purple('='*42))) bundles = {} for b in glob.glob(ienputdir + '/bundles/*.json'): purename = b.split('/')[-1][:-5] bun = json.load(open(b, 'r')) prevcx = pcx uberlist = '<h2>{1} papers</h2>{0}'.format(processSortedRel(bun['contents']), pcx-prevcx) f = open(outputdir + '/bundle/' + purename + '.html', 'w') f.write(bunHTML.format(\ title=purename+' bundle', bundle=bun['name'], ebundle=escape(purename), dl=uberlist.replace('href="', 'href="../').replace('../mailto', 'mailto'))) f.close() bundles[purename] = pcx-prevcx print('Bundle pages:', C.yellow('{}'.format(len(bundles))), C.blue('generated')) # now for the index f = open(outputdir+'/bundle/index.html', 'w') lst = ['<li><a href="{}.html">{}</a> ({})</li>'.format(\ escape(b), b, bundles[b]) for b in sorted(bundles.keys())] ul = '<ul class="tri">' + '\n'.join(lst) + '</ul>' f.write(bunListHTML.format(\ title='All specified bundles', listname='{} bundles known with {} papers'.format(len(bundles), sum(bundles.values())), ul='<ul class="tri">' + '\n'.join(lst) + '</ul>')) f.close() print('Bundle index:', C.blue('created')) print('{}\nDone with {} venues, {} papers.'.format(\ C.purple('='*42),
+ ' \n'.join(['<span class="tag"><a href="../word/{0}.html">{0}</a></span> ({1})'.format(S, stems[S]) \ for S in stemkeys[:10]]) boxlinks += adds # combine boxlinks if boxlinks: boxlinks = '<div class="tbox">' + boxlinks + '</div>' f.write(personHTML.format(\ title=k, gender=gender, boxlinks=boxlinks, eperson=escape(k), person=persondef['name'], # boxlinks=links namedlists=dls)) f.close() print('Person pages:', C.yellow('{}'.format(len(ps))), C.blue('generated')) # person index # keyz = [k for k in ps.keys() if len(ts[k]) > 2] # keyz = sorted(keyz, key=lambda t:len(ts[t]), reverse=True) keyz = ps#sorted(ps.keys()) letters = [chr(x) for x in range(ord('a'), ord('z')+1)] indices = {x:[] for x in letters} for t in keyz: ws = t.split('_') i = -1 if ws[i] == 'Jr': i -= 1 letter = ws[i][0].lower() if not letter.isalpha(): print(C.red('ERROR')+':', 'wrong name', t) letter = ws[i-1][0].lower()
def report(fn1, fn2, r):
    """Print the ``fn1`` → ``fn2`` pair with the status for ``r``; return ``r``.

    Codes map to labels as 0 -> PASS, 1 -> FAIL, 2 -> RENAME.
    """
    labels = (C.blue(' PASS '), C.red(' FAIL '), C.yellow('RENAME'))
    # Quiet by default: passes are printed only when verbose is on.
    if not verbose and r == 0:
        return r
    line = '[ {} ] {} → {}'.format(labels[r], fn1, fn2)
    print(line)
    return r
+ ' \n'.join(['<span class="tag"><a href="../word/{0}.html">{0}</a></span> ({1})'.format(S, stems[S]) \ for S in stemkeys[:10]]) boxlinks += adds # combine boxlinks if boxlinks: boxlinks = '<div class="tbox">' + boxlinks + '</div>' f.write(personHTML.format(\ title=k, gender=gender, boxlinks=boxlinks, eperson=escape(k), person=persondef['name'], # boxlinks=links namedlists=dls)) f.close() print('Person pages:', C.yellow('{}'.format(len(ps))), C.blue('generated')) # person index # keyz = [k for k in ps.keys() if len(ts[k]) > 2] # keyz = sorted(keyz, key=lambda t:len(ts[t]), reverse=True) keyz = ps #sorted(ps.keys()) letters = [chr(x) for x in range(ord('a'), ord('z') + 1)] indices = {x: [] for x in letters} for t in keyz: ws = t.split('_') i = -1 if ws[i] == 'Jr': i -= 1 letter = ws[i][0].lower() if not letter.isalpha(): print(C.red('ERROR') + ':', 'wrong name', t) letter = ws[i - 1][0].lower()
paperAuths = paperAuths[:-1] paperAuths.extend(auths) paperLnk = li.get('id') hope = li.find_all('a') if hope and hope[0].get('href').endswith('.pdf'): paperPdf = urlstart + hope[0].get('href') else: paperPdf = '' paperEntry = {'type': 'inproceedings', 'series': 'CEUR Workshop Proceedings',\ 'publisher': 'CEUR-WS.org', 'year': volYear, 'booktitle': volTitles[-1],\ 'editor': volEds, 'volume': volNr.split('-')[-1], 'title': paperTitle,\ 'author': paperAuths, 'pages': paperPages, 'venue': volVenue} if paperPdf: paperEntry['openpdf'] = paperPdf if paperLnk: paperEntry['url'] = urlstart + '#' + paperLnk paperFilename = outputdir.split('/')[-1] + '-' + paperAuths[0].split(' ')[-1] for a in paperAuths[1:]: paperFilename += a.split(' ')[-1][0] if paperFilename in done: paperFilename += 'a' while paperFilename in done: paperFilename = paperFilename[:-1] + chr(ord(paperFilename[-1])+1) # print(jsonify(paperEntry), '-->', outputdir+'/'+paperFilename+'.json') f = open(outputdir+'/'+paperFilename+'.json', 'w') f.write(jsonify(paperEntry)) f.close() cx += 1 done.append(paperFilename) print(C.red(volVenue), '-', C.yellow(volTitles[-1]), '-', C.blue(cx), 'papers.')
'</em>') if 'namelong' in tagdef.keys() else '' links = '<strong>{}</strong>{}<hr/>'.format(title, subt) + '\n'.join( sorted(links)) dl = '<dl class="toc">' + '\n'.join(lst) + '</dl>' # hack to get from tags to papers dl = dl.replace('href="', 'href="../') f.write(tagHTML.format(\ title=key+' tag', etag=escape(key), tag=key, above='', boxlinks=links, listname='{} papers'.format(len(lst)), dl=dl)) f.close() print('Tag pages:', C.yellow('{}'.format(len(ts))), C.blue('generated')) # tag index f = open(outputdir + '/tag/index.html', 'w', encoding='utf-8') keyz = [q for q in ts.keys() if len(ts[q]) > 2] keyz.sort(key=lambda t: len(ts[t]), reverse=True) lst = [ '<li>#<a href="{}.html">{}</a> ({})</li>'.format( escape(t), t, len(ts[t])) for t in keyz ] ul = '<ul class="tri mul">' + '\n'.join(lst) + '</ul>' CX = sum([len(ts[t]) for t in ts.keys()]) f.write(taglistHTML.format(\ title='All known tags', listname='{} tags known from {} markings'.format(len(ts), CX), ul=ul)) f.close()
def report(s, r):
    """Print ``s`` with the status label for code ``r`` and return ``r``.

    Codes map to labels as 0 -> PASS, 1 -> FAIL, 2 -> FIXD.
    """
    labels = (C.blue('PASS'), C.red('FAIL'), C.yellow('FIXD'))
    # Quiet by default: passes are printed only when verbose is on.
    if not verbose and r == 0:
        return r
    line = '[ {} ] {}'.format(labels[r], s)
    print(line)
    return r
if __name__ == "__main__":
    # Script entry point: check every paper and brand, then store the list
    # of approved stems in stems.json and print a summary.
    import json
    verbose = sys.argv[-1] == '-v'
    peoplez = glob.glob(ienputdir + '/people/*.json')
    print('{}: {} venues, {} papers by {} people\n{}'.format(
        C.purple('BibSLEIGH'),
        C.red(len(sleigh.venues)),
        C.red(sleigh.numOfPapers()),
        C.red(len(peoplez)),
        C.purple('=' * 42)))
    # Tally of result codes: 0 = ok, 1 = failed, 2 = fixed.
    cx = {0: 0, 1: 0, 2: 0}
    # stem ALL the papers!
    for v in sleigh.venues:
        for c in v.getConfs():
            for p in c.papers:
                cx[checkreport(p.filename, p, None)] += 1
        for b in v.getBrands():
            cx[checkreport(b.filename, None, b)] += 1
    # write all stems
    listOfStems = sorted(filter(ifApproved, ALLSTEMS), key=lambda w: two(len(w)) + w)
    # json.dumps with a tab indent yields the same one-stem-per-line layout
    # as the former hand-built string, but also escapes special characters
    # and emits a valid empty list when there are no stems.
    with open(ienputdir + '/stems.json', 'w', encoding='utf-8') as f:
        f.write(json.dumps(listOfStems, indent='\t', ensure_ascii=False))
    print(C.red(len(ALLSTEMS)), 'stems found.')
    print('{} files checked, {} ok, {} fixed, {} failed'.format(
        C.bold(cx[0] + cx[1] + cx[2]),
        C.blue(cx[0]),
        C.yellow(cx[2]),
        C.red(cx[1])))
return r if __name__ == "__main__": verbose = sys.argv[-1] == '-v' print('{}: {} venues, {} papers\n{}'.format(\ C.purple('BibSLEIGH'), C.red(len(sleigh.venues)), C.red(sleigh.numOfPapers()), C.purple('='*42))) aka = parseJSON(ienputdir + '/aliases.json') CX = sum([len(aka[a]) for a in aka]) # self-adaptation heuristic: # if a manual rule does the same as the other heuristic, it’s dumb for a in sorted(aka.keys()): if len(aka[a]) == 1 and aka[a][0] in (nodiaLatin(a), simpleLatin(a)): print('[ {} ]'.format(C.blue('DUMB')), a, 'aliasing was unnecessary manual work') elif len(aka[a]) == 2 and (aka[a] == [nodiaLatin(a), simpleLatin(a)] \ or aka[a] == [simpleLatin(a), nodiaLatin(a)]): print('[ {} ]'.format(C.blue('DUMB')), a, 'aliasing was a lot of unnecessary manual work') elif nodiaLatin(a) in aka[a] or simpleLatin(a) in aka[a]: print('[ {} ]'.format(C.blue('DUMB')), a, 'aliasing contains some unnecessary manual work') # auto-aliasing heuristic: # for each author with diacritics, its non-diacritic twin is considered harmful people = set() for v in sleigh.venues: for c in v.getConfs(): if 'editor' in c.json: people.update(listify(c.json['editor'])) for p in c.papers: if 'author' in p.json: people.update(listify(p.json['author']))