def __init__(self, idir, name2file):
    super(Sleigh, self).__init__('', idir)
    self.venues = []
    self.n2f = name2file
    jsons = {}
    skip4Now = []
    for d in glob.glob(idir + '/*.json'):
        if lastSlash(d).split('.')[0] in skip4Now:
            print(C.red('Skipping') + ' ' + C.purple(d) + ' ' + C.red('for now'))
            continue
        jsons[lastSlash(d).split('.')[0]] = d
    for d in glob.glob(idir + '/*'):
        cont = False
        for end in ('.md', '.json', '/frem', '/edif'):
            if d.endswith(end):
                cont = True
        if d.split('/')[-1] in skip4Now:
            print(C.red('Skipping') + ' ' + C.purple(d) + ' ' + C.red('for now'))
            cont = True
        if cont:
            continue
        if lastSlash(d) not in jsons.keys():
            print(C.red('Legacy non-top definition of'), d)
            if lastSlash(d) not in ('edif', 'frem'):
                self.venues.append(Venue(d, idir, name2file, self))
        else:
            self.venues.append(Venue(d, idir, name2file, self))
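The `lastSlash` helper used above is not defined in this section; a minimal sketch of its assumed behaviour (the path component after the final slash), inferred from the call sites:

def lastSlash(path):
    # assumed: '../json/corpus/ICSE.json' -> 'ICSE.json'
    return path.split('/')[-1]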
def __init__(self, d, hdir, name2file, parent):
    super(Venue, self).__init__(d, hdir)
    self.years = []
    self.brands = []
    self.n2f = name2file
    if os.path.exists(d + '.json'):
        # new style
        # print(C.blue(d), 'is new style')
        self.json = parseJSON(d + '.json')
    else:
        # legacy style
        print(C.red(d), 'is legacy style')
        self.json = {}
        for f in glob.glob(d + '/*.json'):
            if not self.json:
                self.json = parseJSON(f)
            else:
                self.brands.append(Brand(f, self.homedir, name2file, self))
    for f in glob.glob(d + '/*'):
        if f.endswith('.json'):
            # already processed
            continue
        elif os.path.isdir(f):
            y = Year(f, self.homedir, name2file, self)
            self.years.append(y)
            for b in self.brands:
                for c in y.confs:
                    b.offer(y.year, c)
        else:
            print('File out of place:', f)
    self.back = parent
def checkreport(fn, o):
    statuses = (C.blue('PASS'), C.red('FAIL'), C.yellow('WARN'))
    r, msg = checkon(fn, o)
    # non-verbose mode by default
    if verbose or r != 0:
        print('[ {} ] {}: {}'.format(statuses[r], fn, msg))
    return r
def report(fn, r):
    statuses = (C.blue('PASS'), C.red('FAIL'), C.yellow('UNEX'))
    special = ('', '- no crossref found!', '- illegal crossref')
    # non-verbose mode by default
    if verbose or r != 0:
        print('[ {} ] {} {}'.format(statuses[r], fn, special[r]))
    return r
def processSortedRel(r):
    # [ {"x" : Y } ] where Y can be a string or a sorted rel
    global pcx
    acc = []
    for el in r:
        ename = list(el.keys())[0]
        evals = el[ename]
        if os.path.isfile(outputdir + '/stuff/' + ename.lower() + '.png'):
            img = '<img src="../stuff/{1}.png" alt="{0}" width="30px"/> '.format(ename, ename.lower())
        else:
            img = ''
        if isinstance(evals, str):
            plst = sorted(matchfromsleigh(sleigh, evals), key=sortbypages)
            pcx += len(plst)
            ptxt = '<dl class="toc">' + '\n'.join([p.getItem() for p in plst]) + '</dl>'
        elif isinstance(evals, list) and isinstance(evals[0], str):
            plst = sorted(matchfromsleigh(sleigh, evals), key=sortbypages)
            pcx += len(plst)
            ptxt = '<dl class="toc">' + '\n'.join([p.getItem() for p in plst]) + '</dl>'
        elif isinstance(evals, list) and isinstance(evals[0], dict):
            ptxt = processSortedRel(evals)
        else:
            print(C.red('ERROR:'), 'unrecognised bundle structure', evals)
        acc.append('<dl><dt>{}{}</dt><dd>{}</dl>'.format(img, ename, ptxt))
    return '\n'.join(acc)
def checkon(fn, o):
    if not os.path.exists(fn) or os.path.isdir(fn):
        fn = fn + '.json'
    if not os.path.exists(fn):
        # if it still does not exist, let us create a minimal one
        f = open(fn, 'w', encoding='utf-8')
        f.write('{{\n\t"title": "{name}",\n\t"type": "proceedings",\n\t"year": {year}\n}}'.format(
            name=lastSlash(fn)[:-5].replace('-', ' '),
            year=findYear(lastSlash(fn))))
        f.close()
        print('[ {} ] {}'.format(C.yellow('MADE'), fn))
        return 2
    f = open(fn, 'r', encoding='utf-8')
    lines = f.readlines()[1:-1]
    f.close()
    for line in lines:
        if line.find('"year"') > -1 and findYear(line) > 3000:
            os.remove(fn)
            print('[ {} ] {}'.format(C.red('KILL'), fn))
            return 1
    flines = sorted([strictstrip(s) for s in lines])
    plines = sorted([strictstrip(s) for s in o.getJSON().split('\n')[1:-1]])
    if flines != plines:
        f1 = [line for line in flines if line not in plines]
        f2 = [line for line in plines if line not in flines]
        print('∆:', f1, '\nvs', f2)
    if flines == plines:
        return 0
    else:
        return 1
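`findYear` is also external to this section; a hedged sketch consistent with its uses above (pulling a four-digit year out of a filename, and the `> 3000` sanity check):

import re

def findYear(s):
    # assumed behaviour: first four-digit run in the string, e.g. 'ICSE-2015' -> 2015;
    # returns 0 when nothing matches, which keeps the '> 3000' check false
    m = re.search(r'\d{4}', s)
    return int(m.group()) if m else 0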
def checkreport(m, o):
    statuses = (C.blue('PASS'), C.red('FAIL'), C.yellow('FIXD'))
    r = checkon(m, o)
    # non-verbose mode by default
    if verbose or r != 0:
        print('[ {} ] {}'.format(statuses[r], o.filename))
    return r
def checkreport(fn, o):
    statuses = (C.blue('PASS'), C.red('FAIL'), C.yellow('FIXD'))
    r = checkon(fn, o)
    # non-verbose mode by default
    if verbose or r != 0:
        report(statuses[r], fn)
    return r
def checkreport(fn, o):
    statuses = (C.blue('PASS'), C.red('FAIL'), C.yellow('FIXD'))
    r = checkon(fn, o)
    # non-verbose mode by default
    if verbose or r != 0:
        print('[ {} ] {}'.format(statuses[r], fn))
    return r
def parseJSON(fn):
    # print('Parsing', fn, '...')
    try:
        j = json.load(open(fn, 'r', encoding='utf-8'))
        j['FILE'] = fn
        return j
    except ValueError:
        print(C.red('JSON parse error'), 'on', fn.replace('\\', '/'))
        return {}
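A minimal usage sketch for parseJSON (the path is hypothetical); the injected 'FILE' key is what later code uses to report where an entry came from:

entry = parseJSON('../json/corpus/ICSE/ICSE-2015.json')  # hypothetical path
if entry:
    print(entry['FILE'], entry.get('title', '<untitled>'))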
def checkreport(fn, o):
    statuses = (C.blue('PASS'), C.red('FAIL'), C.yellow('FIXD'))
    if isinstance(o, int):
        r = o
    else:
        r = checkon(fn, o)
    # non-verbose mode by default
    if verbose or r != 0:
        print('[ {} ] {}'.format(statuses[r], fn))
    return r
def checkreport(fn, o, br):
    statuses = (C.blue('PASS'), C.red('FAIL'), C.yellow('FIXD'))
    if br:
        r = checkbrand(fn, br)
    else:
        r = checkon(fn, o)
    # non-verbose mode by default
    if verbose or r != 0:
        print('[ {} ] {}'.format(statuses[r], fn))
    return r
def checkon(m, o):
    # if no common model found, we failed
    if not m:
        return 1
    if 'type' in m.keys() and m['type'] in ('inproceedings', 'article'):
        m['type'] = 'proceedings'
    if 'type' in m.keys() and m['type'] == 'incollection':
        m['type'] = 'book'
    if 'crossref' in m.keys():
        del m['crossref']
    if 'booktitle' in m.keys():
        m['title'] = m['booktitle']
        del m['booktitle']
    if 'booktitleshort' in m.keys():
        # TODO: ???
        del m['booktitleshort']
    r = 0
    n = {}
    for k in m.keys():
        if o.get(k) == m[k]:
            if verbose:
                print(C.blue('Confirmed: '), k, 'as', m[k])
        else:
            if verbose:
                print(C.red('Conflicted: '), k, 'as', m[k], 'vs', o.get(k))
            v = heurichoose(k, m[k], o.json[k]) if k in o.json.keys() else m[k]
            if verbose:
                print(C.yellow('Settled for:'), v)
            n[k] = v
            r = 2
    if r == 0:
        return r
    if r == 2 and not n:
        # nothing to fix?!
        return 0
    if not os.path.exists(o.filename):
        return 0
    if os.path.isdir(o.filename):
        fn = o.filename + '.json'
    else:
        fn = o.filename
    if os.path.exists(fn):
        f = open(fn, 'r', encoding='utf-8')
        lines = f.read()
        f.close()
        if lines != o.getJSON():
            # strange, should be equal (run all normalisers first!)
            return 1
    for k in n.keys():
        o.json[k] = n[k]
    f = open(fn, 'w', encoding='utf-8')
    f.write(o.getJSON())
    f.close()
    return 2
def guessYear(p):
    cys = [int(w) for w in p.split('-') if len(w) == 4 and w.isdigit()]
    if len(cys) == 1:
        return cys[0]
    else:
        j = sleigh.seekByKey(p)
        if 'year' in j.json.keys():
            return j.get('year')
        elif 'year' in dir(j):
            return j.year
        else:
            print('[ {} ] {}'.format(C.red('YEAR'), p))
            return 0
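Illustrative calls with hypothetical keys; the first succeeds on the fast path, the second falls back to the corpus lookup:

guessYear('ICSE-2015')   # -> 2015, parsed directly from the key
guessYear('GTTSE-part1') # -> no four-digit token, so sleigh.seekByKey(...) decides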
def sortbypages(z):
    if 'pages' not in z.json.keys():
        print(C.red('No pages at all in ' + z.getKey()))
        return 0
    p1, _ = z.getPagesTuple()
    y = z.get('year')
    if isinstance(y, str):
        # non-correcting robustness
        return 0
    # a trick to have several volumes within one conference
    v = z.get('volume')
    if isinstance(v, int) or v.isdigit():
        y += int(v)
    return y + p1 / 10000. if p1 else y
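A worked example of the sort key, with hypothetical values: an entry with year 2015, volume '2' and first page 17 sorts by year (plus the volume offset) first and by starting page within it:

# y = 2015, v = '2', p1 = 17
# 2015 + int('2') + 17 / 10000. == 2017.0017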
def checkon(fn, o):
    if not os.path.exists(fn) or os.path.isdir(fn):
        fn = fn + '.json'
    if 'title' not in o.json.keys():
        if verbose:
            print('No title in', o.getKey())
        return 1  # no title
    # check for a different language - to avoid stemming altogether
    if o.tags and ('german' in o.tags or 'french' in o.tags or 'portuguese' in o.tags):
        if 'stemmed' in o.json.keys():
            # if stemmed before marked foreign, remove this info
            del o.json['stemmed']
            F = open(fn, 'w', encoding='utf-8')
            F.write(o.getJSON())
            F.close()
            return 2
        else:
            return 0
    changed = False
    ### champion variant: snowballstemmer - runs in ~13.5s for 96027 titles
    stemmer = snowballstemmer.stemmer('english').stemWords
    ### disregarded variant: snowballstemmer porter - considered outdated
    # stemmer = snowballstemmer.stemmer('porter').stemWords
    ### disregarded variant: stemming - too slow, runs in ~33s for 96027 titles
    # stemmer = lambda xs: [stemming.porter2.stem(x) for x in xs]
    ### disregarded variant: nltk - worse on verbs ending with -ze
    # stemmer3 = lambda xs: [SnowballStemmer("english").stem(x) for x in xs]
    ### end variants
    stemmed = stemmer(string2words(o.get('title')))
    if '' in stemmed:
        print('“{}” is a title of {} and it has an empty word'.format(
            o.get('title'), C.red(o.getKey())))
        print(string2words(o.get('title')))
        print(stemmer(string2words(o.get('title'))))
    ALLSTEMS.update(stemmed)
    if o.get('stemmed') != stemmed:
        o.json['stemmed'] = stemmed
        changed = True
    if changed:
        F = open(fn, 'w', encoding='utf-8')
        F.write(o.getJSON())
        F.close()
        return 2
    else:
        return 0
def dblpify(s):
    # http://dblp.uni-trier.de/pers/hd/e/Elbaum:Sebastian_G=
    if s in dis.keys():
        return dis[s]
    if s.find(' ') < 0:
        print('[', C.red('NAME'), ']', 'Unconventional full name:', s)
        cx[1] += 1
        return dblpLatin(s) + ':'
    ws = s.split(' ')
    i = -1
    if ws[i] in ('Jr', 'Jr.'):
        i -= 1
    sur = dblpLatin(' '.join(ws[i:]))
    rest = dblpLatin(' '.join(ws[:i])).replace(' ', '_')
    for c in ".'-":
        rest = rest.replace(c, '=')
    return sur + ':' + rest
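Example inputs and outputs in the dblp key format referenced in the comment above, assuming dblpLatin is the identity on plain ASCII:

# dblpify('Sebastian G. Elbaum') == 'Elbaum:Sebastian_G='   ('.' becomes '=')
# dblpify('Rainer Koschke')      == 'Koschke:Rainer'
# dblpify('Cher')                == 'Cher:'                  (single-word name)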
def seekByKey(self, key):
    f = None
    # trying a shortcut
    hv = key.split('-')[0]
    for v in self.venues:
        if v.getKey() == hv:
            # print('\tShortcut to', hv)
            f = v.seekByKey(key)
            if f:
                return f
            # else:
            #     print('\t', C.red('...failed'))
    # bruteforce search
    # print('\tBrute force searching for', key)
    for v in self.venues:
        f = v.seekByKey(key)
        if f:
            return f
    print(C.red(key), ' not found in BibSLEIGH!')
    return f
def main():
    print('{}: {} venues, {} papers\n{}'.format(
        C.purple('BibSLEIGH'),
        C.red(len(sleigh.venues)),
        C.red(sleigh.numOfPapers()),
        C.purple('=' * 42)))
    # generate the index
    f = open(outputdir + '/index.html', 'w', encoding='utf-8')
    f.write(sleigh.getPage())
    f.close()
    # generate all individual pages
    # if False:
    for v in sleigh.venues:
        r = C.blue(v.getKey())
        f = open(outputdir + '/' + v.getKey() + '.html', 'w', encoding='utf-8')
        f.write(v.getPage())
        f.close()
        if v.brands:
            r += '{' + '+'.join([C.blue(b.getKey()) for b in v.brands]) + '}'
            for b in v.brands:
                f = open(outputdir + '/' + b.getKey() + '.brand.html', 'w', encoding='utf-8')
                f.write(b.getPage())
                f.close()
        r += ' => '
        for c in v.getConfs():
            f = open(outputdir + '/' + c.getKey() + '.html', 'w', encoding='utf-8')
            f.write(c.getPage())
            f.close()
            for p in c.papers:
                f = open(outputdir + '/' + p.getKey() + '.html', 'w', encoding='utf-8')
                f.write(p.getPage())
                f.close()
            purekey = c.getKey().replace(v.getKey(), '').replace('-', ' ').strip()
            r += '{} [{}], '.format(purekey, C.yellow(len(c.papers)))
        print(r)
    # generate the icon lineup
    icons = []
    linked = []
    pngs = [lastSlash(png).split('.')[0] for png in glob.glob(outputdir + '/stuff/*.png')]
    pngs = [png for png in pngs
            if not (png.startswith('a-')
                    or png.startswith('p-')
                    or png.startswith('ico-')
                    or png in ('cc-by', 'xhtml', 'css', 'open-knowledge', 'edit'))]
    for brand in glob.glob(outputdir + '/*.brand.html'):
        pure = lastSlash(brand).split('.')[0]
        img = pure.lower().replace(' ', '')
        if img in pngs:
            pic = '<div class="wider"><a href="{0}.brand.html"><img class="abc" src="{1}" alt="{0}"/></a><span>{0}</span></div>'.format(
                pure, 'stuff/' + img + '.png')
            pngs.remove(img)
            icons.append(pic)
        else:
            # print('No image for', pure)
            pass
    corner = {'ada': 'TRI-Ada', 'comparch': 'CompArch', 'floc': 'FLoC', 'bibsleigh': 'index'}
    for pure in pngs:
        venueCandidate = corner[pure] if pure in corner else pure.upper()
        canlink = sorted(glob.glob(outputdir + '/' + venueCandidate + '*.html'), key=len)
        if canlink:
            pic = '<div class="wider"><a href="{0}"><img class="abc" src="stuff/{1}.png" alt="{2}"/></a><span>{2}</span></div>'.format(
                canlink[0].split('/')[-1], pure, venueCandidate, canlink[0].split('/')[0])
        elif pure == 'twitter':
            pic = '<div class="wider"><a href="https://about.twitter.com/company/brand-assets"><img class="abc" src="stuff/twitter.png" alt="Twitter"/></a><span>Twitter</span></div>'
        elif pure == 'email':
            pic = '<div class="wider"><a href="mailto:[email protected]"><img class="abc" src="stuff/email.png" alt="e-mail"/></a><span>email</span></div>'
        else:
            print('Lonely', pure)
            pic = '<img class="abc" src="stuff/{0}.png" alt="{0}"/>'.format(pure)
        icons.append(pic)
    # find last year of each venue
    # for ven in glob.glob(corpusdir + '/*'):
    #     venname = lastSlash(ven)
    #     newstuff += '<strong><a href="http://dblp.uni-trier.de/db/conf/{}/">{} {}</a></strong>, '.format(venname.lower(), venname, nextYear(ven))
    #     print(lastSlash(ven), ':', lastYear(ven))
    # write "more info" file
    f = open(outputdir + '/about.html', 'w', encoding='utf-8')
    f.write(aboutHTML.format(
        len(icons),
        '<div class="minibar">' + '\n'.join(sorted(icons)) + '</div>'))
    f.close()
    # generate the DBLP sync page
    cell_by_conf_by_year = {}
    Ys = [2020, 2019, 2018, 2017, 2016, 2015, 2014, 2013, 2012, 2011, 2010, 2009]
    dblplinks = {}
    with open(ienputdir + '/meta/dblpguide.sync', 'r') as f:
        for line in f:
            if not line or line.startswith('#'):
                continue
            words = line.split('|')
            if len(words) != 3:
                print('- Metaline {} skipped!'.format(words))
                continue
            name = words[0].strip()
            dome = words[1].strip()
            dblp = words[2].strip()
            cell_by_conf_by_year[name] = {}
            dblplinks[name] = dblp
            for y in Ys:
                cell_by_conf_by_year[name][y] = '(no)'
            v = sleigh.getVenue(dome)
            if v:
                for yy in Ys:
                    y = v.getYear(yy)
                    if y:
                        ckey = '{}-{}'.format(name, yy)
                        c = y.getConf(ckey)
                        if c:
                            cell_by_conf_by_year[name][yy] = c.getIconItem2('', '')
                        else:
                            # print('- Conference {} of year {} in venue {} not found in the corpus'.format(ckey, yy, name))
                            for alt in 'v1', 'p1', 'c1', '1', 'J':
                                ckey = '{}-{}-{}'.format(name, alt, yy)
                                c = y.getConf(ckey)
                                if c:
                                    cell_by_conf_by_year[name][yy] = c.getIconItem2('', '')
                                    break
                    # else:
                    #     print('- Year {} in venue {} not found in the corpus among {}'.format(yy, name, [z.year for z in v.years]))
            # else:
            #     print('- Venue {} not found in the corpus'.format(name))
    table = '<table>'
    table += '<tr><td></td>'
    for y in Ys:
        table += '<th>{}</th>\n'.format(y)
    table += '</tr>'
    # print(cell_by_conf_by_year)
    for name in sorted(cell_by_conf_by_year.keys()):
        table += '<tr><th><a href="{}.brand.html">[@]</a> <a href="{}">{}</a></th>'.format(
            name, dblplinks[name], name)
        for y in Ys:
            table += '<td>{}</td>\n'.format(cell_by_conf_by_year[name][y])
        table += '</tr>'
    table += '</table>'
    with open(outputdir + '/sync.html', 'w', encoding='utf-8') as f:
        f.write(syncHTML.format(table))
    print('{}\nDone with {} venues, {} papers.'.format(
        C.purple('=' * 42),
        C.red(len(sleigh.venues)),
        C.red(sleigh.numOfPapers())))
def checkon(fn, o):
    if os.path.isdir(fn):
        fn = fn + '.json'
    f = open(fn, 'r', encoding='utf-8')
    lines = f.readlines()[1:-1]
    f.close()
    flines = [strictstrip(s) for s in lines]
    plines = sorted([strictstrip(s) for s in o.getJSON().split('\n')[1:-1]])
    if flines != plines:
        return 1
    ts = []
    # precise case-sensitive match
    mcs = o.get('title')
    # precise match for substrings
    mes = baretext(mcs)
    # precise match for words
    mew = mes.split(' ')
    # imprecise match for substrings
    mis = superbaretext(mes)
    # imprecise match for words
    miw = mis.split(' ')
    # now match!
    for t in tags:
        # print('Check', t, 'vs', mes)
        if 'name' not in t.keys():
            print(C.red('ERROR:'), 'no name for tag from file', t['FILE'])
            continue
        if all([not k.startswith('match') for k in t.keys()]):
            print(C.red('ERROR:'), 'no match rules for tag', t['name'])
            continue
        for k in t.keys():
            if k == 'matchentry':
                if o.getKey() in t[k]:
                    ts += [t['name']]
            elif k.startswith('match'):
                ts += [t['name'] for s in listify(t[k]) if matchModes[k](s, mcs, mes, mew, mis, miw)]
                # ts += [t['name'] for s in listify(t[k]) if fmm(t, k, s, mcs, mes, mew, mis, miw)]
    # second pass: check reliefs
    for t in tags:
        if 'relieves' in t.keys():
            for r in listify(t['relieves']):
                if t['name'] in ts and r in ts:
                    ts.remove(r)
                    if t['name'] not in relieved.keys():
                        relieved[t['name']] = 0
                    relieved[t['name']] += 1
    if ts:
        if not o.tags:
            o.tags = []
        for t in ts:
            if t not in o.tags:
                o.tags.append(t)
        # uncomment the following one line to overwrite all tags
        o.tags = uniq(ts)
        # let’s keep tags clean and sorted
        o.tags = sorted(o.tags)
    nlines = sorted([strictstrip(s) for s in o.getJSON().split('\n')[1:-1]])
    if plines != nlines:
        f = open(fn, 'w', encoding='utf-8')
        f.write(o.getJSON())
        f.close()
        return 2
    else:
        return 0
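matchModes is referenced above but defined elsewhere; a hedged sketch of its assumed shape, mapping rule names to predicates over the precomputed title forms (the rule names and exact semantics here are guesses, not the project's actual table):

matchModes = {
    # s is the pattern; mcs/mes/mew/mis/miw are the title forms computed above
    'match':     lambda s, mcs, mes, mew, mis, miw: s in mcs,  # case-sensitive substring
    'matchds':   lambda s, mcs, mes, mew, mis, miw: s in mes,  # bare-text substring
    'matchword': lambda s, mcs, mes, mew, mis, miw: s in mew,  # exact bare word
}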
paperAuths = paperAuths[:-1]
paperAuths.extend(auths)
paperLnk = li.get('id')
hope = li.find_all('a')
if hope and hope[0].get('href').endswith('.pdf'):
    paperPdf = urlstart + hope[0].get('href')
else:
    paperPdf = ''
paperEntry = {'type': 'inproceedings', 'series': 'CEUR Workshop Proceedings',
              'publisher': 'CEUR-WS.org', 'year': volYear, 'booktitle': volTitles[-1],
              'editor': volEds, 'volume': volNr.split('-')[-1], 'title': paperTitle,
              'author': paperAuths, 'pages': paperPages, 'venue': volVenue}
if paperPdf:
    paperEntry['openpdf'] = paperPdf
if paperLnk:
    paperEntry['url'] = urlstart + '#' + paperLnk
paperFilename = outputdir.split('/')[-1] + '-' + paperAuths[0].split(' ')[-1]
for a in paperAuths[1:]:
    paperFilename += a.split(' ')[-1][0]
if paperFilename in done:
    paperFilename += 'a'
while paperFilename in done:
    paperFilename = paperFilename[:-1] + chr(ord(paperFilename[-1]) + 1)
# print(jsonify(paperEntry), '-->', outputdir+'/'+paperFilename+'.json')
f = open(outputdir + '/' + paperFilename + '.json', 'w')
f.write(jsonify(paperEntry))
f.close()
cx += 1
done.append(paperFilename)
print(C.red(volVenue), '-', C.yellow(volTitles[-1]), '-', C.blue(cx), 'papers.')
else:
    name = sys.argv[1]
path = ienputdir + '/corpus/' + name
namem = name.split('/')[-1]
cx = {0: 0, 1: 0, 2: 0}
if not os.path.exists(path):
    report(name, name, 1)
    sys.exit(1)
# for all papers...
for fn in glob.glob(path + '/*.json'):
    pureold = fn.split(namem + '/')[1]
    if pureold.endswith('.json'):
        pureold = pureold[:-5]
    purenew = pureold
    if purenew[-2:] == namem[-2:]:
        purenew = purenew[:-2]
    if pureold == purenew:
        cx[report(pureold, purenew, 0)] += 1
    elif not os.path.exists(ienputdir + '/corpus/' + name + '/' + pureold + '.json') \
            and os.path.exists(ienputdir + '/corpus/' + name + '/' + purenew + '.json'):
        cx[report(pureold, purenew, 1)] += 1
    else:
        cx[report(pureold, purenew, 2)] += 1
        os.rename(ienputdir + '/corpus/' + name + '/' + pureold + '.json',
                  ienputdir + '/corpus/' + name + '/' + purenew + '.json')
print('{} files checked, {} ok, {} fixed, {} failed'.format(
    C.bold(cx[0] + cx[1] + cx[2]), C.blue(cx[0]), C.yellow(cx[2]), C.red(cx[1])))
    statuses = (C.blue('PASS'), C.red('FAIL'), C.yellow('FIXD'))
    if isinstance(o, int):
        r = o
    else:
        r = checkon(fn, o)
    # non-verbose mode by default
    if verbose or r != 0:
        print('[ {} ] {}'.format(statuses[r], fn))
    return r

if __name__ == "__main__":
    verbose = sys.argv[-1] == '-v'
    peoplez = glob.glob(ienputdir + '/people/*.json')
    print('{}: {} venues, {} papers by {} people\n{}'.format(
        C.purple('BibSLEIGH'),
        C.red(len(sleigh.venues)),
        C.red(sleigh.numOfPapers()),
        C.red(len(peoplez)),
        C.purple('=' * 42)))
    cx = {0: 0, 1: 0, 2: 0}
    for v in sleigh.venues:
        for c in v.getConfs():
            for p in c.papers:
                cx[checkreport(p.filename, p)] += 1
    print('{} files checked, {} ok, {} fixed, {} failed'.format(
        C.bold(cx[0] + cx[1] + cx[2]), C.blue(cx[0]), C.yellow(cx[2]), C.red(cx[1])))
    print(C.red('{} files to check manually!'.format(len(warnings))))
    print('subl ', ' '.join(warnings))
def sdistance(x1, x2):
    return str(distance(x1, x2)).replace('.', ',')

def distance(x1, x2):
    return sqrt(sum([(x1[jj] - x2[jj]) ** 2 for jj in range(0, len(x1))]))

# NB: some clustering/visualisation code based on http://brandonrose.org/clustering
if __name__ == "__main__":
    verbose = sys.argv[-1] == '-v'
    peoplez = glob.glob(ienputdir + '/people/*.json')
    print('{}: {} venues, {} papers by {} people\n{}'.format(
        C.purple('BibSLEIGH'),
        C.red(len(sleigh.venues)),
        C.red(sleigh.numOfPapers()),
        C.red(len(peoplez)),
        C.purple('=' * 42)))
    cx = {0: 0, 1: 0, 2: 0}
    # we need to know all the words we have
    UberDict = set()
    UberCols = set()
    vocs = {b.getKey(): b.json['vocabulary']
            for v in sleigh.venues
            for b in v.getBrands()
            if 'vocabulary' in b.json
            if len(b.json['vocabulary']) > 10}
    for vkey in vocs:
        UberDict.update(vocs[vkey].keys())
    # collocations are not quantified!
def report(s, r):
    statuses = (C.blue('PASS'), C.red('FAIL'), C.yellow('FIXD'))
    # non-verbose mode by default
    if verbose or r != 0:
        print('[ {} ] {}'.format(statuses[r], simpleLatin(s)))
    return r
    # non-verbose mode by default
    if verbose or r != 0:
        print('[ {} ] {}'.format(statuses[r], s))
    return r

if __name__ == "__main__":
    verbose = sys.argv[-1] == '-v'
    # Load all contributors
    people = {}
    for fn in glob.glob(ienputdir + '/people/*.json'):
        p = parseJSON(fn)
        people[p['name']] = p
    print('{}: {} people\n{}'.format(
        C.purple('BibSLEIGH'), C.red(len(people)), C.purple('=' * 42)))
    # check for duplicates
    bysurname = {}
    for name in people.keys():
        byword = name.split(' ')
        j = -1
        while -j < len(byword) and (byword[j - 1][0].islower()
                or byword[j - 1].lower() in ('de', 'di', 'du', 'van', 'von', 'le', 'la')):
            j -= 1
        surname = ' '.join(byword[j:])
        firstnames = ' '.join(byword[:j])
        if verbose:
            print('Thinking “{}” is “{}” + “{}”'.format(
def checkon(fn, o):
    if 'dblpkey' not in o.json.keys():
        print('[ {} ] {}'.format(C.red('DONT'), 'DBLP key not found on the entry'))
        return 1
    mykey = o.get('dblpkey')
    # for the rare case of multiple dblpkeys
    # (can happen as a DBLP error or when same proceedings span over multiple volumes)
    if isinstance(mykey, list):
        mykey = mykey[0]
    if mykey not in procs.keys():
        print('[ {} ] {}'.format(C.red('DONT'), 'DBLP key not found in the dump'))
        return 1
    title = procs[mykey]
    if title.endswith('.'):
        title = title[:-1]
    ws = title.replace(' - ', ', ').replace(' (', ', ').split(', ')
    country = findOneIn(knownCountries, ws)
    state = findOneIn(usaStateNames, ws)
    found = False
    if country:
        town = ws[ws.index(country) - 1]
        state = '?'
        # what if "town" is a USA state? (full)
        if country == 'USA' and town in usaStateNames:
            state = town
            town = ws[ws.index(town) - 1]
        # what if "town" is a USA state? (abbreviated)
        if country == 'USA' and town in usaStateAB:
            state = usaStateNames[usaStateAB.index(town)]
            town = ws[ws.index(town) - 1]
        # what if "town" is a Canadian state? (full)
        if country == 'Canada' and town in canStateNames:
            state = town
            town = ws[ws.index(town) - 1]
        # what if "town" is a Canadian state? (abbreviated)
        if country == 'Canada' and town in canStateAB:
            state = canStateNames[canStateAB.index(town)]
            town = ws[ws.index(town) - 1]
        # the same can happen in the UK
        if country in ('UK', 'United Kingdom') and town in ('Scotland', 'Scottland'):
            state = town
            town = ws[ws.index(town) - 1]
        # Georgia the country vs Georgia the state
        if country == 'Georgia' and town == 'Atlanta':
            state = country
            country = 'USA'
        # near Something
        if town.startswith('near '):
            town = ws[ws.index(town) - 1]
        # Luxembourg, Luxembourg
        if country == 'Luxembourg':
            town = 'Luxembourg'
        # Saint-Malo / St. Malo
        if country == 'France' and town == 'St. Malo':
            town = 'Saint-Malo'
        # Florence / Firenze
        if country == 'Italy' and town.find('Firenze') > -1:
            town = 'Florence'
        found = True
    elif state:
        country = 'USA'
        town = ws[ws.index(state) - 1]
        found = True
    else:
        # desperate times
        for sol in desperateSolutions.keys():
            if sol in ws:
                town, state, country = desperateSolutions[sol]
                found = True
    # normalise
    if country in countryMap.keys():
        country = countryMap[country]
    if country == 'United Kingdom' and state == '?':
        if town.endswith('London') or town in ('Birmingham', 'York',
                'Coventry', 'Nottingham', 'Lancaster', 'Oxford', 'Manchester',
                'Southampton', 'Norwich', 'Leicester', 'Canterbury'):
            state = 'England'
        elif town in ('Edinburgh', 'Glasgow'):
            state = 'Scotland'
    # report
    if 'address' in o.json.keys():
        print('[ {} ] {}'.format(C.blue('OLDA'), o.get('address')))
    if 'location' in o.json.keys():
        print('[ {} ] {}'.format(C.blue('OLDL'), o.get('location')))
    if found:
        # print('[ {} ] {}'.format(C.blue('KNOW'), country))
        print('[ {} ] {}'.format(C.blue('AD||'), title))
        print('[ {} ] {:30} || {:30} || {:20}'.format(
            C.blue('AD->'), C.yellow(town), C.yellow(state), C.yellow(country)))
        # TODO: perhaps later we can act more aggressively
        newaddr = [town, '' if state == '?' else state, country]
        if 'address' not in o.json.keys() or newaddr != o.json['address']:
            o.json['address'] = newaddr
            f = open(o.json['FILE'], 'w', encoding='utf-8')
            f.write(o.getJSON())
            f.close()
            return 2
        # nothing changed
        return 0
    print('[ {} ] {}'.format(C.yellow('AD??'), title))
    return 1
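A worked example of the location heuristic, on a hypothetical dump title:

# title = 'Proceedings of X, Portland, Oregon, USA, May 2015'
# ws contains [..., 'Portland', 'Oregon', 'USA', ...]; country = 'USA',
# the word before it ('Oregon') is a full state name, so state = 'Oregon'
# and town shifts one slot left to 'Portland'; the resulting address is
# ['Portland', 'Oregon', 'USA'].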
        else:
            y = v.replace('http://', '').replace('https://', '')
        r = '<a href="{0}">{1}</a>'.format(v, y)
    elif k == 'aka':
        ico = ''
        r = '<br/>'.join(['a.k.a.: “{}”'.format(z) for z in listify(v)])
    else:
        ico = ''
        r = '?{}?{}?'.format(k, v)
    return ico + ' ' + r + '<br/>'

if __name__ == "__main__":
    print('{}: {} venues, {} papers\n{}'.format(
        C.purple('BibSLEIGH'),
        C.red(len(sleigh.venues)),
        C.red(sleigh.numOfPapers()),
        C.purple('=' * 42)))
    ts = sleigh.getTags()
    tagged = []
    for key in ts.keys():
        f = open('{}/tag/{}.html'.format(outputdir, key), 'w', encoding='utf-8')
        # papers are displayed in reverse chronological order
        lst = [x.getRestrictedItem(key) for x in
               sorted(ts[key], key=lambda z: -z.json['year'] if 'year' in z.json.keys() else 0)]
        # no comprehension possible for this case
        for x in ts[key]:
            if x not in tagged:
                tagged.append(x)
def report(s, r):
    statuses = (C.blue('PASS'), C.red('FAIL'), C.yellow('FIXD'))
    # non-verbose mode by default
    if verbose or r != 0:
        print('[ {} ] {}'.format(statuses[r], s))
    return r
            v = 'http://' + v
        else:
            y = v.replace('http://', '').replace('https://', '')
        r = '<a href="{0}">{1}</a>'.format(v, y)
    elif k == 'aka':
        ico = ''
        r = '<br/>'.join(['a.k.a.: “{}”'.format(z) for z in listify(v)])
    else:
        ico = ''
        r = '?{}?{}?'.format(k, v)
    return ico + ' ' + r + '<br/>'

if __name__ == "__main__":
    print('{}: {} venues, {} papers\n{}'.format(
        C.purple('BibSLEIGH'),
        C.red(len(sleigh.venues)),
        C.red(sleigh.numOfPapers()),
        C.purple('=' * 42)))
    ts = sleigh.getTags()
    tagged = []
    for key in ts.keys():
        f = open('{}/tag/{}.html'.format(outputdir, key), 'w')
        # papers are displayed in reverse chronological order
        lst = [x.getRestrictedItem(key) for x in
               sorted(ts[key], key=lambda z: -z.json['year'] if 'year' in z.json.keys() else 0)]
        # no comprehension possible for this case
        for x in ts[key]:
            if x not in tagged:
                tagged.append(x)
        # read tag definition
        tagdef = parseJSON(ienputdir + '/tags/{}.json'.format(key))
def checkon(fn, o):
    if not os.path.exists(fn) or os.path.isdir(fn):
        fn = fn + '.json'
    plines = sorted([strictstrip(s) for s in o.getJSON().split('\n')[1:-1]])
    for k in o.json.keys():
        if 'type' not in o.json.keys():
            print('TERRIBLE', o.getKey())
        if (o.json['type'] == 'proceedings' and k == 'title') or \
           (o.json['type'] == 'inproceedings' and k == 'booktitle'):
            # fix numbers
            for nr in nrs.keys():
                if o.json[k].find(' ' + nr + ' ') > -1:
                    o.json[k] = o.json[k].replace(' ' + nr + ' ', ' ' + nrs[nr] + ' ')
        if isinstance(o.json[k], str):
            # add emdashes for fancier titles
            if k in ('title', 'booktitle'):
                o.json[k] = o.json[k].replace(' - ', ' — ').replace(' -- ', ' — ')
            # Nice heuristic to run from time to time, but reports too much
            # on stuff like “eXtreme” and “jPET”
            # if o.json[k][0].islower():
            #     print('[ {} ] {}: {} {}'.format(C.red('LOOK'), o.getKey(), 'title is', o.get('title')))
            # normalised pages
            if k == 'pages':
                o.json[k] = o.json[k].replace('–', '-').replace('--', '-').replace('−', '-')
            # double spaces
            if o.json[k].find('  ') > -1:
                o.json[k] = o.json[k].replace('  ', ' ').strip()
            # find numeric values, turn them into proper integers
            if o.json[k].isdigit():
                o.json[k] = int(o.json[k])
                continue
            # remove confix curlies
            elif o.json[k].startswith('{') and o.json[k].endswith('}'):
                o.json[k] = o.json[k][1:-1]
            # single quotes to double quotes
            elif o.json[k].find(" '") > -1 and o.json[k].find("' ") > -1:
                o.json[k] = o.json[k].replace(" '", ' "').replace("' ", '" ')
            elif o.json[k].find(" '") > -1 and o.json[k].endswith("'"):
                o.json[k] = o.json[k].replace(" '", ' "').replace("'", '"')
            elif o.json[k].find("' ") > -1 and o.json[k].startswith("'"):
                o.json[k] = o.json[k].replace("' ", '" ').replace("'", '"')
            # fancify bland quotes
            elif o.json[k].find(' "') > -1 and o.json[k].find('" ') > -1:
                o.json[k] = o.json[k].replace(' "', ' “').replace('" ', '” ')
            elif o.json[k].find(' "') > -1 and o.json[k].endswith('"'):
                o.json[k] = o.json[k].replace(' "', ' “').replace('"', '”')
            elif o.json[k].find('" ') > -1 and o.json[k].startswith('"'):
                o.json[k] = o.json[k].replace('" ', '” ').replace('"', '“')
            # fancify LaTeX quotes
            elif o.json[k].find(' ``') > -1 and o.json[k].find("'' ") > -1:
                o.json[k] = o.json[k].replace("'' ", '” ').replace(' ``', ' “')
            elif o.json[k].find(' ``') > -1 and o.json[k].endswith("''"):
                o.json[k] = o.json[k].replace("''", '”').replace(' ``', ' “')
            elif o.json[k].find("'' ") > -1 and o.json[k].startswith('``'):
                o.json[k] = o.json[k].replace("'' ", '” ').replace('``', '“')
            elif o.json[k].startswith('``') and o.json[k].endswith("''"):
                o.json[k] = '“' + o.json[k][2:-2] + '”'
            # plural possessive
            elif o.json[k].find("'s") > -1:
                o.json[k] = o.json[k].replace("'s", '’s')
            elif o.json[k].find("s' ") > -1:
                o.json[k] = o.json[k].replace("s'", 's’')
            # contractions
            elif o.json[k].find("n't") > -1:
                o.json[k] = o.json[k].replace("n't", 'n’t')
            # the case of "Jr" vs "Jr."
            if k in ('author', 'editor') and o.json[k].endswith('Jr'):
                o.json[k] += '.'
            # TODO: report remaining suspicious activity
            for c in '`"\'':
                # ’ is ok
                if c in o.json[k] and k not in ('author', 'editor'):
                    print('[ {} ] {}: {} is “{}”'.format(C.red('LOOK'), o.getKey(), k, o.json[k]))
                    lookat.append(o.filename)
        elif isinstance(o.json[k], list):
            # inline trivial lists
            if len(o.json[k]) == 1:
                o.json[k] = o.json[k][0]
            # inline hidden trivial lists
            if len(o.json[k]) == 2 and o.json[k][0] == o.json[k][1] \
                    and k not in ('stemmed', 'tag', 'tagged'):
                o.json[k] = o.json[k][0]
            # unless it’s 'tagged'
            if k == 'tagged' and not isinstance(o.json[k][0], list):
                o.json[k] = [o.json[k]]
            # remove DBLP disambiguation: we might later regret it
            # but the information can be always re-retrieved
            if k in ('author', 'editor'):
                nas = []
                for a in o.json[k]:
                    # double spaces
                    if a.find('  ') > -1:
                        a = a.replace('  ', ' ').strip()
                    ws = a.split(' ')
                    if ws[-1].isdigit():
                        ws = ws[:-1]
                    nas.append(' '.join(ws))
                o.json[k] = nas
                # the case of "Jr" vs "Jr."
                o.json[k] = [a + '.' if a.endswith(' Jr') else a for a in o.json[k]]
    nlines = sorted([strictstrip(s) for s in o.getJSON().split('\n')[1:-1]])
    if plines != nlines:
        f = open(fn, 'w', encoding='utf-8')
        f.write(o.getJSON())
        f.close()
        return 2
    else:
        return 0
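Examples of the string normalisations above, on hypothetical field values; note the quote fixes form an elif chain, so one run applies one transform and values converge over repeated runs of the normaliser:

# pages  '12--34'          -> '12-34'
# title  "the 'right' way" -> 'the "right" way' -> (next run) 'the “right” way'
# title  "``Hello'' there" -> '“Hello” there'
# author 'John Doe 002'    -> 'John Doe'   (DBLP disambiguation suffix dropped)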
    if verbose or r != 0:
        print('[ {} ] {}'.format(statuses[r], fn))
    return r

def two(n):
    if n < 10:
        return '0{}'.format(n)
    else:
        return '{}'.format(n)

if __name__ == "__main__":
    verbose = sys.argv[-1] == '-v'
    peoplez = glob.glob(ienputdir + '/people/*.json')
    print('{}: {} venues, {} papers by {} people\n{}'.format(
        C.purple('BibSLEIGH'),
        C.red(len(sleigh.venues)),
        C.red(sleigh.numOfPapers()),
        C.red(len(peoplez)),
        C.purple('=' * 42)))
    cx = {0: 0, 1: 0, 2: 0}
    # stem ALL the papers!
    for v in sleigh.venues:
        for c in v.getConfs():
            for p in c.papers:
                cx[checkreport(p.filename, p, None)] += 1
        for b in v.getBrands():
            cx[checkreport(b.filename, None, b)] += 1
    # write all stems
    listOfStems = sorted(filter(ifApproved, ALLSTEMS), key=lambda w: two(len(w)) + w)
    f = open(ienputdir + '/stems.json', 'w')
    f.write('[\n\t"' + '",\n\t"'.join(listOfStems) + '"\n]')
from fancy.Templates import wordlistHTML, wordHTML
from lib.AST import Sleigh, escape
from lib.JSON import parseJSON
from lib.NLP import ifApproved
from collections import Counter

ienputdir = '../json'
outputdir = '../frontend'
n2f_name = '_name2file.json'
name2file = parseJSON(n2f_name) if os.path.exists(n2f_name) else {}
sleigh = Sleigh(ienputdir + '/corpus', name2file)

if __name__ == "__main__":
    print('{}: {} venues, {} papers\n{}'.format(
        C.purple('BibSLEIGH'),
        C.red(len(sleigh.venues)),
        C.red(sleigh.numOfPapers()),
        C.purple('=' * 42)))
    stems = sleigh.getStems()
    tagged = []
    for k in stems.keys():
        f = open('{}/word/{}.html'.format(outputdir, k), 'w', encoding='utf-8')
        # papers are displayed in reverse chronological order
        lst = [x.getIItem() for x in
               sorted(stems[k], key=lambda z: -z.json['year'] if 'year' in z.json.keys() else 0)]
        # collect other stems
        # NB: do not use the following code, slows everything down from 1 minute to 161 minutes
        # allstems = []
        # for x in stems[k]:
        #     allstems += x.getBareStems()
        # siblings = {stem: allstems.count(stem) for stem in allstems if stem != k and ifApproved(stem)}
def checkreport(fn, o):
    statuses = (C.blue('PASS'), C.red('FAIL'), C.yellow('FIXD'))
    r = checkon(fn, o)
    # non-verbose mode by default
    if verbose or r != 0:
        print('[ {} ] {}'.format(statuses[r], fn))
    return r

if __name__ == "__main__":
    if len(sys.argv) > 1:
        verbose = sys.argv[1] == '-v'
    print('{}: {} venues, {} papers\n{}'.format(
        C.purple('BibSLEIGH'),
        C.red(len(sleigh.venues)),
        C.red(sleigh.numOfPapers()),
        C.purple('=' * 42)))
    # read the CSV
    f = open('scrap-committees/scraped-by-grammarware.csv', 'r', encoding='utf-8')
    # CBSE;2001;Heinz;Schmidt;;Organising Committee
    for line in f.readlines():
        vs = line.strip().split(';')
        if len(vs) == 0:
            continue
        v = vs[0] + '-' + vs[1]
        n = vs[2] + ' ' + vs[3]
        # normalise!
        if n in renameto.keys():
    else:
        paperPdf = ''
    paperEntry = {'type': 'inproceedings', 'series': 'CEUR Workshop Proceedings',
                  'publisher': 'CEUR-WS.org', 'year': volYear, 'booktitle': volTitles[-1],
                  'editor': volEds, 'volume': volNr.split('-')[-1], 'title': paperTitle,
                  'author': paperAuths, 'pages': paperPages, 'venue': volVenue}
    if paperPdf:
        paperEntry['openpdf'] = paperPdf
    if paperLnk:
        paperEntry['url'] = urlstart + '#' + paperLnk
    paperFilename = lastSlash(outputdir) + '-' + paperAuths[0].split(' ')[-1]
    for a in paperAuths[1:]:
        print(a)
        paperFilename += a.split(' ')[-1][0]
    if paperFilename in done:
        paperFilename += 'a'
    while paperFilename in done:
        paperFilename = paperFilename[:-1] + chr(ord(paperFilename[-1]) + 1)
    # print(jsonify(paperEntry), '-->', outputdir+'/'+paperFilename+'.json')
    f = open(outputdir + '/' + paperFilename + '.json', 'w', encoding='utf-8')
    f.write(jsonify(paperEntry))
    f.close()
    cx += 1
    done.append(paperFilename)
print(C.red(volVenue), '-', C.yellow(volTitles[-1]), '-', C.blue(cx), 'papers.')
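A hypothetical walk-through of the filename scheme above:

# authors ['Jane Doe', 'John Roe', 'Ann Poe'], output dir '.../XYZ'
#   -> 'XYZ-Doe' + 'R' + 'P' == 'XYZ-DoeRP'
# on a clash: 'XYZ-DoeRPa', then 'XYZ-DoeRPb', and so on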
def report(one, two):
    print('[ {} ] {}'.format(one, two))

def checkreport(fn, o):
    statuses = (C.blue('PASS'), C.red('FAIL'), C.yellow('FIXD'))
    r = checkon(fn, o)
    # non-verbose mode by default
    if verbose or r != 0:
        report(statuses[r], fn)
    return r

if __name__ == "__main__":
    if len(sys.argv) > 1:
        verbose = sys.argv[1] == '-v'
    print('{}: {} venues, {} papers\n{}'.format(
        C.purple('BibSLEIGH'),
        C.red(len(sleigh.venues)),
        C.red(sleigh.numOfPapers()),
        C.purple('=' * 42)))
    cx = {0: 0, 1: 0, 2: 0}
    for v in sleigh.venues:
        for c in v.getConfs():
            cx[checkreport(c.filename, c)] += 1
            for p in c.papers:
                cx[checkreport(p.filename, p)] += 1
    print('{} files checked, {} ok, {} fixed, {} failed'.format(
        C.bold(cx[0] + cx[1] + cx[2]), C.blue(cx[0]), C.yellow(cx[2]), C.red(cx[1])))
def report(fn1, fn2, r):
    statuses = (C.blue(' PASS '), C.red(' FAIL '), C.yellow('RENAME'))
    # non-verbose mode by default
    if verbose or r != 0:
        print('[ {} ] {} → {}'.format(statuses[r], fn1, fn2))
    return r
def checkon(fn, o):
    if not os.path.exists(fn) or os.path.isdir(fn):
        fn = fn + '.json'
    f = open(fn, 'r', encoding='utf-8')
    lines = f.readlines()[1:-1]
    f.close()
    flines = json2lines(lines)
    plines = sorted(json2lines(o.getJSON().split('\n')))
    # "url" from DBLP are useless
    if 'url' in o.json.keys():
        o.json['url'] = [link.replace('https://', 'http://')
                         for link in listify(o.json['url'])
                         if not link.startswith('db/conf/')
                         and not link.startswith('db/series/')
                         and not link.startswith('db/books/')
                         and not link.startswith('db/journals/')]
        if not o.json['url']:
            del o.json['url']
        elif len(o.json['url']) == 1:
            o.json['url'] = o.json['url'][0]
    if 'ee' in o.json.keys() and 'doi' not in o.json.keys():
        if isinstance(o.json['ee'], list):
            if verbose:
                print(C.red('Manylink:'), o.json['ee'])
        newee = []
        for onelink in listify(o.json['ee']):
            if onelink.startswith('http://dx.doi.org/'):
                o.json['doi'] = onelink[18:]
            elif onelink.startswith('http://doi.acm.org/'):
                o.json['doi'] = onelink[19:]
            elif onelink.startswith('http://doi.ieeecomputersociety.org/'):
                o.json['doi'] = onelink[35:]
            elif onelink.startswith('http://dl.acm.org/citation.cfm?id='):
                o.json['acmid'] = onelink[34:]
            elif onelink.startswith('http://portal.acm.org/citation.cfm?id='):
                o.json['acmid'] = onelink[38:]
            elif onelink.startswith('http://ieeexplore.ieee.org/xpl/freeabs_all.jsp?arnumber=') \
                    or onelink.startswith('http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber='):
                o.json['ieeearid'] = onelink.split('=')[-1]
            elif onelink.startswith('http://ieeexplore.ieee.org/xpls/abs_all.jsp?isnumber=') \
                    and onelink.find('arnumber') > -1:
                o.json['ieeearid'] = onelink.split('arnumber=')[-1].split('&')[0]
            elif onelink.startswith('http://ieeexplore.ieee.org/xpl/mostRecentIssue.jsp?punumber='):
                o.json['ieeepuid'] = onelink.split('=')[-1]
            elif onelink.startswith('http://ieeexplore.ieee.org/xpl/tocresult.jsp?isnumber='):
                o.json['ieeeisid'] = onelink.split('=')[-1]
            elif onelink.startswith('http://eceasst.cs.tu-berlin.de/index.php/eceasst/article/view/'):
                newee.append('http://journal.ub.tu-berlin.de/eceasst/article/view/' + onelink.split('/')[-1])
            elif onelink.endswith('.pdf') and \
                    (onelink.startswith('http://computer.org/proceedings/')
                     or onelink.startswith('http://csdl.computer.org/')):
                # Bad:  http://computer.org/proceedings/icsm/1189/11890007.pdf
                # Bad:  http://csdl.computer.org/comp/proceedings/date/2003/1870/02/187020040.pdf
                # Good: http://www.computer.org/csdl/proceedings/icsm/2001/1189/00/11890004.pdf
                if onelink.startswith('http://csdl'):
                    cname, _, cid, mid, pid = onelink.split('/')[5:10]
                else:
                    cname, cid, pid = onelink.split('/')[4:7]
                    # heuristic
                    if pid.startswith(cid):
                        mid = pid[len(cid):len(cid) + 2]
                    else:
                        mid = '00'
                newee.append('http://www.computer.org/csdl/proceedings/{}/{}/{}/{}/{}'.format(
                    cname, o.get('year'), cid, mid, pid))
            else:
                if onelink.find('ieee') > -1:
                    print(C.purple('IEEE'), onelink)
                if verbose:
                    print(C.yellow('Missed opportunity:'), onelink)
                # nothing matches => preserve
                newee.append(onelink)
        if len(newee) == 0:
            del o.json['ee']
        elif len(newee) == 1:
            o.json['ee'] = newee[0]
        else:
            o.json['ee'] = newee
    # post-processing normalisation
    if 'acmid' in o.json.keys() and not isinstance(o.json['acmid'], int) and o.json['acmid'].isdigit():
        o.json['acmid'] = int(o.json['acmid'])
    if 'eventuri' in o.json.keys():
        o.json['eventurl'] = o.json['eventuri']
        del o.json['eventuri']
    if 'eventurl' in o.json.keys() and o.json['eventurl'].startswith('https://'):
        o.json['eventurl'] = o.json['eventurl'].replace('https://', 'http://')
    nlines = sorted(json2lines(o.getJSON().split('\n')))
    if flines != plines:
        return 1
    elif plines != nlines:
        f = open(fn, 'w', encoding='utf-8')
        f.write(o.getJSON())
        f.close()
        return 2
    else:
        return 0
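A sketch of what the link normalisation does to a hypothetical entry:

# 'ee': 'http://dx.doi.org/10.1145/1234567.1234568'
#     -> json['doi'] = '10.1145/1234567.1234568'
# 'ee': 'http://dl.acm.org/citation.cfm?id=1234568'
#     -> json['acmid'] = '1234568' (turned into int 1234568 in post-processing)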
    return 0

def checkreport(fn, o):
    statuses = (C.blue('PASS'), C.red('FAIL'), C.yellow('FIXD'))
    r = checkon(fn, o)
    # non-verbose mode by default
    if verbose or r != 0:
        print('[ {} ] {}'.format(statuses[r], fn))
    return r

if __name__ == "__main__":
    if len(sys.argv) > 1:
        verbose = sys.argv[1] == '-v'
    print('{}: {} venues, {} papers\n{}'.format(
        C.purple('BibSLEIGH'),
        C.red(len(sleigh.venues)),
        C.red(sleigh.numOfPapers()),
        C.purple('=' * 42)))
    # read the CSV
    f = open('scrap-committees/scraped-by-grammarware.csv', 'r')
    # CBSE;2001;Heinz;Schmidt;;Organising Committee
    for line in f.readlines():
        vs = line.strip().split(';')
        if len(vs) == 0:
            continue
        v = vs[0] + '-' + vs[1]
        n = vs[2] + ' ' + vs[3]
        # normalise!
        if n in renameto.keys():
            print('[', C.yellow('ALIA'), ']', 'Treating', n, 'as', renameto[n])
            n = renameto[n]
def linkto(n):
    if n in name2file:
        return '<a href="{}">{}</a>'.format(name2file[n], shorten(n))
    else:
        return n

def pad(n):
    X = str(n)
    while len(X) < 4:
        X = '0' + X
    return X

if __name__ == "__main__":
    print('{}: {} venues, {} papers\n{}'.format(
        C.purple('BibSLEIGH'),
        C.red(len(sleigh.venues)),
        C.red(sleigh.numOfPapers()),
        C.purple('=' * 42)))
    ps = []
    # flatten the sleigh
    bykey = {}
    for v in sleigh.venues:
        bykey[v.getKey()] = v
        for c in v.getConfs():
            bykey[c.getKey()] = c
            for p in c.papers:
                bykey[p.getKey()] = p
    print(C.purple('BibSLEIGH flattened to {} entries'.format(len(bykey))))
    # tagged = []
    # for k in ts.keys():
    peoples = {}
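Quick examples for pad, which left-pads to four characters for stable lexicographic sorting:

# pad(7) == '0007'; pad(42) == '0042'; pad(2015) == '2015'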
def checkreport(fn, o):
    statuses = (C.blue('PASS'), C.red('FAIL'), C.yellow('FIXD'))
    r = checkon(fn, o)
    # non-verbose mode by default
    if verbose or r != 0:
        print('[ {} ] {}'.format(statuses[r], fn))
    return r

if __name__ == "__main__":
    verbose = sys.argv[-1] == '-v'
    print('{}: {} venues, {} papers\n{}'.format(
        C.purple('BibSLEIGH'),
        C.red(len(sleigh.venues)),
        C.red(sleigh.numOfPapers()),
        C.purple('=' * 42)))
    cx = {0: 0, 1: 0, 2: 0}
    for v in sleigh.venues:
        # tags per venue
        for c in v.getConfs():
            cx[checkreport(c.filename, c)] += 1
        for b in v.brands:
            cx[checkreport(b.filename, b)] += 1
        cx[checkreport(v.filename, v)] += 1
    print('{} files checked, {} ok, {} fixed, {} failed'.format(
        C.bold(cx[0] + cx[1] + cx[2]), C.blue(cx[0]), C.yellow(cx[2]), C.red(cx[1])))
        people.update(listify(c.json['editor']))
        for p in c.papers:
            if 'author' in p.json:
                people.update(listify(p.json['author']))
for a in people:
    for na in (nodiaLatin(a), simpleLatin(a)):
        if na != a:
            aka.setdefault(a, [])
            aka[a].append(na)
# invert aliasing
for akey in aka:
    if akey in ('ZZZZZZZZZZ', 'FILE'):
        continue
    for aval in aka[akey]:
        renameto[aval] = akey
f = open('_renameto.json', 'w', encoding='utf-8')
f.write(json.dumps(renameto, sort_keys=True, separators=(',\n\t', ': '), ensure_ascii=False))
f.close()
cx = {0: 0, 1: 0, 2: 0}
for v in sleigh.venues:
    for c in v.getConfs():
        cx[checkreport(c.filename, c)] += 1
        for p in c.papers:
            cx[checkreport(p.filename, p)] += 1
print('{} aliasing rules, {} of them manual.'.format(len(renameto), CX))
print('{} files checked, {} ok, {} fixed, {} failed'.format(
    C.bold(cx[0] + cx[1] + cx[2]), C.blue(cx[0]), C.yellow(cx[2]), C.red(cx[1])))
    return r

def two(n):
    if n < 10:
        return '0{}'.format(n)
    else:
        return '{}'.format(n)

if __name__ == "__main__":
    verbose = sys.argv[-1] == '-v'
    peoplez = glob.glob(ienputdir + '/people/*.json')
    print('{}: {} venues, {} papers by {} people\n{}'.format(
        C.purple('BibSLEIGH'),
        C.red(len(sleigh.venues)),
        C.red(sleigh.numOfPapers()),
        C.red(len(peoplez)),
        C.purple('=' * 42)))
    cx = {0: 0, 1: 0, 2: 0}
    # stem ALL the papers!
    for v in sleigh.venues:
        for c in v.getConfs():
            for p in c.papers:
                cx[checkreport(p.filename, p, None)] += 1
        for b in v.getBrands():
            cx[checkreport(b.filename, None, b)] += 1
    # write all stems
    listOfStems = sorted(filter(ifApproved, ALLSTEMS), key=lambda w: two(len(w)) + w)
    f = open(ienputdir + '/stems.json', 'w', encoding='utf-8')
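The sort key two(len(w)) + w orders stems by length first, then alphabetically within each length:

# 'ast' -> '03ast', 'tree' -> '04tree', 'parse' -> '05parse'
# so 'ast' < 'tree' < 'parse' in the written stems.json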
f = open('../conferenceMetrics/data/SE-conf-roles.csv', 'r')
for line in f.readlines():
    # Conference;Year;First Name;Last Name;Sex;Role
    csv.append(line.strip().split(';'))
f.close()
f = open('scrap-committees/scraped-by-grammarware.csv', 'r')
for line in f.readlines():
    csv.append(line.strip().split(';'))
f.close()
# All known contributors
people = {}
for fn in glob.glob(ienputdir + '/people/*.json'):
    p = parseJSON(fn)
    # people.append(p)
    if 'name' not in p.keys():
        print('[', C.red('NOGO'), ']', 'No name in', fn)
        continue
    people[p['name']] = p
print('{}: {} venues, {} papers\n{}'.format(
    C.purple('BibSLEIGH'),
    C.red(len(sleigh.venues)),
    C.red(sleigh.numOfPapers()),
    C.purple('=' * 42)))
# All people who ever contributed
names = []
for v in sleigh.venues:
    for c in v.getConfs():
        for p in c.papers:
            for k in ('author', 'editor'):
                if k in p.json.keys():
                    names += [a for a in listify(p.json[k]) if a not in names]
    statuses = (C.blue('PASS'), C.red('FAIL'), C.yellow('FIXD'))
    # non-verbose mode by default
    if verbose or r != 0:
        print('[ {} ] {}'.format(statuses[r], simpleLatin(s)))
    return r

if __name__ == "__main__":
    verbose = sys.argv[-1] == '-v'
    # All known contributors
    cx = {0: 0, 1: 0, 2: 0}
    people = {}
    for fn in glob.glob(ienputdir + '/people/*.json'):
        p = parseJSON(fn)
        if p['name'] in people.keys():
            cx[report(C.red('duplicate') + ' ' + C.yellow(p), 1)] += 1
            continue
        people[p['name']] = p
    print('{}: {} venues, {} papers written by {} people\n{}'.format(
        C.purple('BibSLEIGH'),
        C.red(len(sleigh.venues)),
        C.red(sleigh.numOfPapers()),
        C.red(len(people)),
        C.purple('=' * 42)))
    # traverse ALL the papers!
    for v in sleigh.venues:
        for c in v.getConfs():
            for p in c.papers:
                if 'author' in p.json.keys():
                    for a in listify(p.json['author']):
                        if a in people.keys():