def report(fn, r):
    statuses = (C.blue('PASS'), C.red('FAIL'), C.yellow('UNEX'))
    special = ('', '- no crossref found!', '- illegal crossref')
    # non-verbose mode by default
    if verbose or r != 0:
        print('[ {} ] {} {}'.format(statuses[r], fn, special[r]))
    return r
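# The return code doubles as an index into the status tuple: 0 means PASS,
# 1 means FAIL, 2 means the third, script-specific outcome (here UNEX).
# Callers can thus tally outcomes with a counter, as the main blocks below do:
#     cx = {0: 0, 1: 0, 2: 0}
#     cx[report(fn, r)] += 1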
def checkon(fn, o):
    if not os.path.exists(fn) or os.path.isdir(fn):
        fn = fn + '.json'
    if not os.path.exists(fn):
        # if it still does not exist, let us create a minimal one
        f = open(fn, 'w')
        f.write('{{\n\t"title": "{name}",\n\t"type": "proceedings",\n\t"year": {year}\n}}'.format(
            name=fn.split('/')[-1][:-5].replace('-', ' '),
            year=findYear(fn.split('/')[-1])))
        f.close()
        print('[ {} ] {}'.format(C.yellow('MADE'), fn))
        return 2
    f = open(fn, 'r')
    lines = f.readlines()[1:-1]
    f.close()
    for line in lines:
        if line.find('"year"') > -1 and findYear(line) > 3000:
            os.remove(fn)
            print('[ {} ] {}'.format(C.red('KILL'), fn))
            return 1
    flines = sorted([strictstrip(s) for s in lines])
    plines = sorted([strictstrip(s) for s in o.getJSON().split('\n')[1:-1]])
    if flines != plines:
        f1 = [line for line in flines if line not in plines]
        f2 = [line for line in plines if line not in flines]
        print('∆:', f1, '\nvs', f2)
    if flines == plines:
        return 0
    else:
        return 1
def checkreport(m, o):
    statuses = (C.blue('PASS'), C.red('FAIL'), C.yellow('FIXD'))
    r = checkon(m, o)
    # non-verbose mode by default
    if verbose or r != 0:
        print('[ {} ] {}'.format(statuses[r], o.filename))
    return r
def checkreport(fn, o):
    statuses = (C.blue('PASS'), C.red('FAIL'), C.yellow('FIXD'))
    r = checkon(fn, o)
    # non-verbose mode by default
    if verbose or r != 0:
        report(statuses[r], fn)
    return r
def checkon(fn, o):
    if not os.path.exists(fn) or os.path.isdir(fn):
        fn = fn + '.json'
    if not os.path.exists(fn):
        # if it still does not exist, let us create a minimal one
        f = open(fn, 'w', encoding='utf-8')
        f.write('{{\n\t"title": "{name}",\n\t"type": "proceedings",\n\t"year": {year}\n}}'.format(
            name=lastSlash(fn)[:-5].replace('-', ' '),
            year=findYear(lastSlash(fn))))
        f.close()
        print('[ {} ] {}'.format(C.yellow('MADE'), fn))
        return 2
    f = open(fn, 'r', encoding='utf-8')
    lines = f.readlines()[1:-1]
    f.close()
    for line in lines:
        if line.find('"year"') > -1 and findYear(line) > 3000:
            os.remove(fn)
            print('[ {} ] {}'.format(C.red('KILL'), fn))
            return 1
    flines = sorted([strictstrip(s) for s in lines])
    plines = sorted([strictstrip(s) for s in o.getJSON().split('\n')[1:-1]])
    if flines != plines:
        f1 = [line for line in flines if line not in plines]
        f2 = [line for line in plines if line not in flines]
        print('∆:', f1, '\nvs', f2)
    if flines == plines:
        return 0
    else:
        return 1
def checkreport(fn, o):
    statuses = (C.blue('PASS'), C.red('FAIL'), C.yellow('FIXD'))
    r = checkon(fn, o)
    # non-verbose mode by default
    if verbose or r != 0:
        print('[ {} ] {}'.format(statuses[r], fn))
    return r
def checkreport(fn, o):
    statuses = (C.blue('PASS'), C.red('FAIL'), C.yellow('WARN'))
    r, msg = checkon(fn, o)
    # non-verbose mode by default
    if verbose or r != 0:
        print('[ {} ] {}: {}'.format(statuses[r], fn, msg))
    return r
def checkreport(fn, o):
    statuses = (C.blue('PASS'), C.red('FAIL'), C.yellow('FIXD'))
    if isinstance(o, int):
        r = o
    else:
        r = checkon(fn, o)
    # non-verbose mode by default
    if verbose or r != 0:
        print('[ {} ] {}'.format(statuses[r], fn))
    return r
def checkreport(fn, o, br):
    statuses = (C.blue('PASS'), C.red('FAIL'), C.yellow('FIXD'))
    if br:
        r = checkbrand(fn, br)
    else:
        r = checkon(fn, o)
    # non-verbose mode by default
    if verbose or r != 0:
        print('[ {} ] {}'.format(statuses[r], fn))
    return r
def checkon(m, o):
    # if no common model found, we failed
    if not m:
        return 1
    if 'type' in m.keys() and m['type'] in ('inproceedings', 'article'):
        m['type'] = 'proceedings'
    if 'type' in m.keys() and m['type'] == 'incollection':
        m['type'] = 'book'
    if 'crossref' in m.keys():
        del m['crossref']
    if 'booktitle' in m.keys():
        m['title'] = m['booktitle']
        del m['booktitle']
    if 'booktitleshort' in m.keys():
        # TODO: ???
        del m['booktitleshort']
    r = 0
    n = {}
    for k in m.keys():
        if o.get(k) == m[k]:
            if verbose:
                print(C.blue('Confirmed: '), k, 'as', m[k])
        else:
            if verbose:
                print(C.red('Conflicted: '), k, 'as', m[k], 'vs', o.get(k))
            v = heurichoose(k, m[k], o.json[k]) if k in o.json.keys() else m[k]
            if verbose:
                print(C.yellow('Settled for:'), v)
            n[k] = v
            r = 2
    if r == 0:
        return r
    if r == 2 and not n:
        # nothing to fix?!
        return 0
    if not os.path.exists(o.filename):
        return 0
    if os.path.isdir(o.filename):
        fn = o.filename + '.json'
    else:
        fn = o.filename
    if os.path.exists(fn):
        f = open(fn, 'r', encoding='utf-8')
        lines = f.read()
        f.close()
        if lines != o.getJSON():
            # strange, should be equal (run all normalisers first!)
            return 1
    for k in n.keys():
        o.json[k] = n[k]
    f = open(fn, 'w', encoding='utf-8')
    f.write(o.getJSON())
    f.close()
    return 2
def checkon(m, o):
    # if no common model found, we failed
    if not m:
        return 1
    if 'type' in m.keys() and m['type'] in ('inproceedings', 'article'):
        m['type'] = 'proceedings'
    if 'type' in m.keys() and m['type'] == 'incollection':
        m['type'] = 'book'
    if 'crossref' in m.keys():
        del m['crossref']
    if 'booktitle' in m.keys():
        m['title'] = m['booktitle']
        del m['booktitle']
    if 'booktitleshort' in m.keys():
        # TODO: ???
        del m['booktitleshort']
    r = 0
    n = {}
    for k in m.keys():
        if o.get(k) == m[k]:
            if verbose:
                print(C.blue('Confirmed: '), k, 'as', m[k])
        else:
            if verbose:
                print(C.red('Conflicted: '), k, 'as', m[k], 'vs', o.get(k))
            v = heurichoose(k, m[k], o.json[k]) if k in o.json.keys() else m[k]
            if verbose:
                print(C.yellow('Settled for:'), v)
            n[k] = v
            r = 2
    if r == 0:
        return r
    if r == 2 and not n:
        # nothing to fix?!
        return 0
    if not os.path.exists(o.filename):
        return 0
    if os.path.isdir(o.filename):
        fn = o.filename + '.json'
    else:
        fn = o.filename
    if os.path.exists(fn):
        f = open(fn, 'r')
        lines = f.read()
        f.close()
        if lines != o.getJSON():
            # strange, should be equal (run all normalisers first!)
            return 1
    for k in n.keys():
        o.json[k] = n[k]
    f = open(fn, 'w')
    f.write(o.getJSON())
    f.close()
    return 2
def __init__(self, d, hdir, name2file, parent):
    super(Venue, self).__init__(d, hdir)
    self.years = []
    self.brands = []
    self.n2f = name2file
    if os.path.exists(d + '.json'):
        # new style
        # print(C.blue(d), 'is new style')
        self.json = parseJSON(d + '.json')
    else:
        # legacy style
        print(C.red(d), 'is legacy style')
        self.json = []
        for f in glob.glob(d + '/*.json'):
            if not self.json:
                self.json = parseJSON(f)
            else:
                self.brands.append(Brand(f, self.homedir, name2file, self))
    for f in glob.glob(d + '/*'):
        if f.endswith('.json'):
            # already processed
            continue
        elif os.path.isdir(f):
            y = Year(f, self.homedir, name2file, self)
            self.years.append(y)
            for b in self.brands:
                for c in y.confs:
                    b.offer(y.year, c)
        else:
            print('File out of place:', f)
    self.back = parent
def __init__(self, d, hdir, name2file, parent):
    super(Venue, self).__init__(d, hdir)
    self.years = []
    self.brands = []
    self.n2f = name2file
    if os.path.exists(d + '.json'):
        # new style
        # print(C.blue(d), 'is new style')
        self.json = parseJSON(d + '.json')
    else:
        # legacy style
        print(C.red(d), 'is legacy style')
        self.json = {}
        for f in glob.glob(d + '/*.json'):
            if not self.json:
                self.json = parseJSON(f)
            else:
                self.brands.append(Brand(f, self.homedir, name2file, self))
    for f in glob.glob(d + '/*'):
        if f.endswith('.json'):
            # already processed
            continue
        elif os.path.isdir(f):
            y = Year(f, self.homedir, name2file, self)
            self.years.append(y)
            for b in self.brands:
                for c in y.confs:
                    b.offer(y.year, c)
        else:
            print('File out of place:', f)
    self.back = parent
def processSortedRel(r):
    # [ {"x" : Y } ] where Y can be a string or a sorted rel
    global pcx
    acc = []
    for el in r:
        ename = list(el.keys())[0]
        evals = el[ename]
        if os.path.isfile(outputdir + '/stuff/' + ename.lower() + '.png'):
            img = '<img src="../stuff/{1}.png" alt="{0}" width="30px"/> '.format(ename, ename.lower())
        else:
            img = ''
        if isinstance(evals, str):
            plst = sorted(matchfromsleigh(sleigh, evals), key=sortbypages)
            pcx += len(plst)
            ptxt = '<dl class="toc">' + '\n'.join([p.getItem() for p in plst]) + '</dl>'
        elif isinstance(evals, list) and isinstance(evals[0], str):
            plst = sorted(matchfromsleigh(sleigh, evals), key=sortbypages)
            pcx += len(plst)
            ptxt = '<dl class="toc">' + '\n'.join([p.getItem() for p in plst]) + '</dl>'
        elif isinstance(evals, list) and isinstance(evals[0], dict):
            ptxt = processSortedRel(evals)
        else:
            print(C.red('ERROR:'), 'unrecognised bundle structure', evals)
        acc.append('<dl><dt>{}{}</dt><dd>{}</dl>'.format(img, ename, ptxt))
    return '\n'.join(acc)
def __init__(self, idir, name2file):
    super(Sleigh, self).__init__('', idir)
    self.venues = []
    self.n2f = name2file
    jsons = {}
    skip4Now = []
    for d in glob.glob(idir + '/*.json'):
        if d.split('/')[-1].split('.')[0] in skip4Now:
            print(C.red('Skipping') + ' ' + C.purple(d) + ' ' + C.red('for now'))
            continue
        jsons[d.split('/')[-1].split('.')[0]] = d
    for d in glob.glob(idir + '/*'):
        cont = False
        for end in ('.md', '.json', '/frem', '/edif'):
            if d.endswith(end):
                cont = True
        if d.split('/')[-1] in skip4Now:
            print(C.red('Skipping') + ' ' + C.purple(d) + ' ' + C.red('for now'))
            cont = True
        if cont:
            continue
        if d.split('/')[-1] not in jsons.keys():
            print(C.red('Legacy non-top definition of'), d)
            self.venues.append(Venue(d, idir, name2file, self))
        else:
            self.venues.append(Venue(d, idir, name2file, self))
def __init__(self, idir, name2file):
    super(Sleigh, self).__init__('', idir)
    self.venues = []
    self.n2f = name2file
    jsons = {}
    skip4Now = []
    for d in glob.glob(idir + '/*.json'):
        if lastSlash(d).split('.')[0] in skip4Now:
            print(C.red('Skipping') + ' ' + C.purple(d) + ' ' + C.red('for now'))
            continue
        jsons[lastSlash(d).split('.')[0]] = d
    for d in glob.glob(idir + '/*'):
        cont = False
        for end in ('.md', '.json', '/frem', '/edif'):
            if d.endswith(end):
                cont = True
        if d.split('/')[-1] in skip4Now:
            print(C.red('Skipping') + ' ' + C.purple(d) + ' ' + C.red('for now'))
            cont = True
        if cont:
            continue
        if lastSlash(d) not in jsons.keys():
            print(C.red('Legacy non-top definition of'), d)
            if lastSlash(d) not in ('edif', 'frem'):
                self.venues.append(Venue(d, idir, name2file, self))
        else:
            self.venues.append(Venue(d, idir, name2file, self))
def parseJSON(fn):
    # print('Parsing',fn,'...')
    try:
        j = json.load(open(fn, 'r', encoding='utf-8'))
        j['FILE'] = fn
        return j
    except ValueError:
        print(C.red('JSON parse error'), 'on', fn.replace('\\', '/'))
        return {}
def parseJSON(fn):
    # print('Parsing',fn,'...')
    try:
        j = json.load(open(fn, 'r'))
        j['FILE'] = fn
        return j
    except ValueError:
        print(C.red('JSON parse error'), 'on', fn)
        return {}
def guessYear(P):
    cys = [int(w) for w in P.split('-') if len(w) == 4 and w.isdigit()]
    if len(cys) == 1:
        return cys[0]
    else:
        j = sleigh.seekByKey(P)
        if 'year' in j.json.keys():
            return j.get('year')
        elif 'year' in dir(j):
            return j.year
        else:
            print('[ {} ] {}'.format(C.red('YEAR'), P))
            return 0
def guessYear(p):
    cys = [int(w) for w in p.split('-') if len(w) == 4 and w.isdigit()]
    if len(cys) == 1:
        return cys[0]
    else:
        j = sleigh.seekByKey(p)
        if 'year' in j.json.keys():
            return j.get('year')
        elif 'year' in dir(j):
            return j.year
        else:
            print('[ {} ] {}'.format(C.red('YEAR'), p))
            return 0
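# For illustration (hypothetical keys): guessYear('ICSE-2001') returns 2001
# directly, since exactly one dash-separated token is a four-digit number;
# a key like 'ICSE-2001-2010' has two such tokens, so the sleigh lookup
# fallback is used instead.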
def sortbypages(z):
    if 'pages' not in z.json.keys():
        print(C.red('No pages at all in ' + z.getKey()))
        return 0
    p1, _ = z.getPagesTuple()
    y = z.get('year')
    if isinstance(y, str):
        # non-correcting robustness
        return 0
    # a trick to have several volumes within one conference
    v = z.get('volume')
    if isinstance(v, int) or v.isdigit():
        y += int(v)
    return y + p1 / 10000. if p1 else y
def sortbypages(z):
    if 'pages' not in z.json.keys():
        print(C.red('No pages at all in ' + z.getKey()))
        return 0
    p1, _ = z.getPagesTuple()
    y = z.get('year')
    if isinstance(y, str):
        # non-correcting robustness
        return 0
    # a trick to have several volumes within one conference
    v = z.get('volume')
    if isinstance(v, int) or v.isdigit():
        y += int(v)
    return y + p1 / 10000. if p1 else y
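# A worked example of the sort key (hypothetical entry): a 2004 paper
# starting at page 17 sorts as 2004 + 17/10000 = 2004.0017, so entries
# order by year (plus volume, when present) and then by first page.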
def checkon(fn, o):
    if not os.path.exists(fn) or os.path.isdir(fn):
        fn = fn + '.json'
    if 'title' not in o.json.keys():
        if verbose:
            print('No title in', o.getKey())
        return 1  # no title
    # check for a different language - to avoid stemming altogether
    if o.tags and ('german' in o.tags or 'french' in o.tags or 'portuguese' in o.tags):
        if 'stemmed' in o.json.keys():
            # if stemmed before marked foreign, remove this info
            del o.json['stemmed']
            F = open(fn, 'w', encoding='utf-8')
            F.write(o.getJSON())
            F.close()
            return 2
        else:
            return 0
    changed = False
    ### champion variant: snowballstemmer - runs in ~13.5s for 96027 titles
    stemmer = snowballstemmer.stemmer('english').stemWords
    ### disregarded variant: snowballstemmer porter - considered outdated
    # stemmer = snowballstemmer.stemmer('porter').stemWords
    ### disregarded variant: stemming - too slow, runs in ~33s for 96027 titles
    # stemmer = lambda xs: [stemming.porter2.stem(x) for x in xs]
    ### disregarded variant: nltk - worse on verbs ending with -ze
    # stemmer3 = lambda xs: [SnowballStemmer("english").stem(x) for x in xs]
    ### end variants
    stemmed = stemmer(string2words(o.get('title')))
    if '' in stemmed:
        print('“{}” is a title of {} and it has an empty word'.format(
            o.get('title'), C.red(o.getKey())))
        print(string2words(o.get('title')))
        print(stemmer(string2words(o.get('title'))))
    ALLSTEMS.update(stemmed)
    if o.get('stemmed') != stemmed:
        o.json['stemmed'] = stemmed
        changed = True
    if changed:
        F = open(fn, 'w', encoding='utf-8')
        F.write(o.getJSON())
        F.close()
        return 2
    else:
        return 0
def dblpify(s):
    # http://dblp.uni-trier.de/pers/hd/e/Elbaum:Sebastian_G=
    if s in dis.keys():
        return dis[s]
    if s.find(' ') < 0:
        print('[', C.red('NAME'), ']', 'Unconventional full name:', s)
        cx[1] += 1
        return dblpLatin(s) + ':'
    ws = s.split(' ')
    i = -1
    if ws[i] in ('Jr', 'Jr.'):
        i -= 1
    sur = dblpLatin(' '.join(ws[i:]))
    rest = dblpLatin(' '.join(ws[:i])).replace(' ', '_')
    for c in ".'-":
        rest = rest.replace(c, '=')
    return sur + ':' + rest
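# Illustrative only, assuming dblpLatin is the identity on plain ASCII:
# dblpify('Sebastian G. Elbaum') yields 'Elbaum:Sebastian_G=': the surname
# comes first, the remaining names are joined with underscores, and each of
# the characters . ' - is replaced by '=', matching the dblp person-page
# URL scheme referenced in the comment above.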
def checkon(fn, o):
    if not os.path.exists(fn) or os.path.isdir(fn):
        fn = fn + '.json'
    if 'title' not in o.json.keys():
        if verbose:
            print('No title in', o.getKey())
        return 1  # no title
    # check for a different language - to avoid stemming altogether
    if o.tags and ('german' in o.tags or 'french' in o.tags or 'portuguese' in o.tags):
        if 'stemmed' in o.json.keys():
            # if stemmed before marked foreign, remove this info
            del o.json['stemmed']
            F = open(fn, 'w')
            F.write(o.getJSON())
            F.close()
            return 2
        else:
            return 0
    changed = False
    ### champion variant: snowballstemmer - runs in ~13.5s for 96027 titles
    stemmer = snowballstemmer.stemmer('english').stemWords
    ### disregarded variant: snowballstemmer porter - considered outdated
    # stemmer = snowballstemmer.stemmer('porter').stemWords
    ### disregarded variant: stemming - too slow, runs in ~33s for 96027 titles
    # stemmer = lambda xs: [stemming.porter2.stem(x) for x in xs]
    ### disregarded variant: nltk - worse on verbs ending with -ze
    # stemmer3 = lambda xs: [SnowballStemmer("english").stem(x) for x in xs]
    ### end variants
    stemmed = stemmer(string2words(o.get('title')))
    if '' in stemmed:
        print('“{}” is a title of {} and it has an empty word'.format(o.get('title'), C.red(o.getKey())))
        print(string2words(o.get('title')))
        print(stemmer(string2words(o.get('title'))))
    ALLSTEMS.update(stemmed)
    if o.get('stemmed') != stemmed:
        o.json['stemmed'] = stemmed
        changed = True
    if changed:
        F = open(fn, 'w')
        F.write(o.getJSON())
        F.close()
        return 2
    else:
        return 0
def seekByKey(self, key):
    f = None
    # trying a shortcut
    hv = key.split('-')[0]
    for v in self.venues:
        if v.getKey() == hv:
            # print('\tShortcut to', hv)
            f = v.seekByKey(key)
            if f:
                return f
        # else:
        #     print('\t', C.red('...failed'))
    # bruteforce search
    # print('\tBrute force searching for', key)
    for v in self.venues:
        f = v.seekByKey(key)
        if f:
            return f
    print(C.red(key), ' not found in BibSLEIGH!')
    return f
def checkon(fn, o):
    if not os.path.exists(fn) or os.path.isdir(fn):
        fn = fn + '.json'
    f = open(fn, 'r', encoding='utf-8')
    lines = f.readlines()[1:-1]
    f.close()
    flines = json2lines(lines)
    plines = sorted(json2lines(o.getJSON().split('\n')))
    # "url" from DBLP are useless
    if 'url' in o.json.keys():
        o.json['url'] = [link.replace('https://', 'http://')
                         for link in listify(o.json['url'])
                         if not link.startswith('db/conf/')
                         and not link.startswith('db/series/')
                         and not link.startswith('db/books/')
                         and not link.startswith('db/journals/')]
        if not o.json['url']:
            del o.json['url']
        elif len(o.json['url']) == 1:
            o.json['url'] = o.json['url'][0]
    if 'ee' in o.json.keys() and 'doi' not in o.json.keys():
        if isinstance(o.json['ee'], list):
            if verbose:
                print(C.red('Manylink:'), o.json['ee'])
        newee = []
        for onelink in listify(o.json['ee']):
            if onelink.startswith('http://dx.doi.org/'):
                o.json['doi'] = onelink[18:]
            elif onelink.startswith('http://doi.acm.org/'):
                o.json['doi'] = onelink[19:]
            elif onelink.startswith('http://doi.ieeecomputersociety.org/'):
                o.json['doi'] = onelink[35:]
            elif onelink.startswith('http://dl.acm.org/citation.cfm?id='):
                o.json['acmid'] = onelink[34:]
            elif onelink.startswith('http://portal.acm.org/citation.cfm?id='):
                o.json['acmid'] = onelink[38:]
            elif onelink.startswith('http://ieeexplore.ieee.org/xpl/freeabs_all.jsp?arnumber=') \
                    or onelink.startswith('http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber='):
                o.json['ieeearid'] = onelink.split('=')[-1]
            elif onelink.startswith('http://ieeexplore.ieee.org/xpls/abs_all.jsp?isnumber=') \
                    and onelink.find('arnumber') > -1:
                o.json['ieeearid'] = onelink.split('arnumber=')[-1].split('&')[0]
            elif onelink.startswith('http://ieeexplore.ieee.org/xpl/mostRecentIssue.jsp?punumber='):
                o.json['ieeepuid'] = onelink.split('=')[-1]
            elif onelink.startswith('http://ieeexplore.ieee.org/xpl/tocresult.jsp?isnumber='):
                o.json['ieeeisid'] = onelink.split('=')[-1]
            elif onelink.startswith('http://eceasst.cs.tu-berlin.de/index.php/eceasst/article/view/'):
                newee.append('http://journal.ub.tu-berlin.de/eceasst/article/view/'
                             + onelink.split('/')[-1])
            elif onelink.endswith('.pdf') and \
                    (onelink.startswith('http://computer.org/proceedings/')
                     or onelink.startswith('http://csdl.computer.org/')):
                # Bad:  http://computer.org/proceedings/icsm/1189/11890007.pdf
                # Bad:  http://csdl.computer.org/comp/proceedings/date/2003/1870/02/187020040.pdf
                # Good: http://www.computer.org/csdl/proceedings/icsm/2001/1189/00/11890004.pdf
                if onelink.startswith('http://csdl'):
                    cname, _, cid, mid, pid = onelink.split('/')[5:10]
                else:
                    cname, cid, pid = onelink.split('/')[4:7]
                    # heuristic
                    if pid.startswith(cid):
                        mid = pid[len(cid):len(cid) + 2]
                    else:
                        mid = '00'
                newee.append('http://www.computer.org/csdl/proceedings/{}/{}/{}/{}/{}'.format(
                    cname, o.get('year'), cid, mid, pid))
            else:
                if onelink.find('ieee') > -1:
                    print(C.purple('IEEE'), onelink)
                if verbose:
                    print(C.yellow('Missed opportunity:'), onelink)
                # nothing matches => preserve
                newee.append(onelink)
        if len(newee) == 0:
            del o.json['ee']
        elif len(newee) == 1:
            o.json['ee'] = newee[0]
        else:
            o.json['ee'] = newee
    # post-processing normalisation
    if 'acmid' in o.json.keys() and not isinstance(o.json['acmid'], int) \
            and o.json['acmid'].isdigit():
        o.json['acmid'] = int(o.json['acmid'])
    if 'eventuri' in o.json.keys():
        o.json['eventurl'] = o.json['eventuri']
        del o.json['eventuri']
    if 'eventurl' in o.json.keys() and o.json['eventurl'].startswith('https://'):
        o.json['eventurl'] = o.json['eventurl'].replace('https://', 'http://')
    nlines = sorted(json2lines(o.getJSON().split('\n')))
    if flines != plines:
        return 1
    elif plines != nlines:
        f = open(fn, 'w', encoding='utf-8')
        f.write(o.getJSON())
        f.close()
        return 2
    else:
        return 0
    # non-verbose mode by default
    if verbose or r != 0:
        print('[ {} ] {}'.format(statuses[r], fn))
    return r

def two(n):
    if n < 10:
        return '0{}'.format(n)
    else:
        return '{}'.format(n)

if __name__ == "__main__":
    verbose = sys.argv[-1] == '-v'
    peoplez = glob.glob(ienputdir + '/people/*.json')
    print('{}: {} venues, {} papers by {} people\n{}'.format(
        C.purple('BibSLEIGH'),
        C.red(len(sleigh.venues)),
        C.red(sleigh.numOfPapers()),
        C.red(len(peoplez)),
        C.purple('=' * 42)))
    cx = {0: 0, 1: 0, 2: 0}
    # stem ALL the papers!
    for v in sleigh.venues:
        for c in v.getConfs():
            for p in c.papers:
                cx[checkreport(p.filename, p, None)] += 1
        for b in v.getBrands():
            cx[checkreport(b.filename, None, b)] += 1
    # write all stems
    listOfStems = sorted(filter(ifApproved, ALLSTEMS), key=lambda w: two(len(w)) + w)
    f = open(ienputdir + '/stems.json', 'w')
    else:
        return 0

def checkreport(fn, o):
    statuses = (C.blue('PASS'), C.red('FAIL'), C.yellow('FIXD'))
    r = checkon(fn, o)
    # non-verbose mode by default
    if verbose or r != 0:
        print('[ {} ] {}'.format(statuses[r], fn))
    return r

if __name__ == "__main__":
    if len(sys.argv) > 1:
        verbose = sys.argv[1] == '-v'
    print('{}: {} venues, {} papers\n{}'.format(
        C.purple('BibSLEIGH'),
        C.red(len(sleigh.venues)),
        C.red(sleigh.numOfPapers()),
        C.purple('=' * 42)))
    # read the CSV
    f = open('scrap-committees/scraped-by-grammarware.csv', 'r')
    # CBSE;2001;Heinz;Schmidt;;Organising Committee
    for line in f.readlines():
        vs = line.strip().split(';')
        if len(vs) == 0:
            continue
        v = vs[0] + '-' + vs[1]
        n = vs[2] + ' ' + vs[3]
        # normalise!
        if n in renameto.keys():
            print('[', C.yellow('ALIA'), ']', 'Treating', n, 'as', renameto[n])
def checkon(fn, o):
    if not os.path.exists(fn) or os.path.isdir(fn):
        fn = fn + '.json'
    f = open(fn, 'r')
    lines = f.readlines()[1:-1]
    f.close()
    flines = json2lines(lines)
    plines = sorted(json2lines(o.getJSON().split('\n')))
    # bad variants
    for bad in unfoldName:
        for key in wheretolook:
            if o.get(key) == bad:
                o.json[key] = unfoldName[bad]
    # contractions
    for short in short2long:
        for key in wheretolook:
            if o.get(key) == short:
                o.json[key] = short2long[short]
            if o.get(key) == short2long[short]:
                o.json[key + 'short'] = short
    # a heuristic contraction for conference names
    if o.get('type') == 'inproceedings' \
            and 'booktitleshort' not in o.json.keys() \
            and 'booktitle' in o.up().json.keys() \
            and len(o.get('booktitle')) > len(o.up().get('booktitle')):
        o.json['booktitleshort'] = o.up().get('booktitle')
    # a heuristic expansion of conference names
    # if o.get('type') == 'proceedings' \
    #         and 'booktitleshort' not in o.json.keys() \
    #         and 'booktitle' in o.up().json.keys() \
    #         and len(o.get('booktitle')) > len(o.up().get('booktitle')):
    #     o.json['booktitleshort'] = o.up().get('booktitle')
    # remove faulty series: journal wins
    if 'series' in o.json and 'journal' in o.json and o.get('series') == o.get('journal'):
        del o.json['series']
    # *short legacy while no longer version present
    for key in [k for k in o.json.keys() if k.endswith('short') and k[:-5] not in o.json.keys()]:
        del o.json[key]
    # Springer name change
    if o.get('publisher').find('Springer') > -1 and 'year' in o.json.keys():
        if int(o.get('year')) < 2002:
            o.json['publisher'] = 'Springer-Verlag'
            o.json['publishershort'] = 'Springer'
        else:
            o.json['publisher'] = 'Springer International Publishing'
            o.json['publishershort'] = 'Springer'
    for key in wheretolook:
        if key not in o.json:
            continue
        val = o.get(key)
        # ends with a dot
        if val.endswith('.'):
            o.json[key] = o.json[key][:-1]
            continue
        # suspiciousness
        if val.find('.') > -1:
            problem = True
            for ok in ('. Volume', 'CEUR-WS.org', 'icml.cc', 'JMLR.org', 'Vol. ', '. Part',
                       ' Inc. ', 'WG2.8'):
                if val.find(ok) > -1:
                    problem = False
                    break
            if problem:
                report(C.yellow('LOOK'), key + ' of ' + o.getKey() + ' is “' + o.get(key) + '”')
        # superfluousness
        if key + 'short' in o.json.keys() and val == o.get(key + 'short'):
            del o.json[key + 'short']
    nlines = sorted(json2lines(o.getJSON().split('\n')))
    if flines != plines:
        return 1
    elif plines != nlines:
        f = open(fn, 'w')
        f.write(o.getJSON())
        f.close()
        return 2
    else:
        return 0
        else:
            paperPdf = ''
        paperEntry = {'type': 'inproceedings', 'series': 'CEUR Workshop Proceedings',
                      'publisher': 'CEUR-WS.org', 'year': volYear, 'booktitle': volTitles[-1],
                      'editor': volEds, 'volume': volNr.split('-')[-1], 'title': paperTitle,
                      'author': paperAuths, 'pages': paperPages, 'venue': volVenue}
        if paperPdf:
            paperEntry['openpdf'] = paperPdf
        if paperLnk:
            paperEntry['url'] = urlstart + '#' + paperLnk
        paperFilename = lastSlash(outputdir) + '-' + paperAuths[0].split(' ')[-1]
        for a in paperAuths[1:]:
            print(a)
            paperFilename += a.split(' ')[-1][0]
        if paperFilename in done:
            paperFilename += 'a'
        while paperFilename in done:
            paperFilename = paperFilename[:-1] + chr(ord(paperFilename[-1]) + 1)
        # print(jsonify(paperEntry), '-->', outputdir+'/'+paperFilename+'.json')
        f = open(outputdir + '/' + paperFilename + '.json', 'w', encoding='utf-8')
        f.write(jsonify(paperEntry))
        f.close()
        cx += 1
        done.append(paperFilename)
print(C.red(volVenue), '-', C.yellow(volTitles[-1]), '-', C.blue(cx), 'papers.')
def report(fn1, fn2, r):
    statuses = (C.blue(' PASS '), C.red(' FAIL '), C.yellow('RENAME'))
    # non-verbose mode by default
    if verbose or r != 0:
        print('[ {} ] {} → {}'.format(statuses[r], fn1, fn2))
    return r
        return 2
    else:
        return 0

def checkreport(fn, o):
    statuses = (C.blue('PASS'), C.red('FAIL'), C.yellow('FIXD'))
    r = checkon(fn, o)
    # non-verbose mode by default
    if verbose or r != 0:
        print('[ {} ] {}'.format(statuses[r], fn))
    return r

if __name__ == "__main__":
    verbose = sys.argv[-1] == '-v'
    print('{}: {} venues, {} papers\n{}'.format(
        C.purple('BibSLEIGH'),
        C.red(len(sleigh.venues)),
        C.red(sleigh.numOfPapers()),
        C.purple('=' * 42)))
    aka = parseJSON(ienputdir + '/aliases.json')
    CX = sum([len(aka[a]) for a in aka])
    # self-adaptation heuristic:
    # if a manual rule does the same as the other heuristic, it’s dumb
    for a in sorted(aka.keys()):
        if len(aka[a]) == 1 and aka[a][0] in (nodiaLatin(a), simpleLatin(a)):
            print('[ {} ]'.format(C.blue('DUMB')), a, 'aliasing was unnecessary manual work')
        elif len(aka[a]) == 2 and (aka[a] == [nodiaLatin(a), simpleLatin(a)]
                                   or aka[a] == [simpleLatin(a), nodiaLatin(a)]):
            print('[ {} ]'.format(C.blue('DUMB')), a, 'aliasing was a lot of unnecessary manual work')
        elif nodiaLatin(a) in aka[a] or simpleLatin(a) in aka[a]:
            print('[ {} ]'.format(C.blue('DUMB')), a, 'aliasing contains some unnecessary manual work')
import os
from fancy.ANSI import C
from fancy.Templates import wordlistHTML, wordHTML
from lib.AST import Sleigh, escape
from lib.JSON import parseJSON
from lib.NLP import ifApproved
from collections import Counter

ienputdir = '../json'
outputdir = '../frontend'
n2f_name = '_name2file.json'
name2file = parseJSON(n2f_name) if os.path.exists(n2f_name) else {}
sleigh = Sleigh(ienputdir + '/corpus', name2file)

if __name__ == "__main__":
    print('{}: {} venues, {} papers\n{}'.format(
        C.purple('BibSLEIGH'),
        C.red(len(sleigh.venues)),
        C.red(sleigh.numOfPapers()),
        C.purple('=' * 42)))
    stems = sleigh.getStems()
    tagged = []
    for k in stems.keys():
        f = open('{}/word/{}.html'.format(outputdir, k), 'w', encoding='utf-8')
        # papers are displayed in reverse chronological order
        lst = [x.getIItem() for x in
               sorted(stems[k], key=lambda z: -z.json['year'] if 'year' in z.json.keys() else 0)]
        # collect other stems
        # NB: do not use the following code, slows everything down from 1 minute to 161 minutes
        # allstems = []
        # for x in stems[k]:
        #     allstems += x.getBareStems()
def checkon(fn, o):
    if not os.path.exists(fn) or os.path.isdir(fn):
        fn = fn + '.json'
    plines = sorted([strictstrip(s) for s in o.getJSON().split('\n')[1:-1]])
    for k in o.json.keys():
        if 'type' not in o.json.keys():
            print('TERRIBLE', o.getKey())
        if (o.json['type'] == 'proceedings' and k == 'title') or \
                (o.json['type'] == 'inproceedings' and k == 'booktitle'):
            # fix numbers
            for nr in nrs.keys():
                if o.json[k].find(' ' + nr + ' ') > -1:
                    o.json[k] = o.json[k].replace(' ' + nr + ' ', ' ' + nrs[nr] + ' ')
        if isinstance(o.json[k], str):
            # add emdashes for fancier titles
            if k in ('title', 'booktitle'):
                o.json[k] = o.json[k].replace(' - ', ' — ').replace(' -- ', ' — ')
            # Nice heuristic to run from time to time, but reports too much
            # on stuff like “eXtreme” and “jPET”
            # if o.json[k][0].islower():
            #     print('[ {} ] {}: {} {}'.format(C.red('LOOK'), o.getKey(), 'title is', o.get('title')))
            # normalised pages
            if k == 'pages':
                o.json[k] = o.json[k].replace('–', '-').replace('--', '-').replace('−', '-')
            # double spaces
            if o.json[k].find('  ') > -1:
                o.json[k] = o.json[k].replace('  ', ' ').strip()
            # find numeric values, turn them into proper integers
            if o.json[k].isdigit():
                o.json[k] = int(o.json[k])
                continue
            # remove confix curlies
            elif o.json[k].startswith('{') and o.json[k].endswith('}'):
                o.json[k] = o.json[k][1:-1]
            # single quotes to double quotes
            elif o.json[k].find(" '") > -1 and o.json[k].find("' ") > -1:
                o.json[k] = o.json[k].replace(" '", ' "').replace("' ", '" ')
            elif o.json[k].find(" '") > -1 and o.json[k].endswith("'"):
                o.json[k] = o.json[k].replace(" '", ' "').replace("'", '"')
            elif o.json[k].find("' ") > -1 and o.json[k].startswith("'"):
                o.json[k] = o.json[k].replace("' ", '" ').replace("'", '"')
            # fancify bland quotes
            elif o.json[k].find(' "') > -1 and o.json[k].find('" ') > -1:
                o.json[k] = o.json[k].replace(' "', ' “').replace('" ', '” ')
            elif o.json[k].find(' "') > -1 and o.json[k].endswith('"'):
                o.json[k] = o.json[k].replace(' "', ' “').replace('"', '”')
            elif o.json[k].find('" ') > -1 and o.json[k].startswith('"'):
                o.json[k] = o.json[k].replace('" ', '” ').replace('"', '“')
            # fancify LaTeX quotes
            elif o.json[k].find(' ``') > -1 and o.json[k].find("'' ") > -1:
                o.json[k] = o.json[k].replace("'' ", '” ').replace(' ``', ' “')
            elif o.json[k].find(' ``') > -1 and o.json[k].endswith("''"):
                o.json[k] = o.json[k].replace("''", '”').replace(' ``', ' “')
            elif o.json[k].find("'' ") > -1 and o.json[k].startswith('``'):
                o.json[k] = o.json[k].replace("'' ", '” ').replace('``', '“')
            elif o.json[k].startswith('``') and o.json[k].endswith("''"):
                o.json[k] = '“' + o.json[k][2:-2] + '”'
            # plural possessive
            elif o.json[k].find("'s") > -1:
                o.json[k] = o.json[k].replace("'s", '’s')
            elif o.json[k].find("s' ") > -1:
                o.json[k] = o.json[k].replace("s'", 's’')
            # contractions
            elif o.json[k].find("n't") > -1:
                o.json[k] = o.json[k].replace("n't", 'n’t')
            # the case of "Jr" vs "Jr."
            if k in ('author', 'editor') and o.json[k].endswith('Jr'):
                o.json[k] += '.'
            # TODO: report remaining suspicious activity
            for c in '`"\'':
                # ’ is ok
                if c in o.json[k] and k not in ('author', 'editor'):
                    print('[ {} ] {}: {} is “{}”'.format(
                        C.red('LOOK'), o.getKey(), k, o.json[k]))
                    lookat.append(o.filename)
        elif isinstance(o.json[k], list):
            # inline trivial lists
            if len(o.json[k]) == 1:
                o.json[k] = o.json[k][0]
            # inline hidden trivial lists
            if len(o.json[k]) == 2 and o.json[k][0] == o.json[k][1] \
                    and k not in ('stemmed', 'tag', 'tagged'):
                o.json[k] = o.json[k][0]
            # unless it’s 'tagged'
            if k == 'tagged' and not isinstance(o.json[k][0], list):
                o.json[k] = [o.json[k]]
            # remove DBLP disambiguation: we might later regret it
            # but the information can be always re-retrieved
            if k in ('author', 'editor'):
                nas = []
                for a in o.json[k]:
                    # double spaces
                    if a.find('  ') > -1:
                        a = a.replace('  ', ' ').strip()
                    ws = a.split(' ')
                    if ws[-1].isdigit():
                        ws = ws[:-1]
                    nas.append(' '.join(ws))
                o.json[k] = nas
                # the case of "Jr" vs "Jr."
                o.json[k] = [a + '.' if a.endswith(' Jr') else a for a in o.json[k]]
    nlines = sorted([strictstrip(s) for s in o.getJSON().split('\n')[1:-1]])
    if plines != nlines:
        f = open(fn, 'w', encoding='utf-8')
        f.write(o.getJSON())
        f.close()
        return 2
    else:
        return 0
            y = v
            v = 'http://' + v
        else:
            y = v.replace('http://', '').replace('https://', '')
        r = '<a href="{0}">{1}</a>'.format(v, y)
    elif k == 'aka':
        ico = ''
        r = '<br/>'.join(['a.k.a.: “{}”'.format(z) for z in listify(v)])
    else:
        ico = ''
        r = '?{}?{}?'.format(k, v)
    return ico + ' ' + r + '<br/>'

if __name__ == "__main__":
    print('{}: {} venues, {} papers\n{}'.format(
        C.purple('BibSLEIGH'),
        C.red(len(sleigh.venues)),
        C.red(sleigh.numOfPapers()),
        C.purple('=' * 42)))
    ts = sleigh.getTags()
    tagged = []
    for key in ts.keys():
        f = open('{}/tag/{}.html'.format(outputdir, key), 'w')
        # papers are displayed in reverse chronological order
        lst = [x.getRestrictedItem(key) for x in
               sorted(ts[key], key=lambda z: -z.json['year'] if 'year' in z.json.keys() else 0)]
        # no comprehension possible for this case
        for x in ts[key]:
            if x not in tagged:
                tagged.append(x)
        # read tag definition
CONFZ = {
    'FOSE': 'FoSE',
    'MODELS': 'MoDELS'
}
BLANK = ' '
lines = []
cur = ''
for fn in sys.argv[1:-1]:
    if cur != fn.split('-')[0]:
        if cur != '':
            print()
        name = fn.split('-')[0].upper()
        if name in CONFZ:
            name = CONFZ[name]
        print('[{}]'.format(C.green(name)), end=': ')
        cur = fn.split('-')[0]
    print("'{}".format(fn.split('-')[-1][-6:-4]), end=' ')
    f = open(fn, 'r', encoding='utf-8')
    lines += [(fn, line[:10], line[10:].strip()) for line in f.readlines()
              if line.strip()
              and line[:10] != '          '
              and not line.startswith('##########')]
    f.close()
print()
succ = fail = 0
males = set(line.strip() for line in
            open('../naming/male.txt', 'r', encoding='utf-8').readlines())
femes = set(line.strip() for line in open(
ienputdir = '../json'
verbose = False

def report(fn1, fn2, r):
    statuses = (C.blue(' PASS '), C.red(' FAIL '), C.yellow('RENAME'))
    # non-verbose mode by default
    if verbose or r != 0:
        print('[ {} ] {} → {}'.format(statuses[r], fn1, fn2))
    return r

if __name__ == "__main__":
    print('{} conference renamer\n{}'.format(
        C.purple('BibSLEIGH'),
        C.purple('=' * 42)))
    if len(sys.argv) < 2:
        print('Usage:\n\t{} [<DIR>]'.format(sys.argv[0]))
        sys.exit(1)
    verbose = sys.argv[-1] == '-v'
    if sys.argv[1].startswith(ienputdir):
        path = sys.argv[1]
        name = path.replace(ienputdir + '/corpus/', '')
        namem = lastSlash(name)
    else:
        name = sys.argv[1]
        path = ienputdir + '/corpus/' + name
        namem = lastSlash(name)
    cx = {0: 0, 1: 0, 2: 0}
    if not os.path.exists(path):
def report(s, r):
    statuses = (C.blue('PASS'), C.red('FAIL'), C.yellow('FIXD'))
    # non-verbose mode by default
    if verbose or r != 0:
        print('[ {} ] {}'.format(statuses[r], simpleLatin(s)))
    return r
    statuses = (C.blue('PASS'), C.red('FAIL'), C.yellow('FIXD'))
    # non-verbose mode by default
    if verbose or r != 0:
        print('[ {} ] {}'.format(statuses[r], simpleLatin(s)))
    return r

if __name__ == "__main__":
    verbose = sys.argv[-1] == '-v'
    # All known contributors
    cx = {0: 0, 1: 0, 2: 0}
    people = {}
    for fn in glob.glob(ienputdir + '/people/*.json'):
        p = parseJSON(fn)
        if p['name'] in people.keys():
            cx[report(C.red('duplicate') + ' ' + C.yellow(p), 1)] += 1
            continue
        people[p['name']] = p
    print('{}: {} venues, {} papers written by {} people\n{}'.format(
        C.purple('BibSLEIGH'),
        C.red(len(sleigh.venues)),
        C.red(sleigh.numOfPapers()),
        C.red(len(people)),
        C.purple('=' * 42)))
    # traverse ALL the papers!
    for v in sleigh.venues:
        for c in v.getConfs():
            for p in c.papers:
                if 'author' in p.json.keys():
                    for a in listify(p.json['author']):
                        if a in people.keys():
    else:
        return 0

def checkreport(fn, o):
    statuses = (C.blue('PASS'), C.red('FAIL'), C.yellow('FIXD'))
    r = checkon(fn, o)
    # non-verbose mode by default
    if verbose or r != 0:
        print('[ {} ] {}'.format(statuses[r], fn))
    return r

if __name__ == "__main__":
    if len(sys.argv) < 2:
        print(C.purple('BibSLEIGH'), sys.argv[0], 'requires a limit to work.')
        sys.exit(1)
    verbose = sys.argv[-1] == '-v'
    d2r = sys.argv[1]
    print('{}: {} venues, {} papers\n{}'.format(
        C.purple('BibSLEIGH'),
        C.red(len(sleigh.venues)),
        C.red(sleigh.numOfPapers()),
        C.purple('=' * 42)))
    cx = {0: 0, 1: 0, 2: 0}
    for v in sleigh.venues:
        for c in v.getConfs():
            cx[checkreport(c.filename, c)] += 1
            for p in c.papers:
                cx[checkreport(p.filename, p)] += 1
    print('{} files checked, {} ok, {} fixed, {} failed'.format(
def checkreport(fn, o):
    statuses = (C.blue('PASS'), C.red('FAIL'), C.yellow('FIXD'))
    r = checkon(fn, o)
    # non-verbose mode by default
    if verbose or r != 0:
        print('[ {} ] {}'.format(statuses[r], fn))
    return r

if __name__ == "__main__":
    if len(sys.argv) > 1:
        verbose = sys.argv[1] == '-v'
    print('{}: {} venues, {} papers\n{}'.format(
        C.purple('BibSLEIGH'),
        C.red(len(sleigh.venues)),
        C.red(sleigh.numOfPapers()),
        C.purple('=' * 42)))
    # read the CSV
    f = open('scrap-committees/scraped-by-grammarware.csv', 'r', encoding='utf-8')
    # CBSE;2001;Heinz;Schmidt;;Organising Committee
    for line in f.readlines():
        vs = line.strip().split(';')
        if len(vs) == 0:
            continue
        v = vs[0] + '-' + vs[1]
        n = vs[2] + ' ' + vs[3]
        # normalise!
def checkon(fn, o):
    if 'dblpkey' not in o.json.keys():
        print('[ {} ] {}'.format(C.red('DONT'), 'DBLP key not found on the entry'))
        return 1
    mykey = o.get('dblpkey')
    # for the rare case of multiple dblpkeys
    # (can happen as a DBLP error or when same proceedings span over multiple volumes)
    if isinstance(mykey, list):
        mykey = mykey[0]
    if mykey not in procs.keys():
        print('[ {} ] {}'.format(C.red('DONT'), 'DBLP key not found in the dump'))
        return 1
    title = procs[mykey]
    if title.endswith('.'):
        title = title[:-1]
    ws = title.replace(' - ', ', ').replace(' (', ', ').split(', ')
    country = findOneIn(knownCountries, ws)
    state = findOneIn(usaStateNames, ws)
    found = False
    if country:
        town = ws[ws.index(country) - 1]
        state = '?'
        # what if "town" is an USA state? (full)
        if country == 'USA' and town in usaStateNames:
            state = town
            town = ws[ws.index(town) - 1]
        # what if "town" is an USA state? (abbreviated)
        if country == 'USA' and town in usaStateAB:
            state = usaStateNames[usaStateAB.index(town)]
            town = ws[ws.index(town) - 1]
        # what if "town" is a Canadian state? (full)
        if country == 'Canada' and town in canStateNames:
            state = town
            town = ws[ws.index(town) - 1]
        # what if "town" is a Canadian state? (abbreviated)
        if country == 'Canada' and town in canStateAB:
            state = canStateNames[canStateAB.index(town)]
            town = ws[ws.index(town) - 1]
        # the same can happen in the UK
        if country in ('UK', 'United Kingdom') and town in ('Scotland', 'Scottland'):
            state = town
            town = ws[ws.index(town) - 1]
        # Georgia the country vs Georgia the state
        if country == 'Georgia' and town == 'Atlanta':
            state = country
            country = 'USA'
        # near Something
        if town.startswith('near '):
            town = ws[ws.index(town) - 1]
        # Luxembourg, Luxembourg
        if country == 'Luxembourg':
            town = 'Luxembourg'
        # Saint-Malo / St. Malo
        if country == 'France' and town == 'St. Malo':
            town = 'Saint-Malo'
        # Florence / Firenze
        if country == 'Italy' and town.find('Firenze') > -1:
            town = 'Florence'
        found = True
    elif state:
        country = 'USA'
        town = ws[ws.index(state) - 1]
        found = True
    else:
        # desperate times
        for sol in desperateSolutions.keys():
            if sol in ws:
                town, state, country = desperateSolutions[sol]
                found = True
    # normalise
    if country in countryMap.keys():
        country = countryMap[country]
    if country == 'United Kingdom' and state == '?':
        if town.endswith('London') or town in ('Birmingham', 'York',
                'Coventry', 'Nottingham', 'Lancaster', 'Oxford', 'Manchester',
                'Southampton', 'Norwich', 'Leicester', 'Canterbury'):
            state = 'England'
        elif town in ('Edinburgh', 'Glasgow'):
            state = 'Scotland'
    # report
    if 'address' in o.json.keys():
        print('[ {} ] {}'.format(C.blue('OLDA'), o.get('address')))
    if 'location' in o.json.keys():
        print('[ {} ] {}'.format(C.blue('OLDL'), o.get('location')))
    if found:
        # print('[ {} ] {}'.format(C.blue('KNOW'), country))
        print('[ {} ] {}'.format(C.blue('AD||'), title))
        print('[ {} ] {:30} || {:30} || {:20}'.format(
            C.blue('AD->'), C.yellow(town), C.yellow(state), C.yellow(country)))
        # TODO: perhaps later we can act more aggressively
        newaddr = [town, '' if state == '?' else state, country]
        if 'address' not in o.json.keys() or newaddr != o.json['address']:
            o.json['address'] = newaddr
            f = open(o.json['FILE'], 'w', encoding='utf-8')
            f.write(o.getJSON())
            f.close()
            return 2
        # nothing changed
        return 0
    print('[ {} ] {}'.format(C.yellow('AD??'), title))
    return 1
def checkon(fn, o):
    if 'dblpkey' not in o.json.keys():
        print('[ {} ] {}'.format(C.red('DONT'), 'DBLP key not found on the entry'))
        return 1
    mykey = o.get('dblpkey')
    # for the rare case of multiple dblpkeys
    # (can happen as a DBLP error or when same proceedings span over multiple volumes)
    if isinstance(mykey, list):
        mykey = mykey[0]
    if mykey not in procs.keys():
        print('[ {} ] {}'.format(C.red('DONT'), 'DBLP key not found in the dump'))
        return 1
    title = procs[mykey]
    if title.endswith('.'):
        title = title[:-1]
    ws = title.replace(' - ', ', ').replace(' (', ', ').split(', ')
    country = findOneIn(knownCountries, ws)
    state = findOneIn(usaStateNames, ws)
    found = False
    if country:
        town = ws[ws.index(country) - 1]
        state = '?'
        # what if "town" is an USA state? (full)
        if country == 'USA' and town in usaStateNames:
            state = town
            town = ws[ws.index(town) - 1]
        # what if "town" is an USA state? (abbreviated)
        if country == 'USA' and town in usaStateAB:
            state = usaStateNames[usaStateAB.index(town)]
            town = ws[ws.index(town) - 1]
        # what if "town" is a Canadian state? (full)
        if country == 'Canada' and town in canStateNames:
            state = town
            town = ws[ws.index(town) - 1]
        # what if "town" is a Canadian state? (abbreviated)
        if country == 'Canada' and town in canStateAB:
            state = canStateNames[canStateAB.index(town)]
            town = ws[ws.index(town) - 1]
        # the same can happen in the UK
        if country in ('UK', 'United Kingdom') and town in ('Scotland', 'Scottland'):
            state = town
            town = ws[ws.index(town) - 1]
        # Georgia the country vs Georgia the state
        if country == 'Georgia' and town == 'Atlanta':
            state = country
            country = 'USA'
        # near Something
        if town.startswith('near '):
            town = ws[ws.index(town) - 1]
        # Luxembourg, Luxembourg
        if country == 'Luxembourg':
            town = 'Luxembourg'
        # Saint-Malo / St. Malo
        if country == 'France' and town == 'St. Malo':
            town = 'Saint-Malo'
        # Florence / Firenze
        if country == 'Italy' and town.find('Firenze') > -1:
            town = 'Florence'
        found = True
    elif state:
        country = 'USA'
        town = ws[ws.index(state) - 1]
        found = True
    else:
        # desperate times
        for sol in desperateSolutions.keys():
            if sol in ws:
                town, state, country = desperateSolutions[sol]
                found = True
    # normalise
    if country in countryMap.keys():
        country = countryMap[country]
    if country == 'United Kingdom' and state == '?':
        if town.endswith('London') or town in ('Birmingham', 'York',
                'Coventry', 'Nottingham', 'Lancaster', 'Oxford', 'Manchester',
                'Southampton', 'Norwich', 'Leicester', 'Canterbury'):
            state = 'England'
        elif town in ('Edinburgh', 'Glasgow'):
            state = 'Scotland'
    # report
    if 'address' in o.json.keys():
        print('[ {} ] {}'.format(C.blue('OLDA'), o.get('address')))
    if 'location' in o.json.keys():
        print('[ {} ] {}'.format(C.blue('OLDL'), o.get('location')))
    if found:
        # print('[ {} ] {}'.format(C.blue('KNOW'), country))
        print('[ {} ] {}'.format(C.blue('AD||'), title))
        print('[ {} ] {:30} || {:30} || {:20}'.format(
            C.blue('AD->'), C.yellow(town), C.yellow(state), C.yellow(country)))
        # TODO: perhaps later we can act more aggressively
        newaddr = [town, '' if state == '?' else state, country]
        if 'address' not in o.json.keys() or newaddr != o.json['address']:
            o.json['address'] = newaddr
            f = open(o.json['FILE'], 'w')
            f.write(o.getJSON())
            f.close()
            return 2
        # nothing changed
        return 0
    print('[ {} ] {}'.format(C.yellow('AD??'), title))
    return 1
    if n in name2file:
        return '<a href="{}">{}</a>'.format(name2file[n], shorten(n))
    else:
        return n

def pad(n):
    X = str(n)
    while len(X) < 4:
        X = '0' + X
    return X

if __name__ == "__main__":
    print('{}: {} venues, {} papers\n{}'.format(
        C.purple('BibSLEIGH'),
        C.red(len(sleigh.venues)),
        C.red(sleigh.numOfPapers()),
        C.purple('=' * 42)))
    ps = []
    # flatten the sleigh
    bykey = {}
    for v in sleigh.venues:
        bykey[v.getKey()] = v
        for c in v.getConfs():
            bykey[c.getKey()] = c
            for p in c.papers:
                bykey[p.getKey()] = p
    print(C.purple('BibSLEIGH flattened to {} entries'.format(len(bykey))))
    # tagged = []
    # for k in ts.keys():
import sys, os.path, glob
from fancy.ANSI import C

ienputdir = '../json'
verbose = False

def report(fn1, fn2, r):
    statuses = (C.blue(' PASS '), C.red(' FAIL '), C.yellow('RENAME'))
    # non-verbose mode by default
    if verbose or r != 0:
        print('[ {} ] {} → {}'.format(statuses[r], fn1, fn2))
    return r

if __name__ == "__main__":
    print('{} conference renamer\n{}'.format(
        C.purple('BibSLEIGH'),
        C.purple('=' * 42)))
    if len(sys.argv) < 2:
        print('Usage:\n\t{} [<DIR>]'.format(sys.argv[0]))
        sys.exit(1)
    verbose = sys.argv[-1] == '-v'
    if sys.argv[1].startswith(ienputdir):
        path = sys.argv[1]
        name = path.replace(ienputdir + '/corpus/', '')
        namem = name.split('/')[-1]
    else:
        name = sys.argv[1]
        path = ienputdir + '/corpus/' + name
        namem = name.split('/')[-1]
    cx = {0: 0, 1: 0, 2: 0}
    if not os.path.exists(path):
def checkon(fn, o):
    if not os.path.exists(fn) or os.path.isdir(fn):
        fn = fn + '.json'
    f = open(fn, 'r', encoding='utf-8')
    lines = f.readlines()[1:-1]
    f.close()
    flines = json2lines(lines)
    plines = sorted(json2lines(o.getJSON().split('\n')))
    # bad variants
    for bad in unfoldName:
        for key in wheretolook:
            if o.get(key) == bad:
                o.json[key] = unfoldName[bad]
    # contractions
    for short in short2long:
        for key in wheretolook:
            if o.get(key) == short:
                o.json[key] = short2long[short]
            if o.get(key) == short2long[short]:
                o.json[key + 'short'] = short
    # a heuristic contraction for conference names
    if o.get('type') == 'inproceedings' \
            and 'booktitleshort' not in o.json.keys() \
            and 'booktitle' in o.up().json.keys() \
            and len(o.get('booktitle')) > len(o.up().get('booktitle')):
        o.json['booktitleshort'] = o.up().get('booktitle')
    # a heuristic expansion of conference names
    # if o.get('type') == 'proceedings' \
    #         and 'booktitleshort' not in o.json.keys() \
    #         and 'booktitle' in o.up().json.keys() \
    #         and len(o.get('booktitle')) > len(o.up().get('booktitle')):
    #     o.json['booktitleshort'] = o.up().get('booktitle')
    # remove faulty series: journal wins
    if 'series' in o.json and 'journal' in o.json and o.get('series') == o.get('journal'):
        del o.json['series']
    # *short legacy while no longer version present
    for key in [k for k in o.json.keys() if k.endswith('short') and k[:-5] not in o.json.keys()]:
        del o.json[key]
    # Springer name change
    if o.get('publisher').find('Springer') > -1 and 'year' in o.json.keys():
        if int(o.get('year')) < 2002:
            o.json['publisher'] = 'Springer-Verlag'
            o.json['publishershort'] = 'Springer'
        else:
            o.json['publisher'] = 'Springer International Publishing'
            o.json['publishershort'] = 'Springer'
    for key in wheretolook:
        if key not in o.json:
            continue
        val = o.get(key)
        # ends with a dot
        if val.endswith('.'):
            o.json[key] = o.json[key][:-1]
            continue
        # suspiciousness
        if val.find('.') > -1:
            problem = True
            for ok in ('. Volume', 'CEUR-WS.org', 'icml.cc', 'JMLR.org', 'Vol. ', '. Part',
                       ' Inc. ', 'WG2.8'):
                if val.find(ok) > -1:
                    problem = False
                    break
            if problem:
                report(C.yellow('LOOK'), key + ' of ' + o.getKey() + ' is “' + o.get(key) + '”')
        # superfluousness
        if key + 'short' in o.json.keys() and val == o.get(key + 'short'):
            del o.json[key + 'short']
    nlines = sorted(json2lines(o.getJSON().split('\n')))
    if flines != plines:
        return 1
    elif plines != nlines:
        f = open(fn, 'w', encoding='utf-8')
        f.write(o.getJSON())
        f.close()
        return 2
    else:
        return 0
            paperAuths = paperAuths[:-1]
        paperAuths.extend(auths)
        paperLnk = li.get('id')
        hope = li.find_all('a')
        if hope and hope[0].get('href').endswith('.pdf'):
            paperPdf = urlstart + hope[0].get('href')
        else:
            paperPdf = ''
        paperEntry = {'type': 'inproceedings', 'series': 'CEUR Workshop Proceedings',
                      'publisher': 'CEUR-WS.org', 'year': volYear, 'booktitle': volTitles[-1],
                      'editor': volEds, 'volume': volNr.split('-')[-1], 'title': paperTitle,
                      'author': paperAuths, 'pages': paperPages, 'venue': volVenue}
        if paperPdf:
            paperEntry['openpdf'] = paperPdf
        if paperLnk:
            paperEntry['url'] = urlstart + '#' + paperLnk
        paperFilename = outputdir.split('/')[-1] + '-' + paperAuths[0].split(' ')[-1]
        for a in paperAuths[1:]:
            paperFilename += a.split(' ')[-1][0]
        if paperFilename in done:
            paperFilename += 'a'
        while paperFilename in done:
            paperFilename = paperFilename[:-1] + chr(ord(paperFilename[-1]) + 1)
        # print(jsonify(paperEntry), '-->', outputdir+'/'+paperFilename+'.json')
        f = open(outputdir + '/' + paperFilename + '.json', 'w')
        f.write(jsonify(paperEntry))
        f.close()
        cx += 1
        done.append(paperFilename)
print(C.red(volVenue), '-', C.yellow(volTitles[-1]), '-', C.blue(cx), 'papers.')
        return 2
    else:
        return 0

def checkreport(fn, o):
    statuses = (C.blue('PASS'), C.red('FAIL'), C.yellow('FIXD'))
    r = checkon(fn, o)
    # non-verbose mode by default
    if verbose or r != 0:
        print('[ {} ] {}'.format(statuses[r], fn))
    return r

if __name__ == "__main__":
    verbose = sys.argv[-1] == '-v'
    print('{}: {} venues, {} papers\n{}'.format(
        C.purple('BibSLEIGH'),
        C.green(len(sleigh.venues)),
        C.green(sleigh.numOfPapers()),
        C.purple('=' * 42)))
    aka = parseJSON(ienputdir + '/aliases.json')
    CX = sum([len(aka[a]) for a in aka])
    # self-adaptation heuristic:
    # if a manual rule does the same as the other heuristic, it’s dumb
    for a in sorted(aka.keys()):
        if len(aka[a]) == 1 and aka[a][0] in (nodiaLatin(a), simpleLatin(a)):
            print('[ {} ]'.format(C.blue('DUMB')), simpleLatin(a),
                  'aliasing was unnecessary manual work')
        elif len(aka[a]) == 2 and (aka[a] == [nodiaLatin(a), simpleLatin(a)]
                                   or aka[a] == [simpleLatin(a), nodiaLatin(a)]):
            print('[ {} ]'.format(C.blue('DUMB')), simpleLatin(a),
                  'aliasing was a lot of unnecessary manual work')
        elif nodiaLatin(a) in aka[a] or simpleLatin(a) in aka[a]:
            print('[ {} ]'.format(C.blue('DUMB')), simpleLatin(a),
                  'aliasing contains some unnecessary manual work')
def report(one, two):
    print('[ {} ] {}'.format(one, two))

def checkreport(fn, o):
    statuses = (C.blue('PASS'), C.red('FAIL'), C.yellow('FIXD'))
    r = checkon(fn, o)
    # non-verbose mode by default
    if verbose or r != 0:
        report(statuses[r], fn)
    return r

if __name__ == "__main__":
    if len(sys.argv) > 1:
        verbose = sys.argv[1] == '-v'
    print('{}: {} venues, {} papers\n{}'.format(
        C.purple('BibSLEIGH'),
        C.red(len(sleigh.venues)),
        C.red(sleigh.numOfPapers()),
        C.purple('=' * 42)))
    cx = {0: 0, 1: 0, 2: 0}
    for v in sleigh.venues:
        for c in v.getConfs():
            cx[checkreport(c.filename, c)] += 1
            for p in c.papers:
                cx[checkreport(p.filename, p)] += 1
    print('{} files checked, {} ok, {} fixed, {} failed'.format(
        C.bold(cx[0] + cx[1] + cx[2]),
        C.blue(cx[0]),
        C.yellow(cx[2]),
        C.red(cx[1])))
        cx[1] += 1
        return dblpLatin(s) + ':'
    ws = s.split(' ')
    i = -1
    if ws[i] in ('Jr', 'Jr.'):
        i -= 1
    sur = dblpLatin(' '.join(ws[i:]))
    rest = dblpLatin(' '.join(ws[:i])).replace(' ', '_')
    for c in ".'-":
        rest = rest.replace(c, '=')
    return sur + ':' + rest

if __name__ == "__main__":
    verbose = sys.argv[-1] == '-v'
    if not os.path.exists('_renameto.json'):
        print('Run', C.blue('refine-aliases.py'),
              'to build the aliasing/renaming relation and cache it.')
        sys.exit(1)
    # aka = parseJSON(ienputdir + '/aliases.json')
    dis = parseJSON(ienputdir + '/disambig.json')
    renameto = parseJSON('_renameto.json')
    # Data from the conferenceMetrics repo
    csv = []
    f = open('../conferenceMetrics/data/SE-conf-roles.csv', 'r')
    for line in f.readlines():
        # Conference;Year;First Name;Last Name;Sex;Role
        csv.append(line.strip().split(';'))
    f.close()
    f = open('scrap-committees/scraped-by-grammarware.csv', 'r')
    for line in f.readlines():
        csv.append(line.strip().split(';'))
    f.close()
        pcx += len(plst)
        ptxt = '<dl class="toc">' + '\n'.join([p.getItem() for p in plst]) + '</dl>'
    elif isinstance(evals, list) and isinstance(evals[0], str):
        plst = sorted(matchfromsleigh(sleigh, evals), key=sortbypages)
        pcx += len(plst)
        ptxt = '<dl class="toc">' + '\n'.join([p.getItem() for p in plst]) + '</dl>'
    elif isinstance(evals, list) and isinstance(evals[0], dict):
        ptxt = processSortedRel(evals)
    else:
        print(C.red('ERROR:'), 'unrecognised bundle structure', evals)
    acc.append('<dl><dt>{}{}</dt><dd>{}</dl>'.format(img, ename, ptxt))
    return '\n'.join(acc)

if __name__ == "__main__":
    print('{}: {} venues, {} papers\n{}'.format(
        C.purple('BibSLEIGH'),
        C.red(len(sleigh.venues)),
        C.red(sleigh.numOfPapers()),
        C.purple('=' * 42)))
    bundles = {}
    for b in glob.glob(ienputdir + '/bundles/*.json'):
        purename = b.split('/')[-1][:-5]
        bun = json.load(open(b, 'r'))
        prevcx = pcx
        uberlist = '<h2>{1} papers</h2>{0}'.format(
            processSortedRel(bun['contents']), pcx - prevcx)
        f = open(outputdir + '/bundle/' + purename + '.html', 'w')
        f.write(bunHTML.format(
            title=purename + ' bundle',
            bundle=bun['name'],
            ebundle=escape(purename),
            dl=uberlist.replace('href="', 'href="../').replace('../mailto', 'mailto')))
    statuses = (C.blue('PASS'), C.red('FAIL'), C.yellow('FIXD'))
    if isinstance(o, int):
        r = o
    else:
        r = checkon(fn, o)
    # non-verbose mode by default
    if verbose or r != 0:
        print('[ {} ] {}'.format(statuses[r], fn))
    return r

if __name__ == "__main__":
    verbose = sys.argv[-1] == '-v'
    peoplez = glob.glob(ienputdir + '/people/*.json')
    print('{}: {} venues, {} papers by {} people\n{}'.format(
        C.purple('BibSLEIGH'),
        C.red(len(sleigh.venues)),
        C.red(sleigh.numOfPapers()),
        C.red(len(peoplez)),
        C.purple('=' * 42)))
    cx = {0: 0, 1: 0, 2: 0}
    for v in sleigh.venues:
        for c in v.getConfs():
            for p in c.papers:
                cx[checkreport(p.filename, p)] += 1
    print('{} files checked, {} ok, {} fixed, {} failed'.format(
        C.bold(cx[0] + cx[1] + cx[2]),
        C.blue(cx[0]),
        C.yellow(cx[2]),
        C.red(cx[1])))
    print(C.red('{} files to check manually!'.format(len(warnings))))
    return 0

def checkreport(fn, o):
    statuses = (C.blue('PASS'), C.red('FAIL'), C.yellow('FIXD'))
    r = checkon(fn, o)
    # non-verbose mode by default
    if verbose or r != 0:
        print('[ {} ] {}'.format(statuses[r], fn))
    return r

if __name__ == "__main__":
    verbose = sys.argv[-1] == '-v'
    print('{}: {} venues, {} papers\n{}'.format(
        C.purple('BibSLEIGH'),
        C.red(len(sleigh.venues)),
        C.red(sleigh.numOfPapers()),
        C.purple('=' * 42)))
    cx = {0: 0, 1: 0, 2: 0}
    for v in sleigh.venues:
        # tags per venue
        for c in v.getConfs():
            cx[checkreport(c.filename, c)] += 1
        for b in v.brands:
            cx[checkreport(b.filename, b)] += 1
        cx[checkreport(v.filename, v)] += 1
    print('{} files checked, {} ok, {} fixed, {} failed'.format(
        C.bold(cx[0] + cx[1] + cx[2]),
        C.blue(cx[0]),
        C.yellow(cx[2]),
def linkto(n):
    if n in name2file:
        return '<a href="{}">{}</a>'.format(name2file[n], shorten(n))
    else:
        return n

def pad(n):
    X = str(n)
    while len(X) < 4:
        X = '0' + X
    return X

if __name__ == "__main__":
    print('{}: {} venues, {} papers\n{}'.format(
        C.purple('BibSLEIGH'),
        C.red(len(sleigh.venues)),
        C.red(sleigh.numOfPapers()),
        C.purple('=' * 42)))
    ps = []
    # flatten the sleigh
    bykey = {}
    for v in sleigh.venues:
        bykey[v.getKey()] = v
        for c in v.getConfs():
            bykey[c.getKey()] = c
            for p in c.papers:
                bykey[p.getKey()] = p
    print(C.purple('BibSLEIGH flattened to {} entries'.format(len(bykey))))
    # tagged = []
    # for k in ts.keys():
        print('[ {} ] {}'.format(statuses[r], fn))
    return r

def two(n):
    if n < 10:
        return '0{}'.format(n)
    else:
        return '{}'.format(n)

if __name__ == "__main__":
    verbose = sys.argv[-1] == '-v'
    peoplez = glob.glob(ienputdir + '/people/*.json')
    print('{}: {} venues, {} papers by {} people\n{}'.format(
        C.purple('BibSLEIGH'),
        C.red(len(sleigh.venues)),
        C.red(sleigh.numOfPapers()),
        C.red(len(peoplez)),
        C.purple('=' * 42)))
    cx = {0: 0, 1: 0, 2: 0}
    # stem ALL the papers!
    for v in sleigh.venues:
        for c in v.getConfs():
            for p in c.papers:
                cx[checkreport(p.filename, p, None)] += 1
        for b in v.getBrands():
            cx[checkreport(b.filename, None, b)] += 1
    # write all stems
    listOfStems = sorted(filter(ifApproved, ALLSTEMS), key=lambda w: two(len(w)) + w)
def sdistance(x1, x2):
    return str(distance(x1, x2)).replace('.', ',')

def distance(x1, x2):
    return sqrt(sum([(x1[jj] - x2[jj]) ** 2 for jj in range(0, len(x1))]))

# NB: some clustering/visualisation code based on http://brandonrose.org/clustering
if __name__ == "__main__":
    verbose = sys.argv[-1] == '-v'
    peoplez = glob.glob(ienputdir + '/people/*.json')
    print('{}: {} venues, {} papers by {} people\n{}'.format(
        C.purple('BibSLEIGH'),
        C.red(len(sleigh.venues)),
        C.red(sleigh.numOfPapers()),
        C.red(len(peoplez)),
        C.purple('=' * 42)))
    cx = {0: 0, 1: 0, 2: 0}
    # we need to know all the words we have
    UberDict = set()
    UberCols = set()
    vocs = {b.getKey(): b.json['vocabulary']
            for v in sleigh.venues
            for b in v.getBrands()
            if 'vocabulary' in b.json
            if len(b.json['vocabulary']) > 10}
    for vkey in vocs:
        UberDict.update(vocs[vkey].keys())
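# Sanity check with illustrative values: distance((0, 0), (3, 4)) is
# sqrt(3**2 + 4**2) = 5.0, and sdistance((0, 0), (3, 4)) renders it as
# '5,0', with a decimal comma, presumably for spreadsheet-friendly output.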
def checkon(fn, o):
    if not os.path.exists(fn) or os.path.isdir(fn):
        fn = fn + '.json'
    plines = sorted([strictstrip(s) for s in o.getJSON().split('\n')[1:-1]])
    for k in o.json.keys():
        if 'type' not in o.json.keys():
            print('TERRIBLE', o.getKey())
        if (o.json['type'] == 'proceedings' and k == 'title') or \
                (o.json['type'] == 'inproceedings' and k == 'booktitle'):
            # fix numbers
            for nr in nrs.keys():
                if o.json[k].find(' ' + nr + ' ') > -1:
                    o.json[k] = o.json[k].replace(' ' + nr + ' ', ' ' + nrs[nr] + ' ')
        if isinstance(o.json[k], str):
            # add emdashes for fancier titles
            if k in ('title', 'booktitle'):
                o.json[k] = o.json[k].replace(' - ', ' — ').replace(' -- ', ' — ')
            # Nice heuristic to run from time to time, but reports too much
            # on stuff like “eXtreme” and “jPET”
            # if o.json[k][0].islower():
            #     print('[ {} ] {}: {} {}'.format(C.red('LOOK'), o.getKey(), 'title is', o.get('title')))
            # normalised pages
            if k == 'pages':
                o.json[k] = o.json[k].replace('–', '-').replace('--', '-').replace('−', '-')
            # double spaces
            if o.json[k].find('  ') > -1:
                o.json[k] = o.json[k].replace('  ', ' ').strip()
            # find numeric values, turn them into proper integers
            if o.json[k].isdigit():
                o.json[k] = int(o.json[k])
                continue
            # remove confix curlies
            elif o.json[k].startswith('{') and o.json[k].endswith('}'):
                o.json[k] = o.json[k][1:-1]
            # single quotes to double quotes
            elif o.json[k].find(" '") > -1 and o.json[k].find("' ") > -1:
                o.json[k] = o.json[k].replace(" '", ' "').replace("' ", '" ')
            elif o.json[k].find(" '") > -1 and o.json[k].endswith("'"):
                o.json[k] = o.json[k].replace(" '", ' "').replace("'", '"')
            elif o.json[k].find("' ") > -1 and o.json[k].startswith("'"):
                o.json[k] = o.json[k].replace("' ", '" ').replace("'", '"')
            # fancify bland quotes
            elif o.json[k].find(' "') > -1 and o.json[k].find('" ') > -1:
                o.json[k] = o.json[k].replace(' "', ' “').replace('" ', '” ')
            elif o.json[k].find(' "') > -1 and o.json[k].endswith('"'):
                o.json[k] = o.json[k].replace(' "', ' “').replace('"', '”')
            elif o.json[k].find('" ') > -1 and o.json[k].startswith('"'):
                o.json[k] = o.json[k].replace('" ', '” ').replace('"', '“')
            # fancify LaTeX quotes
            elif o.json[k].find(' ``') > -1 and o.json[k].find("'' ") > -1:
                o.json[k] = o.json[k].replace("'' ", '” ').replace(' ``', ' “')
            elif o.json[k].find(' ``') > -1 and o.json[k].endswith("''"):
                o.json[k] = o.json[k].replace("''", '”').replace(' ``', ' “')
            elif o.json[k].find("'' ") > -1 and o.json[k].startswith('``'):
                o.json[k] = o.json[k].replace("'' ", '” ').replace('``', '“')
            elif o.json[k].startswith('``') and o.json[k].endswith("''"):
                o.json[k] = '“' + o.json[k][2:-2] + '”'
            # plural possessive
            elif o.json[k].find("'s") > -1:
                o.json[k] = o.json[k].replace("'s", '’s')
            elif o.json[k].find("s' ") > -1:
                o.json[k] = o.json[k].replace("s'", 's’')
            # contractions
            elif o.json[k].find("n't") > -1:
                o.json[k] = o.json[k].replace("n't", 'n’t')
            # the case of "Jr" vs "Jr."
            if k in ('author', 'editor') and o.json[k].endswith('Jr'):
                o.json[k] += '.'
            # TODO: report remaining suspicious activity
            for c in '`"\'':
                # ’ is ok
                if c in o.json[k] and k not in ('author', 'editor'):
                    print('[ {} ] {}: {} is “{}”'.format(C.red('LOOK'), o.getKey(), k, o.json[k]))
                    lookat.append(o.filename)
        elif isinstance(o.json[k], list):
            # inline trivial lists
            if len(o.json[k]) == 1:
                o.json[k] = o.json[k][0]
            # inline hidden trivial lists
            if len(o.json[k]) == 2 and o.json[k][0] == o.json[k][1] \
                    and k not in ('stemmed', 'tag', 'tagged'):
                o.json[k] = o.json[k][0]
            # unless it’s 'tagged'
            if k == 'tagged' and not isinstance(o.json[k][0], list):
                o.json[k] = [o.json[k]]
            # remove DBLP disambiguation: we might later regret it
            # but the information can be always re-retrieved
            if k in ('author', 'editor'):
                nas = []
                for a in o.json[k]:
                    # double spaces
                    if a.find('  ') > -1:
                        a = a.replace('  ', ' ').strip()
                    ws = a.split(' ')
                    if ws[-1].isdigit():
                        ws = ws[:-1]
                    nas.append(' '.join(ws))
                o.json[k] = nas
                # the case of "Jr" vs "Jr."
                o.json[k] = [a + '.' if a.endswith(' Jr') else a for a in o.json[k]]
    nlines = sorted([strictstrip(s) for s in o.getJSON().split('\n')[1:-1]])
    if plines != nlines:
        f = open(fn, 'w')
        f.write(o.getJSON())
        f.close()
        return 2
    else:
        return 0