def report(fn, r):
    """Print a status line for *fn* according to result code *r*; return *r*."""
    labels = (C.blue('PASS'), C.red('FAIL'), C.yellow('UNEX'))
    notes = ('', '- no crossref found!', '- illegal crossref')
    # only clean passes are silenced in non-verbose mode
    if r != 0 or verbose:
        print('[ {} ] {} {}'.format(labels[r], fn, notes[r]))
    return r
def report(fn, r):
    """Report result code *r* for file *fn* and pass the code through."""
    status, note = (
        (C.blue('PASS'), ''),
        (C.red('FAIL'), '- no crossref found!'),
        (C.yellow('UNEX'), '- illegal crossref'),
    )[r]
    # non-verbose mode by default: keep quiet about clean passes
    if verbose or r != 0:
        print('[ {} ] {} {}'.format(status, fn, note))
    return r
def checkreport(m, o):
    """Run checkon() on (m, o), report the outcome, and return its code."""
    labels = (C.blue('PASS'), C.red('FAIL'), C.yellow('FIXD'))
    outcome = checkon(m, o)
    # suppress PASS lines unless verbose mode is on
    if outcome != 0 or verbose:
        print('[ {} ] {}'.format(labels[outcome], o.filename))
    return outcome
Exemple #4
0
def checkon(fn, o):
    """Check the JSON file *fn* against the parsed entry *o*.

    Creates a minimal stub file when *fn* is missing. Returns 0 when the
    file agrees with o's JSON serialisation, 1 on a mismatch (a bogus
    "year" beyond 3000 deletes the file), and 2 when a stub was created.
    """
    if not os.path.exists(fn) or os.path.isdir(fn):
        fn = fn + '.json'
    if not os.path.exists(fn):
        # if it still does not exist, let us create a minimal one
        # (with-statement guarantees the handle is closed even if write fails)
        with open(fn, 'w', encoding='utf-8') as f:
            f.write('{{\n\t"title": "{name}",\n\t"type": "proceedings",\n\t"year": {year}\n}}'.format(
                name=lastSlash(fn)[:-5].replace('-', ' '),
                year=findYear(lastSlash(fn))))
        print('[ {} ] {}'.format(C.yellow('MADE'), fn))
        return 2
    with open(fn, 'r', encoding='utf-8') as f:
        lines = f.readlines()[1:-1]
    # a "year" over 3000 marks a broken autogenerated entry: remove the file
    for line in lines:
        if line.find('"year"') > -1 and findYear(line) > 3000:
            os.remove(fn)
            print('[ {} ] {}'.format(C.red('KILL'), fn))
            return 1
    flines = sorted(strictstrip(s) for s in lines)
    plines = sorted(strictstrip(s) for s in o.getJSON().split('\n')[1:-1])
    if flines != plines:
        # show both sides of the symmetric difference to aid manual fixing
        print('∆:', [line for line in flines if line not in plines],
              '\nvs', [line for line in plines if line not in flines])
        return 1
    return 0
Exemple #5
0
def checkreport(fn, o):
    """Validate entry *o* at *fn* via checkon() and report(); return the code."""
    labels = (C.blue('PASS'), C.red('FAIL'), C.yellow('FIXD'))
    outcome = checkon(fn, o)
    # only failures and fixes are reported unless running verbosely
    if outcome != 0 or verbose:
        report(labels[outcome], fn)
    return outcome
Exemple #6
0
def checkreport(fn, o):
    """Check entry *o* (stored at *fn*), print a status line, return the code."""
    labels = (C.blue('PASS'), C.red('FAIL'), C.yellow('FIXD'))
    code = checkon(fn, o)
    # non-verbose mode hides clean passes
    if code != 0 or verbose:
        print('[ {} ] {}'.format(labels[code], fn))
    return code
Exemple #7
0
def checkon(fn, o):
    """Check the JSON file *fn* against entry *o*, creating a stub if absent.

    Returns 0 when file and object agree, 1 on a mismatch (a bogus "year"
    beyond 3000 deletes the file), and 2 when a minimal stub was created.
    """
    if not os.path.exists(fn) or os.path.isdir(fn):
        fn = fn + '.json'
    if not os.path.exists(fn):
        # if it still does not exist, let us create a minimal one;
        # encoding pinned to utf-8 for consistency with the other file writers
        with open(fn, 'w', encoding='utf-8') as f:
            f.write('{{\n\t"title": "{name}",\n\t"type": "proceedings",\n\t"year": {year}\n}}'.format(
                name=fn.split('/')[-1][:-5].replace('-', ' '),
                year=findYear(fn.split('/')[-1])))
        print('[ {} ] {}'.format(C.yellow('MADE'), fn))
        return 2
    with open(fn, 'r', encoding='utf-8') as f:
        lines = f.readlines()[1:-1]
    # a "year" over 3000 marks a broken autogenerated entry: remove the file
    for line in lines:
        if line.find('"year"') > -1 and findYear(line) > 3000:
            os.remove(fn)
            print('[ {} ] {}'.format(C.red('KILL'), fn))
            return 1
    flines = sorted(strictstrip(s) for s in lines)
    plines = sorted(strictstrip(s) for s in o.getJSON().split('\n')[1:-1])
    if flines != plines:
        # show both sides of the difference to aid manual fixing
        print('∆:', [line for line in flines if line not in plines],
              '\nvs', [line for line in plines if line not in flines])
        return 1
    return 0
Exemple #8
0
def checkreport(fn, o):
    """Run checkon(); print '[ STATUS ] fn: msg' when needed; return the code."""
    labels = (C.blue('PASS'), C.red('FAIL'), C.yellow('WARN'))
    code, msg = checkon(fn, o)
    # clean passes stay silent unless verbose mode is on
    if code != 0 or verbose:
        print('[ {} ] {}: {}'.format(labels[code], fn, msg))
    return code
Exemple #9
0
def checkreport(fn, o):
    """Report on *o*; an int *o* is treated as a precomputed result code."""
    labels = (C.blue('PASS'), C.red('FAIL'), C.yellow('FIXD'))
    code = o if isinstance(o, int) else checkon(fn, o)
    # non-verbose mode by default
    if verbose or code != 0:
        print('[ {} ] {}'.format(labels[code], fn))
    return code
Exemple #10
0
def checkreport(fn, o, br):
    """Check a brand (when *br* is given) or an entry *o*; print and return the code."""
    labels = (C.blue('PASS'), C.red('FAIL'), C.yellow('FIXD'))
    code = checkbrand(fn, br) if br else checkon(fn, o)
    # non-verbose mode prints failures and fixes only
    if verbose or code != 0:
        print('[ {} ] {}'.format(labels[code], fn))
    return code
Exemple #11
0
def checkon(m, o):
    """Merge the common model *m* into entry *o*, rewriting its file on disk.

    Returns 0 when o already agrees with m (or there is nothing to write),
    1 on failure (no model, or the on-disk file is out of sync with o),
    and 2 when conflicting keys were settled and the file was rewritten.
    """
    # if no common model found, we failed
    if not m:
        return 1
    # normalise entry types: papers fold into the volumes that contain them
    if m.get('type') in ('inproceedings', 'article'):
        m['type'] = 'proceedings'
    if m.get('type') == 'incollection':
        m['type'] = 'book'
    m.pop('crossref', None)
    if 'booktitle' in m:
        m['title'] = m.pop('booktitle')
    # TODO: ??? (booktitleshort is dropped without a replacement for now)
    m.pop('booktitleshort', None)
    r = 0
    n = {}
    for k in m.keys():
        if o.get(k) == m[k]:
            if verbose:
                print(C.blue('Confirmed:  '), k, 'as', m[k])
            continue
        if verbose:
            print(C.red('Conflicted: '), k, 'as', m[k], 'vs', o.get(k))
        # prefer a heuristic choice when o carries its own value for the key
        v = heurichoose(k, m[k], o.json[k]) if k in o.json else m[k]
        if verbose:
            print(C.yellow('Settled for:'), v)
        n[k] = v
        r = 2
    if r == 0:
        return r
    if r == 2 and not n:
        # nothing to fix?!
        return 0
    if not os.path.exists(o.filename):
        return 0
    fn = o.filename + '.json' if os.path.isdir(o.filename) else o.filename
    if os.path.exists(fn):
        with open(fn, 'r', encoding='utf-8') as f:
            lines = f.read()
        if lines != o.getJSON():
            # strange, should be equal (run all normalisers first!)
            return 1
    o.json.update(n)
    with open(fn, 'w', encoding='utf-8') as f:
        f.write(o.getJSON())
    return 2
def checkon(m, o):
	"""Merge the common model *m* into entry *o*, rewriting its file on disk.

	Returns 0 when o already agrees with m (or there is nothing to write),
	1 on failure (no model, or the on-disk file is out of sync with o),
	and 2 when conflicting keys were settled and the file was rewritten.
	"""
	# if no common model found, we failed
	if not m:
		return 1
	# normalise entry types: papers fold into the volumes that contain them
	if 'type' in m.keys() and m['type'] in ('inproceedings', 'article'):
		m['type'] = 'proceedings'
	if 'type' in m.keys() and m['type'] == 'incollection':
		m['type'] = 'book'
	if 'crossref' in m.keys():
		del m['crossref']
	if 'booktitle' in m.keys():
		m['title'] = m['booktitle']
		del m['booktitle']
	if 'booktitleshort' in m.keys():
		# TODO: ???
		del m['booktitleshort']
	# r: result code so far; n: keys whose settled values must be written back
	r = 0
	n = {}
	for k in m.keys():
		if o.get(k) == m[k]:
			if verbose:
				print(C.blue('Confirmed:  '), k, 'as', m[k])
		else:
			if verbose:
				print(C.red('Conflicted: '), k, 'as', m[k], 'vs', o.get(k))
			# prefer a heuristic choice when o carries its own value for the key
			v = heurichoose(k, m[k], o.json[k]) if k in o.json.keys() else m[k]
			if verbose:
				print(C.yellow('Settled for:'), v)
			n[k] = v
			r = 2
	if r == 0:
		return r
	if r == 2 and not n:
		# nothing to fix?!
		return 0
	if not os.path.exists(o.filename):
		return 0
	if os.path.isdir(o.filename):
		fn = o.filename + '.json'
	else:
		fn = o.filename
	# refuse to touch a file that is out of sync with the parsed object
	# NOTE(review): files opened without explicit encoding here, unlike the
	# utf-8 sibling version of this function — confirm this is intentional
	if os.path.exists(fn):
		f = open(fn, 'r')
		lines = f.read()
		f.close()
		if lines != o.getJSON():
			# strange, should be equal (run all normalisers first!)
			return 1
	for k in n.keys():
		o.json[k] = n[k]
	f = open(fn, 'w')
	f.write(o.getJSON())
	f.close()
	return 2
Exemple #13
0
		title = tagdef['namefull'] if 'namefull' in tagdef.keys() else tagdef['name']
		subt = ('<br/><em>'+tagdef['namelong']+'</em>') if 'namelong' in tagdef.keys() else ''
		links = '<strong>{}</strong>{}<hr/>'.format(title, subt) + '\n'.join(sorted(links))
		dl = '<dl class="toc">' + '\n'.join(lst) + '</dl>'
		# hack to get from tags to papers
		dl = dl.replace('href="', 'href="../')
		f.write(tagHTML.format(\
			title=key+' tag',
			etag=escape(key),
			tag=key,
			above='',
			boxlinks=links,
			listname='{} papers'.format(len(lst)),
			dl=dl))
		f.close()
	print('Tag pages:', C.yellow('{}'.format(len(ts))), C.blue('generated'))
	# tag index
	f = open(outputdir+'/tag/index.html', 'w')
	keyz = [q for q in ts.keys() if len(ts[q]) > 2]
	keyz.sort(key=lambda t: len(ts[t]), reverse=True)
	lst = ['<li>#<a href="{}.html">{}</a> ({})</li>'.format(escape(t), t, len(ts[t])) for t in keyz]
	ul = '<ul class="tri mul">' + '\n'.join(lst) + '</ul>'
	CX = sum([len(ts[t]) for t in ts.keys()])
	f.write(taglistHTML.format(\
		title='All known tags',
		listname='{} tags known from {} markings'.format(len(ts), CX),
		ul=ul))
	f.close()
	print('Tag index:', C.blue('created'))
	# untagged papers
	f = open(outputdir+'/tag/untagged.html', 'w')
Exemple #14
0
def checkon(fn, o):
	"""Normalise link metadata of entry *o* stored in file *fn*.

	Rewrites 'url' (dropping useless DBLP index links), mines 'ee' links
	into doi/acmid/ieee* identifiers, and canonicalises event URLs.
	Returns 0 if nothing changed, 1 if the file on disk disagrees with the
	parsed object, and 2 if the file was rewritten with normalised links.
	"""
	if not os.path.exists(fn) or os.path.isdir(fn):
		fn = fn + '.json'
	f = open(fn, 'r')
	lines = f.readlines()[1:-1]
	f.close()
	# flines: file as-is; plines: the parsed object before normalisation
	flines = json2lines(lines)
	plines = sorted(json2lines(o.getJSON().split('\n')))
	# "url" from DBLP are useless
	if 'url' in o.json.keys():
		o.json['url'] = [link.replace('https://', 'http://')\
						for link in listify(o.json['url'])\
		 				if not link.startswith('db/conf/')\
		 				and not link.startswith('db/series/')\
		 				and not link.startswith('db/books/')\
						and not link.startswith('db/journals/')]
		if not o.json['url']:
			del o.json['url']
		elif len(o.json['url']) == 1:
			o.json['url'] = o.json['url'][0]
	# mine identifiers out of 'ee' links when no DOI is known yet
	if 'ee' in o.json.keys() and 'doi' not in o.json.keys():
		if isinstance(o.json['ee'], list):
			if verbose:
				print(C.red('Manylink:'), o.json['ee'])
		# each recognised link becomes an identifier; the rest are preserved
		newee = []
		for onelink in listify(o.json['ee']):
			if onelink.startswith('http://dx.doi.org/'):
				o.json['doi'] = onelink[18:]
			elif onelink.startswith('http://doi.acm.org/'):
				o.json['doi'] = onelink[19:]
			elif onelink.startswith('http://doi.ieeecomputersociety.org/'):
				o.json['doi'] = onelink[35:]
			elif onelink.startswith('http://dl.acm.org/citation.cfm?id='):
				o.json['acmid'] = onelink[34:]
			elif onelink.startswith('http://portal.acm.org/citation.cfm?id='):
				o.json['acmid'] = onelink[38:]
			elif onelink.startswith('http://ieeexplore.ieee.org/xpl/freeabs_all.jsp?arnumber=')\
			  or onelink.startswith('http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber='):
				o.json['ieeearid'] = onelink.split('=')[-1]
			elif onelink.startswith('http://ieeexplore.ieee.org/xpls/abs_all.jsp?isnumber=')\
			 and onelink.find('arnumber') > -1:
				o.json['ieeearid'] = onelink.split('arnumber=')[-1].split('&')[0]
			elif onelink.startswith('http://ieeexplore.ieee.org/xpl/mostRecentIssue.jsp?punumber='):
				o.json['ieeepuid'] = onelink.split('=')[-1]
			elif onelink.startswith('http://ieeexplore.ieee.org/xpl/tocresult.jsp?isnumber='):
				o.json['ieeeisid'] = onelink.split('=')[-1]
			elif onelink.startswith('http://eceasst.cs.tu-berlin.de/index.php/eceasst/article/view/'):
				newee.append('http://journal.ub.tu-berlin.de/eceasst/article/view/' + onelink.split('/')[-1])
			elif onelink.endswith('.pdf') and \
			    (onelink.startswith('http://computer.org/proceedings/')\
			  or onelink.startswith('http://csdl.computer.org/')):
				# Bad: http://computer.org/proceedings/icsm/1189/11890007.pdf
				# Bad: http://csdl.computer.org/comp/proceedings/date/2003/1870/02/187020040.pdf
				# Good: http://www.computer.org/csdl/proceedings/icsm/2001/1189/00/11890004.pdf
				if onelink.startswith('http://csdl'):
					cname, _, cid, mid, pid = onelink.split('/')[5:10]
				else:
					cname, cid, pid = onelink.split('/')[4:7]
					# heuristic
					if pid.startswith(cid):
						mid = pid[len(cid):len(cid)+2]
					else:
						mid = '00'
				newee.append('http://www.computer.org/csdl/proceedings/{}/{}/{}/{}/{}'.format(\
					cname,
					o.get('year'),
					cid,
					mid,
					pid))
			else:
				if onelink.find('ieee') > -1:
					print(C.purple('IEEE'), onelink)
				if verbose:
					print(C.yellow('Missed opportunity:'), onelink)
				# nothing matches => preserve
				newee.append(onelink)
		if len(newee) == 0:
			del o.json['ee']
		elif len(newee) == 1:
			o.json['ee'] = newee[0]
		else:
			o.json['ee'] = newee
		# post-processing normalisation
		if 'acmid' in o.json.keys() and not isinstance(o.json['acmid'], int) and o.json['acmid'].isdigit():
			o.json['acmid'] = int(o.json['acmid'])
	# legacy key spelling: eventuri -> eventurl
	if 'eventuri' in o.json.keys():
		o.json['eventurl'] = o.json['eventuri']
		del o.json['eventuri']
	if 'eventurl' in o.json.keys() and o.json['eventurl'].startswith('https://'):
		o.json['eventurl'] = o.json['eventurl'].replace('https://', 'http://')
	# decide the return code by comparing old / parsed / normalised line sets
	nlines = sorted(json2lines(o.getJSON().split('\n')))
	if flines != plines:
		return 1
	elif plines != nlines:
		f = open(fn, 'w')
		f.write(o.getJSON())
		f.close()
		return 2
	else:
		return 0
Exemple #15
0
def report(s, r):
	"""Print status *r* with message *s*; return *r* unchanged."""
	labels = (C.blue('PASS'), C.red('FAIL'), C.yellow('FIXD'))
	# clean passes are only shown in verbose mode
	if r != 0 or verbose:
		print('[ {} ] {}'.format(labels[r], s))
	return r

if __name__ == "__main__":
	verbose = sys.argv[-1] == '-v'
	# All known contributors
	cx = {0: 0, 1: 0, 2: 0}
	people = {}
	for fn in glob.glob(ienputdir + '/people/*.json'):
		p = parseJSON(fn)
		if p['name'] in people.keys():
			cx[report(C.red('duplicate')+' '+C.yellow(p), 1)] += 1
			continue
		people[p['name']] = p
	print('{}: {} venues, {} papers written by {} people\n{}'.format(\
		C.purple('BibSLEIGH'),
		C.red(len(sleigh.venues)),
		C.red(sleigh.numOfPapers()),
		C.red(len(people)),
		C.purple('='*42)))
	# traverse ALL the papers!
	for v in sleigh.venues:
		for c in v.getConfs():
			for p in c.papers:
				if 'author' in p.json.keys():
					for a in listify(p.json['author']):
						if a in people.keys():
Exemple #16
0
def report(one, two):
	"""Print a bracketed status line of the form '[ one ] two'."""
	print('[', one, ']', two)

def checkreport(fn, o):
	"""Check entry *o* at *fn* via checkon() and report(); return the code."""
	labels = (C.blue('PASS'), C.red('FAIL'), C.yellow('FIXD'))
	code = checkon(fn, o)
	# only non-PASS outcomes are reported unless verbose mode is on
	if code != 0 or verbose:
		report(labels[code], fn)
	return code

if __name__ == "__main__":
	# BUG FIX: verbose was only assigned when an argument was present, so a
	# plain run raised NameError inside checkreport; default it to False.
	verbose = len(sys.argv) > 1 and sys.argv[1] == '-v'
	print('{}: {} venues, {} papers\n{}'.format(\
		C.purple('BibSLEIGH'),
		C.red(len(sleigh.venues)),
		C.red(sleigh.numOfPapers()),
		C.purple('='*42)))
	# result-code counters: 0 = ok, 1 = failed, 2 = fixed
	cx = {0: 0, 1: 0, 2: 0}
	# check every conference and every paper of every venue
	for v in sleigh.venues:
		for c in v.getConfs():
			cx[checkreport(c.filename, c)] += 1
			for p in c.papers:
				cx[checkreport(p.filename, p)] += 1
	print('{} files checked, {} ok, {} fixed, {} failed'.format(\
		C.bold(cx[0] + cx[1] + cx[2]),
		C.blue(cx[0]),
		C.yellow(cx[2]),
		C.red(cx[1])))
Exemple #17
0
			paperAuths = paperAuths[:-1]
			paperAuths.extend(auths)
		paperLnk = li.get('id')
		hope = li.find_all('a')
		if hope and hope[0].get('href').endswith('.pdf'):
			paperPdf = urlstart + hope[0].get('href')
		else:
			paperPdf = ''
		paperEntry = {'type': 'inproceedings', 'series': 'CEUR Workshop Proceedings',\
			'publisher': 'CEUR-WS.org', 'year': volYear, 'booktitle': volTitles[-1],\
			'editor': volEds, 'volume': volNr.split('-')[-1], 'title': paperTitle,\
			'author': paperAuths, 'pages': paperPages, 'venue': volVenue}
		if paperPdf:
			paperEntry['openpdf'] = paperPdf
		if paperLnk:
			paperEntry['url'] = urlstart + '#' + paperLnk
		paperFilename = outputdir.split('/')[-1] + '-' + paperAuths[0].split(' ')[-1]
		for a in paperAuths[1:]:
			paperFilename += a.split(' ')[-1][0]
		if paperFilename in done:
			paperFilename += 'a'
			while paperFilename in done:
				paperFilename = paperFilename[:-1] + chr(ord(paperFilename[-1])+1)
		# print(jsonify(paperEntry), '-->', outputdir+'/'+paperFilename+'.json')
		f = open(outputdir+'/'+paperFilename+'.json', 'w')
		f.write(jsonify(paperEntry))
		f.close()
		cx += 1
		done.append(paperFilename)
	print(C.red(volVenue), '-', C.yellow(volTitles[-1]), '-', C.blue(cx), 'papers.')
Exemple #18
0
        else:
            paperPdf = ''
        paperEntry = {'type': 'inproceedings', 'series': 'CEUR Workshop Proceedings',\
         'publisher': 'CEUR-WS.org', 'year': volYear, 'booktitle': volTitles[-1],\
         'editor': volEds, 'volume': volNr.split('-')[-1], 'title': paperTitle,\
         'author': paperAuths, 'pages': paperPages, 'venue': volVenue}
        if paperPdf:
            paperEntry['openpdf'] = paperPdf
        if paperLnk:
            paperEntry['url'] = urlstart + '#' + paperLnk
        paperFilename = lastSlash(outputdir) + '-' + paperAuths[0].split(
            ' ')[-1]
        for a in paperAuths[1:]:
            print(a)
            paperFilename += a.split(' ')[-1][0]
        if paperFilename in done:
            paperFilename += 'a'
            while paperFilename in done:
                paperFilename = paperFilename[:-1] + chr(
                    ord(paperFilename[-1]) + 1)
        # print(jsonify(paperEntry), '-->', outputdir+'/'+paperFilename+'.json')
        f = open(outputdir + '/' + paperFilename + '.json',
                 'w',
                 encoding='utf-8')
        f.write(jsonify(paperEntry))
        f.close()
        cx += 1
        done.append(paperFilename)
    print(C.red(volVenue), '-', C.yellow(volTitles[-1]), '-', C.blue(cx),
          'papers.')
Exemple #19
0
		# 	allstems += x.getBareStems()
		# siblings = {stem:allstems.count(stem) for stem in allstems if stem != k and ifApproved(stem)}
		# NB: the following code is faster:
		siblings = Counter()
		for x in stems[k]:
			siblings.update([s for s in x.getBareStems() if s != k and ifApproved(s)])
		box = '<code>Used together with:</code><hr/>' + \
			'\n<br/>'.join(['<span class="tag"><a href="{0}.html">{0}</a></span> ({1})'.format(\
				*sn) for sn in siblings.most_common(5)])
		f.write(wordHTML.format(\
			stem=k,
			inthebox=box,
			listname='{} papers'.format(len(lst)),
			dl='<dl class="toc">' + '\n'.join(lst).replace('href="', 'href="../') + '</dl>'))
		f.close()
	print('Word pages:', C.yellow('{}'.format(len(stems))), C.blue('generated'))
	# stem index
	f = open(outputdir+'/words.html', 'w', encoding='utf-8')
	keyz = [k for k in stems.keys() if len(stems[k]) > 100 and ifApproved(k)]
	keyz.sort(key=lambda t: -len(t), reverse=True)
	lst = ['<li><a href="word/{}.html">{}</a>$ ({})</li>'.format(\
		escape(t), t, len(stems[t])) for t in keyz]
	ul = '<ul class="tri">' + '\n'.join(lst) + '</ul>'
	CX = sum([len(stems[t]) for t in stems.keys()])
	f.write(wordlistHTML.format(\
		title='All known stems',
		listname='{} stems known and {} shown from {} notable words'.format(len(stems), len(keyz), CX),
		ul=ul))
	f.close()
	print('Stem index:', C.blue('created'))
	print('{}\nDone with {} venues, {} papers, {} tags.'.format(\
Exemple #20
0
    statuses = (C.blue('PASS'), C.red('FAIL'), C.yellow('FIXD'))
    # non-verbose mode by default
    if verbose or r != 0:
        print('[ {} ] {}'.format(statuses[r], simpleLatin(s)))
    return r


if __name__ == "__main__":
    verbose = sys.argv[-1] == '-v'
    # All known contributors
    cx = {0: 0, 1: 0, 2: 0}
    people = {}
    for fn in glob.glob(ienputdir + '/people/*.json'):
        p = parseJSON(fn)
        if p['name'] in people.keys():
            cx[report(C.red('duplicate') + ' ' + C.yellow(p), 1)] += 1
            continue
        people[p['name']] = p
    print('{}: {} venues, {} papers written by {} people\n{}'.format(\
     C.purple('BibSLEIGH'),
     C.red(len(sleigh.venues)),
     C.red(sleigh.numOfPapers()),
     C.red(len(people)),
     C.purple('='*42)))
    # traverse ALL the papers!
    for v in sleigh.venues:
        for c in v.getConfs():
            for p in c.papers:
                if 'author' in p.json.keys():
                    for a in listify(p.json['author']):
                        if a in people.keys():
Exemple #21
0
		C.purple('='*42)))
	bundles = {}
	for b in glob.glob(ienputdir + '/bundles/*.json'):
		purename = b.split('/')[-1][:-5]
		bun = json.load(open(b, 'r'))
		prevcx = pcx
		uberlist = '<h2>{1} papers</h2>{0}'.format(processSortedRel(bun['contents']), pcx-prevcx)
		f = open(outputdir + '/bundle/' + purename + '.html', 'w')
		f.write(bunHTML.format(\
			title=purename+' bundle',
			bundle=bun['name'],
			ebundle=escape(purename),
			dl=uberlist.replace('href="', 'href="../').replace('../mailto', 'mailto')))
		f.close()
		bundles[purename] = pcx-prevcx
	print('Bundle pages:', C.yellow('{}'.format(len(bundles))), C.blue('generated'))
	# now for the index
	f = open(outputdir+'/bundle/index.html', 'w')
	lst = ['<li><a href="{}.html">{}</a> ({})</li>'.format(\
		escape(b),
		b,
		bundles[b]) for b in sorted(bundles.keys())]
	ul = '<ul class="tri">' + '\n'.join(lst) + '</ul>'
	f.write(bunListHTML.format(\
		title='All specified bundles',
		listname='{} bundles known with {} papers'.format(len(bundles), sum(bundles.values())),
		ul='<ul class="tri">' + '\n'.join(lst) + '</ul>'))
	f.close()
	print('Bundle index:', C.blue('created'))
	print('{}\nDone with {} venues, {} papers.'.format(\
		C.purple('='*42),
Exemple #22
0
				 + ' \n'.join(['<span class="tag"><a href="../word/{0}.html">{0}</a></span> ({1})'.format(S, stems[S]) \
				 	for S in stemkeys[:10]])
			boxlinks += adds
			# combine boxlinks
			if boxlinks:
				boxlinks = '<div class="tbox">' + boxlinks + '</div>'
		f.write(personHTML.format(\
			title=k,
			gender=gender,
			boxlinks=boxlinks,
			eperson=escape(k),
			person=persondef['name'],
			# boxlinks=links
			namedlists=dls))
		f.close()
	print('Person pages:', C.yellow('{}'.format(len(ps))), C.blue('generated'))
	# person index
	# keyz = [k for k in ps.keys() if len(ts[k]) > 2]
	# keyz = sorted(keyz, key=lambda t:len(ts[t]), reverse=True)
	keyz = ps#sorted(ps.keys())
	letters = [chr(x) for x in range(ord('a'), ord('z')+1)]
	indices = {x:[] for x in letters}
	for t in keyz:
		ws = t.split('_')
		i = -1
		if ws[i] == 'Jr':
			i -= 1
		letter = ws[i][0].lower()
		if not letter.isalpha():
			print(C.red('ERROR')+':', 'wrong name', t)
			letter = ws[i-1][0].lower()
Exemple #23
0
		# 	allstems += x.getBareStems()
		# siblings = {stem:allstems.count(stem) for stem in allstems if stem != k and ifApproved(stem)}
		# NB: the following code is faster:
		siblings = Counter()
		for x in stems[k]:
			siblings.update([s for s in x.getBareStems() if s != k and ifApproved(s)])
		box = '<code>Used together with:</code><hr/>' + \
			'\n<br/>'.join(['<span class="tag"><a href="{0}.html">{0}</a></span> ({1})'.format(\
				*sn) for sn in siblings.most_common(5)])
		f.write(wordHTML.format(\
			stem=k,
			inthebox=box,
			listname='{} papers'.format(len(lst)),
			dl='<dl class="toc">' + '\n'.join(lst).replace('href="', 'href="../') + '</dl>'))
		f.close()
	print('Word pages:', C.yellow('{}'.format(len(stems))), C.blue('generated'))
	# stem index
	f = open(outputdir+'/words.html', 'w')
	keyz = [k for k in stems.keys() if len(stems[k]) > 100 and ifApproved(k)]
	keyz.sort(key=lambda t: -len(t), reverse=True)
	lst = ['<li><a href="word/{}.html">{}</a>$ ({})</li>'.format(\
		escape(t), t, len(stems[t])) for t in keyz]
	ul = '<ul class="tri">' + '\n'.join(lst) + '</ul>'
	CX = sum([len(stems[t]) for t in stems.keys()])
	f.write(wordlistHTML.format(\
		title='All known stems',
		listname='{} stems known and {} shown from {} notable words'.format(len(stems), len(keyz), CX),
		ul=ul))
	f.close()
	print('Stem index:', C.blue('created'))
	print('{}\nDone with {} venues, {} papers, {} tags.'.format(\
Exemple #24
0
  C.red(sleigh.numOfPapers()),
  C.purple('='*42)))
 # read the CSV
 f = open('scrap-committees/scraped-by-grammarware.csv',
          'r',
          encoding='utf-8')
 # CBSE;2001;Heinz;Schmidt;;Organising Committee
 for line in f.readlines():
     vs = line.strip().split(';')
     if len(vs) == 0:
         continue
     v = vs[0] + '-' + vs[1]
     n = vs[2] + ' ' + vs[3]
     # normalise!
     if n in renameto.keys():
         print('[', C.yellow('ALIA'), ']', 'Treating', n, 'as', renameto[n])
         n = renameto[n]
     # sex is ignored, mostly absent anyway
     r = vs[5]
     if v not in roles.keys():
         roles[v] = []
     # NB: the next line uses lists for the sake of JSON compatibility
     roles[v].append([n, r])
 f.close()
 print('Metadata on {} editions loaded with {} role assignments'.format(
     C.red(len(roles)), C.red(sum([len(roles[k]) for k in roles.keys()]))))
 # now add
 cx = {0: 0, 1: 0, 2: 0}
 for v in sleigh.venues:
     for c in v.getConfs():
         cx[checkreport(c.filename, c)] += 1
Exemple #25
0
def report(fn1, fn2, r):
    """Print '[ STATUS ] fn1 → fn2' for result code *r*; return *r*."""
    labels = (C.blue(' PASS '), C.red(' FAIL '), C.yellow('RENAME'))
    # in non-verbose mode, only failures and renames are shown
    if r != 0 or verbose:
        print('[ {} ] {} → {}'.format(labels[r], fn1, fn2))
    return r
Exemple #26
0
            report(
                '{}: “{}” == “{}”?'.format(surname, variants[0], variants[1]),
                2)
        # print
        pvariants = ['“{}”'.format(v) for v in variants]
        report('{}: {}'.format(surname, ' vs '.join(pvariants)), 0)
    # write back if changed
    for k in people.keys():
        p = people[k]
        if p['FILE']:
            if os.path.exists(p['FILE']):
                cur = parseJSON(p['FILE'])
                if cur == p:
                    cx[0] += 1
                    if verbose:
                        print('[', C.green('FIXD'), ']', p['name'])
                    continue
            print('[', C.yellow('FIXD'), ']', p['name'])
            cx[2] += 1
            f = open(p['FILE'], 'w', encoding='utf-8')
            del p['FILE']
            f.write(jsonify(p))
            f.close()
        else:
            print('How can that be?')
    print('{} people checked, {} ok, {} fixed, {} failed'.format(\
     C.bold(cx[0] + cx[1] + cx[2]),
     C.blue(cx[0]),
     C.yellow(cx[2]),
     C.red(cx[1])))
Exemple #27
0
def report(s, r):
	"""Emit '[ STATUS ] s' for result code *r* and hand *r* back."""
	statuses = (C.blue('PASS'), C.red('FAIL'), C.yellow('FIXD'))
	# report everything except silent passes in non-verbose mode
	if r != 0 or verbose:
		print('[ {} ] {}'.format(statuses[r], s))
	return r
Exemple #28
0
def report(s, r):
    """Report code *r* with message *s* rendered as simple Latin; return *r*."""
    labels = (C.blue('PASS'), C.red('FAIL'), C.yellow('FIXD'))
    # show only non-PASS results unless verbose mode is on
    if r != 0 or verbose:
        print('[ {} ] {}'.format(labels[r], simpleLatin(s)))
    return r
Exemple #29
0
def checkon(fn, o):
	"""Derive a (town, state, country) address for entry *o* from its DBLP title.

	Looks the entry's dblpkey up in the global procs dump, parses the
	proceedings title for a known country or US state, applies a pile of
	geographic special cases, and writes the result into o's 'address'.
	Returns 0 (address already correct), 1 (failure), or 2 (file rewritten).
	"""
	if 'dblpkey' not in o.json.keys():
		print('[ {} ] {}'.format(C.red('DONT'), 'DBLP key not found on the entry'))
		return 1
	mykey = o.get('dblpkey')
	# for the rare case of multiple dblpkeys
	# (can happen as a DBLP error or when same proceedings span over multiple volumes)
	if isinstance(mykey, list):
		mykey = mykey[0]
	if mykey not in procs.keys():
		print('[ {} ] {}'.format(C.red('DONT'), 'DBLP key not found in the dump'))
		return 1
	title = procs[mykey]
	if title.endswith('.'):
		title = title[:-1]
	# split the title into comma-separated chunks; the location is at the end
	ws = title.replace(' - ', ', ').replace(' (', ', ').split(', ')
	country = findOneIn(knownCountries, ws)
	state = findOneIn(usaStateNames, ws)
	found = False
	if country:
		# the town conventionally precedes the country in the title
		town = ws[ws.index(country)-1]
		state = '?'
		# what if "town" is an USA state? (full)
		if country == 'USA' and town in usaStateNames:
			state = town
			town = ws[ws.index(town)-1]
		# what if "town" is an USA state? (abbreviated)
		if country == 'USA' and town in usaStateAB:
			state = usaStateNames[usaStateAB.index(town)]
			town = ws[ws.index(town)-1]
		# what if "town" is a Canadian state? (full)
		if country == 'Canada' and town in canStateNames:
			state = town
			town = ws[ws.index(town)-1]
		# what if "town" is a Canadian state? (abbreviated)
		if country == 'Canada' and town in canStateAB:
			state = canStateNames[canStateAB.index(town)]
			town = ws[ws.index(town)-1]
		# the same can happen in the UK
		if country in ('UK', 'United Kingdom') and town in ('Scotland', 'Scottland'):
			state = town
			town = ws[ws.index(town)-1]
		# Georgia the country vs Georgia the state
		if country == 'Georgia' and town == 'Atlanta':
			state = country
			country = 'USA'
		# near Something
		if town.startswith('near '):
			town = ws[ws.index(town)-1]
		# Luxembourg, Luxembourg
		if country == 'Luxembourg':
			town = 'Luxembourg'
		# Saint-Malo / St. Malo
		if country == 'France' and town == 'St. Malo':
			town = 'Saint-Malo'
		# Florence / Firenze
		if country == 'Italy' and town.find('Firenze') > -1:
			town = 'Florence'
		found = True
	elif state:
		# a bare US state with no country implies the USA
		country = 'USA'
		town = ws[ws.index(state)-1]
		found = True
	else:
		# desperate times
		for sol in desperateSolutions.keys():
			if sol in ws:
				town, state, country = desperateSolutions[sol]
				found = True
	# normalise
	if country in countryMap.keys():
		country = countryMap[country]
	if country == 'United Kingdom' and state == '?':
		if town.endswith('London') or town in ('Birmingham', 'York',\
		'Coventry', 'Nottingham', 'Lancaster', 'Oxford', 'Manchester',\
		'Southampton', 'Norwich', 'Leicester', 'Canterbury'):
			state = 'England'
		elif town in ('Edinburgh', 'Glasgow'):
			state = 'Scotland'
	# report
	if 'address' in o.json.keys():
		print('[ {} ] {}'.format(C.blue('OLDA'), o.get('address')))
	if 'location' in o.json.keys():
		print('[ {} ] {}'.format(C.blue('OLDL'), o.get('location')))
	if found:
		# print('[ {} ] {}'.format(C.blue('KNOW'), country))
		print('[ {} ] {}'.format(C.blue('AD||'), title))
		print('[ {} ] {:30} || {:30} || {:20}'.format(C.blue('AD->'), C.yellow(town), C.yellow(state), C.yellow(country)))
		# TODO: perhaps later we can act more aggressively
		newaddr = [town, '' if state=='?' else state, country]
		if 'address' not in o.json.keys() or newaddr != o.json['address']:
			o.json['address'] = newaddr
			# NOTE(review): file written without an explicit encoding — confirm
			# the environment default is utf-8 before relying on this
			f = open(o.json['FILE'], 'w')
			f.write(o.getJSON())
			f.close()
			return 2
		# nothing changed
		return 0
	print('[ {} ] {}'.format(C.yellow('AD??'), title))
	return 1
Exemple #30
0
             '</em>') if 'namelong' in tagdef.keys() else ''
     links = '<strong>{}</strong>{}<hr/>'.format(title, subt) + '\n'.join(
         sorted(links))
     dl = '<dl class="toc">' + '\n'.join(lst) + '</dl>'
     # hack to get from tags to papers
     dl = dl.replace('href="', 'href="../')
     f.write(tagHTML.format(\
      title=key+' tag',
      etag=escape(key),
      tag=key,
      above='',
      boxlinks=links,
      listname='{} papers'.format(len(lst)),
      dl=dl))
     f.close()
 print('Tag pages:', C.yellow('{}'.format(len(ts))), C.blue('generated'))
 # tag index
 f = open(outputdir + '/tag/index.html', 'w', encoding='utf-8')
 keyz = [q for q in ts.keys() if len(ts[q]) > 2]
 keyz.sort(key=lambda t: len(ts[t]), reverse=True)
 lst = [
     '<li>#<a href="{}.html">{}</a> ({})</li>'.format(
         escape(t), t, len(ts[t])) for t in keyz
 ]
 ul = '<ul class="tri mul">' + '\n'.join(lst) + '</ul>'
 CX = sum([len(ts[t]) for t in ts.keys()])
 f.write(taglistHTML.format(\
  title='All known tags',
  listname='{} tags known from {} markings'.format(len(ts), CX),
  ul=ul))
 f.close()
Exemple #31
0
def report(fn1, fn2, r):
	"""Report a rename-check result *r* for the pair (fn1, fn2); return *r*."""
	statuses = (C.blue(' PASS '), C.red(' FAIL '), C.yellow('RENAME'))
	# non-verbose mode by default: passes are hidden
	if r != 0 or verbose:
		print('[ {} ] {} → {}'.format(statuses[r], fn1, fn2))
	return r
Exemple #32
0
def main():
    """Generate the whole static site: the index, one page per venue, brand,
    conference and paper, the brand icon line-up, the about page, and the
    DBLP sync table. Writes everything under ``outputdir``."""
    print('{}: {} venues, {} papers\n{}'.format(C.purple('BibSLEIGH'),
                                                C.red(len(sleigh.venues)),
                                                C.red(sleigh.numOfPapers()),
                                                C.purple('=' * 42)))
    # generate the index
    f = open(outputdir + '/index.html', 'w', encoding='utf-8')
    f.write(sleigh.getPage())
    f.close()
    # generate all individual pages
    # if False:
    for v in sleigh.venues:
        # r accumulates a one-line progress report for this venue
        r = C.blue(v.getKey())
        f = open(outputdir + '/' + v.getKey() + '.html', 'w', encoding='utf-8')
        f.write(v.getPage())
        f.close()
        if v.brands:
            r += '{' + '+'.join([C.blue(b.getKey()) for b in v.brands]) + '}'
            for b in v.brands:
                f = open(outputdir + '/' + b.getKey() + '.brand.html',
                         'w',
                         encoding='utf-8')
                f.write(b.getPage())
                f.close()
        r += ' => '
        for c in v.getConfs():
            f = open(outputdir + '/' + c.getKey() + '.html',
                     'w',
                     encoding='utf-8')
            f.write(c.getPage())
            f.close()
            for p in c.papers:
                f = open(outputdir + '/' + p.getKey() + '.html',
                         'w',
                         encoding='utf-8')
                f.write(p.getPage())
                f.close()
            # strip the venue prefix off the conference key for the report
            purekey = c.getKey().replace(v.getKey(), '').replace('-',
                                                                 ' ').strip()
            r += '{} [{}], '.format(purekey, C.yellow(len(c.papers)))
        print(r)
    # generate the icon lineup
    icons = []
    linked = []
    # base names of all PNGs in the stuff/ directory
    pngs = [
        lastSlash(png).split('.')[0]
        for png in glob.glob(outputdir + '/stuff/*.png')
    ]
    # drop author/person/icon images and a few known non-brand assets
    pngs = [png for png in pngs \
            if not (png.startswith('a-') or png.startswith('p-') or png.startswith('ico-')
                    or png in ('cc-by', 'xhtml', 'css', 'open-knowledge', 'edit'))]
    for brand in glob.glob(outputdir + '/*.brand.html'):
        pure = lastSlash(brand).split('.')[0]
        img = pure.lower().replace(' ', '')
        if img in pngs:
            pic = '<div class="wider"><a href="{0}.brand.html"><img class="abc" src="{1}" alt="{0}"/></a><span>{0}</span></div>'.format( \
                pure,
                'stuff/' + img + '.png')
            pngs.remove(img)
            icons.append(pic)
        else:
            # print('No image for', pure)
            pass
    # special-case mapping from PNG base name to venue page name
    corner = {
        'ada': 'TRI-Ada',
        'comparch': 'CompArch',
        'floc': 'FLoC',
        'bibsleigh': 'index'
    }
    # remaining PNGs had no .brand.html page: try to find any matching page
    for pure in pngs:
        venueCandidate = corner[pure] if pure in corner else pure.upper()
        canlink = sorted(glob.glob(outputdir + '/' + venueCandidate +
                                   '*.html'),
                         key=len)
        if canlink:
            # NOTE(review): the format string only uses {0}-{2}; the fourth
            # argument below is unused
            pic = '<div class="wider"><a href="{0}"><img class="abc" src="stuff/{1}.png" alt="{2}"/></a><span>{2}</span></div>'.format( \
                canlink[0].split('/')[-1],
                pure,
                venueCandidate,
                canlink[0].split('/')[0])
        elif pure == 'twitter':
            pic = '<div class="wider"><a href="https://about.twitter.com/company/brand-assets"><img class="abc" src="stuff/twitter.png" alt="Twitter"/></a><span>Twitter</span></div>'
        elif pure == 'email':
            pic = '<div class="wider"><a href="mailto:[email protected]"><img class="abc" src="stuff/email.png" alt="e-mail"/></a><span>email</span></div>'
        else:
            print('Lonely', pure)
            pic = '<img class="abc" src="stuff/{0}.png" alt="{0}"/>'.format(
                pure)
        icons.append(pic)
    # find last year of each venue
    # for ven in glob.glob(corpusdir + '/*'):
    # 	venname = lastSlash(ven)
    # 	newstuff += '<strong><a href="http://dblp.uni-trier.de/db/conf/{}/">{} {}</a></strong>, '.format(venname.lower(), venname, nextYear(ven))
    # print(lastSlash(ven), ':', lastYear(ven))
    # write "more info" file
    f = open(outputdir + '/about.html', 'w', encoding='utf-8')
    f.write(
        aboutHTML.format(
            len(icons),
            '<div class="minibar">' + '\n'.join(sorted(icons)) + '</div>'))
    f.close()

    # generate the DBLP sync page
    cell_by_conf_by_year = {}
    Ys = [
        2020, 2019, 2018, 2017, 2016, 2015, 2014, 2013, 2012, 2011, 2010, 2009
    ]
    dblplinks = {}

    # each sync line is "name | corpus venue | dblp url"; '#' lines are comments
    with open(ienputdir + '/meta/dblpguide.sync', 'r') as f:
        for line in f:
            if not line or line.startswith('#'):
                continue
            words = line.split('|')
            if len(words) != 3:
                print('- Metaline {} skipped!'.format(words))
                continue
            name = words[0].strip()
            dome = words[1].strip()
            dblp = words[2].strip()
            cell_by_conf_by_year[name] = {}
            dblplinks[name] = dblp
            for y in Ys:
                cell_by_conf_by_year[name][y] = '(no)'
            v = sleigh.getVenue(dome)
            if v:
                for yy in Ys:
                    y = v.getYear(yy)
                    if y:
                        ckey = '{}-{}'.format(name, yy)
                        c = y.getConf(ckey)
                        if c:
                            cell_by_conf_by_year[name][yy] = c.getIconItem2(
                                '', '')
                        else:
                            # print('- Conference {} of year {} in venue {} not found in the corpus'.format(ckey, yy, name))
                            # fall back to known key variants before giving up
                            for alt in 'v1', 'p1', 'c1', '1', 'J':
                                ckey = '{}-{}-{}'.format(name, alt, yy)
                                c = y.getConf(ckey)
                                if c:
                                    cell_by_conf_by_year[name][
                                        yy] = c.getIconItem2('', '')
                                    break
                # else:
                # 	print('- Year {} in venue {} not found in the corpus among {}'.format(yy, name, [z.year for z in v.years]))
        # else:
        # 	print('- Venue {} not found in the corpus'.format(name))

    # render the per-venue, per-year grid as an HTML table
    table = '<table>'
    table += '<tr><td></td>'
    for y in Ys:
        table += '<th>{}</th>\n'.format(y)
    table += '</tr>'
    # print (cell_by_conf_by_year)
    for name in sorted(cell_by_conf_by_year.keys()):
        table += '<tr><th><a href="{}.brand.html">[@]</a> <a href="{}">{}</a></th>'.format(
            name, dblplinks[name], name)
        for y in Ys:
            table += '<td>{}</td>\n'.format(cell_by_conf_by_year[name][y])
        table += '</tr>'
    table += '</table>'

    with open(outputdir + '/sync.html', 'w', encoding='utf-8') as f:
        f.write(syncHTML.format(table))

    print('{}\nDone with {} venues, {} papers.'.format(
        C.purple('=' * 42), C.red(len(sleigh.venues)),
        C.red(sleigh.numOfPapers())))
Exemple #33
0
		C.purple('BibSLEIGH'),
		C.red(len(sleigh.venues)),
		C.red(sleigh.numOfPapers()),
		C.purple('='*42)))
	# read the CSV
	f = open('scrap-committees/scraped-by-grammarware.csv', 'r')
	# CBSE;2001;Heinz;Schmidt;;Organising Committee
	for line in f.readlines():
		vs = line.strip().split(';')
		if len(vs) == 0:
			continue
		v = vs[0] + '-' + vs[1]
		n = vs[2] + ' ' + vs[3]
		# normalise!
		if n in renameto.keys():
			print('[', C.yellow('ALIA'), ']', 'Treating', n, 'as', renameto[n])
			n = renameto[n]
		# sex is ignored, mostly absent anyway
		r = vs[5]
		if v not in roles.keys():
			roles[v] = []
		# NB: the next line uses lists for the sake of JSON compatibility
		roles[v].append([n,r])
	f.close()
	print('Metadata on {} editions loaded with {} role assignments'.format(C.red(len(roles)), C.red(sum([len(roles[k]) for k in roles.keys()]))))
	# now add
	cx = {0: 0, 1: 0, 2: 0}
	for v in sleigh.venues:
		for c in v.getConfs():
			cx[checkreport(c.filename, c)] += 1
			for p in c.papers:
Exemple #34
0
           + ' \n'.join(['<span class="tag"><a href="../word/{0}.html">{0}</a></span> ({1})'.format(S, stems[S]) \
            for S in stemkeys[:10]])
         boxlinks += adds
         # combine boxlinks
         if boxlinks:
             boxlinks = '<div class="tbox">' + boxlinks + '</div>'
     f.write(personHTML.format(\
      title=k,
      gender=gender,
      boxlinks=boxlinks,
      eperson=escape(k),
      person=persondef['name'],
      # boxlinks=links
      namedlists=dls))
     f.close()
 print('Person pages:', C.yellow('{}'.format(len(ps))), C.blue('generated'))
 # person index
 # keyz = [k for k in ps.keys() if len(ts[k]) > 2]
 # keyz = sorted(keyz, key=lambda t:len(ts[t]), reverse=True)
 keyz = ps  #sorted(ps.keys())
 letters = [chr(x) for x in range(ord('a'), ord('z') + 1)]
 indices = {x: [] for x in letters}
 for t in keyz:
     ws = t.split('_')
     i = -1
     if ws[i] == 'Jr':
         i -= 1
     letter = ws[i][0].lower()
     if not letter.isalpha():
         print(C.red('ERROR') + ':', 'wrong name', t)
         letter = ws[i - 1][0].lower()
Exemple #35
0
def checkon(fn, o):
	"""Canonicalise naming metadata of one corpus entry.

	fn -- path of the entry's JSON file (with or without the extension);
	o  -- the parsed entry object, mutated in place.
	Returns 0 if the file was already canonical, 2 if it was rewritten,
	1 if the on-disk version disagrees with the object (left untouched).
	"""
	if not os.path.exists(fn) or os.path.isdir(fn):
		fn = fn + '.json'
	# fix: read as UTF-8 explicitly (the sibling version of this function
	# already does); the locale default breaks on non-ASCII titles.
	# Strip the surrounding braces: only the body lines are compared.
	with open(fn, 'r', encoding='utf-8') as f:
		lines = f.readlines()[1:-1]
	flines = json2lines(lines)
	plines = sorted(json2lines(o.getJSON().split('\n')))
	# bad variants
	for bad in unfoldName:
		for key in wheretolook:
			if o.get(key) == bad:
				o.json[key] = unfoldName[bad]
	# contractions
	for short in short2long:
		for key in wheretolook:
			if o.get(key) == short:
				o.json[key] = short2long[short]
			if o.get(key) == short2long[short]:
				o.json[key+'short'] = short
	# a heuristic contraction for conference names
	if o.get('type') == 'inproceedings' \
	and 'booktitleshort' not in o.json.keys() \
	and 'booktitle' in o.up().json.keys() \
	and len(o.get('booktitle')) > len(o.up().get('booktitle')):
		o.json['booktitleshort'] = o.up().get('booktitle')
	# a heuristic expansion of conference names
	# if o.get('type') == 'proceedings' \
	# and 'booktitleshort' not in o.json.keys() \
	# and 'booktitle' in o.up().json.keys() \
	# and len(o.get('booktitle')) > len(o.up().get('booktitle')):
	# 	o.json['booktitleshort'] = o.up().get('booktitle')
	# remove faulty series: journal wins
	if 'series' in o.json and 'journal' in o.json and o.get('series') == o.get('journal'):
		del o.json['series']
	# *short legacy while no longer version present
	for key in [k for k in o.json.keys() if k.endswith('short') and k[:-5] not in o.json.keys()]:
		del o.json[key]
	# Springer name change: pick the imprint name by year (cut-off: 2002)
	if o.get('publisher').find('Springer') > -1 and 'year' in o.json.keys():
		if int(o.get('year')) < 2002:
			o.json['publisher'] = 'Springer-Verlag'
			o.json['publishershort'] = 'Springer'
		else:
			o.json['publisher'] = 'Springer International Publishing'
			o.json['publishershort'] = 'Springer'
	for key in wheretolook:
		if key not in o.json:
			continue
		val = o.get(key)
		# ends with a dot
		if val.endswith('.'):
			o.json[key] = o.json[key][:-1]
			continue
		# suspiciousness: a dot inside the value is usually a truncation,
		# unless it matches one of the known benign patterns below
		if val.find('.') > -1:
			problem = True
			for ok in ('. Volume', 'CEUR-WS.org', 'icml.cc', 'JMLR.org', 'Vol. ', '. Part', \
				' Inc. ', 'WG2.8'):
				if val.find(ok) > -1:
					problem = False
					break
			if problem:
				report(C.yellow('LOOK'), key + ' of ' + o.getKey() + ' is “' + o.get(key) + '”')
		# superfluousness
		if key+'short' in o.json.keys() and val == o.get(key+'short'):
			del o.json[key+'short']
	nlines = sorted(json2lines(o.getJSON().split('\n')))
	if flines != plines:
		return 1
	elif plines != nlines:
		# fix: write back as UTF-8 explicitly as well
		with open(fn, 'w', encoding='utf-8') as f:
			f.write(o.getJSON())
		return 2
	else:
		return 0
Exemple #36
0
def checkon(fn, o):
	"""Canonicalise naming metadata of one corpus entry.

	fn -- path of the entry's JSON file (with or without the extension);
	o  -- the parsed entry object, mutated in place.
	Returns 0 (already canonical), 2 (rewritten on disk),
	1 (on-disk version disagrees with the object; left untouched).
	"""
	if not os.path.exists(fn) or os.path.isdir(fn):
		fn = fn + '.json'
	# strip the surrounding braces: only the body lines are compared
	f = open(fn, 'r', encoding='utf-8')
	lines = f.readlines()[1:-1]
	f.close()
	flines = json2lines(lines)
	plines = sorted(json2lines(o.getJSON().split('\n')))
	# bad variants
	for bad in unfoldName:
		for key in wheretolook:
			if o.get(key) == bad:
				o.json[key] = unfoldName[bad]
	# contractions
	for short in short2long:
		for key in wheretolook:
			if o.get(key) == short:
				o.json[key] = short2long[short]
			if o.get(key) == short2long[short]:
				o.json[key+'short'] = short
	# a heuristic contraction for conference names
	if o.get('type') == 'inproceedings' \
	and 'booktitleshort' not in o.json.keys() \
	and 'booktitle' in o.up().json.keys() \
	and len(o.get('booktitle')) > len(o.up().get('booktitle')):
		o.json['booktitleshort'] = o.up().get('booktitle')
	# a heuristic expansion of conference names
	# if o.get('type') == 'proceedings' \
	# and 'booktitleshort' not in o.json.keys() \
	# and 'booktitle' in o.up().json.keys() \
	# and len(o.get('booktitle')) > len(o.up().get('booktitle')):
	# 	o.json['booktitleshort'] = o.up().get('booktitle')
	# remove faulty series: journal wins
	if 'series' in o.json and 'journal' in o.json and o.get('series') == o.get('journal'):
		del o.json['series']
	# *short legacy while no longer version present
	for key in [k for k in o.json.keys() if k.endswith('short') and k[:-5] not in o.json.keys()]:
		del o.json[key]
	# Springer name change: pick the imprint name by year (cut-off: 2002)
	if o.get('publisher').find('Springer') > -1 and 'year' in o.json.keys():
		if int(o.get('year')) < 2002:
			o.json['publisher'] = 'Springer-Verlag'
			o.json['publishershort'] = 'Springer'
		else:
			o.json['publisher'] = 'Springer International Publishing'
			o.json['publishershort'] = 'Springer'
	for key in wheretolook:
		if key not in o.json:
			continue
		val = o.get(key)
		# ends with a dot
		if val.endswith('.'):
			o.json[key] = o.json[key][:-1]
			continue
		# suspiciousness: an inner dot is reported unless it matches one of
		# the known benign patterns below
		if val.find('.') > -1:
			problem = True
			for ok in ('. Volume', 'CEUR-WS.org', 'icml.cc', 'JMLR.org', 'Vol. ', '. Part', \
				' Inc. ', 'WG2.8'):
				if val.find(ok) > -1:
					problem = False
					break
			if problem:
				report(C.yellow('LOOK'), key + ' of ' + o.getKey() + ' is “' + o.get(key) + '”')
		# superfluousness
		if key+'short' in o.json.keys() and val == o.get(key+'short'):
			del o.json[key+'short']
	nlines = sorted(json2lines(o.getJSON().split('\n')))
	if flines != plines:
		return 1
	elif plines != nlines:
		f = open(fn, 'w', encoding='utf-8')
		f.write(o.getJSON())
		f.close()
		return 2
	else:
		return 0
Exemple #37
0
def checkon(fn, o):
    """Normalise link metadata ("url"/"ee") of one corpus entry.

    Drops DBLP-internal urls, extracts DOIs / ACM ids / IEEE ids out of
    known "ee" link shapes, and rewrites the entry file if anything changed.
    Returns 0 (unchanged), 2 (fixed and rewritten),
    1 (on-disk version disagrees with the object; left untouched).
    """
    if not os.path.exists(fn) or os.path.isdir(fn):
        fn = fn + '.json'
    # strip the surrounding braces: only the body lines are compared
    f = open(fn, 'r', encoding='utf-8')
    lines = f.readlines()[1:-1]
    f.close()
    flines = json2lines(lines)
    plines = sorted(json2lines(o.getJSON().split('\n')))
    # "url" from DBLP are useless
    if 'url' in o.json.keys():
        o.json['url'] = [link.replace('https://', 'http://')\
            for link in listify(o.json['url'])\
             if not link.startswith('db/conf/')\
             and not link.startswith('db/series/')\
             and not link.startswith('db/books/')\
            and not link.startswith('db/journals/')]
        if not o.json['url']:
            del o.json['url']
        elif len(o.json['url']) == 1:
            # collapse a one-element list back to a plain string
            o.json['url'] = o.json['url'][0]
    # harvest identifiers from "ee" links only while no DOI is known yet
    if 'ee' in o.json.keys() and 'doi' not in o.json.keys():
        if isinstance(o.json['ee'], list):
            if verbose:
                print(C.red('Manylink:'), o.json['ee'])
        # links that carry no extractable id are preserved in newee
        newee = []
        for onelink in listify(o.json['ee']):
            # NOTE(review): the slice offsets (18, 19, 35, ...) are the
            # lengths of the literal prefixes matched just above
            if onelink.startswith('http://dx.doi.org/'):
                o.json['doi'] = onelink[18:]
            elif onelink.startswith('http://doi.acm.org/'):
                o.json['doi'] = onelink[19:]
            elif onelink.startswith('http://doi.ieeecomputersociety.org/'):
                o.json['doi'] = onelink[35:]
            elif onelink.startswith('http://dl.acm.org/citation.cfm?id='):
                o.json['acmid'] = onelink[34:]
            elif onelink.startswith('http://portal.acm.org/citation.cfm?id='):
                o.json['acmid'] = onelink[38:]
            elif onelink.startswith('http://ieeexplore.ieee.org/xpl/freeabs_all.jsp?arnumber=')\
              or onelink.startswith('http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber='):
                o.json['ieeearid'] = onelink.split('=')[-1]
            elif onelink.startswith('http://ieeexplore.ieee.org/xpls/abs_all.jsp?isnumber=')\
             and onelink.find('arnumber') > -1:
                o.json['ieeearid'] = onelink.split('arnumber=')[-1].split(
                    '&')[0]
            elif onelink.startswith(
                    'http://ieeexplore.ieee.org/xpl/mostRecentIssue.jsp?punumber='
            ):
                o.json['ieeepuid'] = onelink.split('=')[-1]
            elif onelink.startswith(
                    'http://ieeexplore.ieee.org/xpl/tocresult.jsp?isnumber='):
                o.json['ieeeisid'] = onelink.split('=')[-1]
            elif onelink.startswith(
                    'http://eceasst.cs.tu-berlin.de/index.php/eceasst/article/view/'
            ):
                newee.append(
                    'http://journal.ub.tu-berlin.de/eceasst/article/view/' +
                    onelink.split('/')[-1])
            elif onelink.endswith('.pdf') and \
                (onelink.startswith('http://computer.org/proceedings/')\
              or onelink.startswith('http://csdl.computer.org/')):
                # Bad: http://computer.org/proceedings/icsm/1189/11890007.pdf
                # Bad: http://csdl.computer.org/comp/proceedings/date/2003/1870/02/187020040.pdf
                # Good: http://www.computer.org/csdl/proceedings/icsm/2001/1189/00/11890004.pdf
                if onelink.startswith('http://csdl'):
                    cname, _, cid, mid, pid = onelink.split('/')[5:10]
                else:
                    cname, cid, pid = onelink.split('/')[4:7]
                    # heuristic
                    if pid.startswith(cid):
                        mid = pid[len(cid):len(cid) + 2]
                    else:
                        mid = '00'
                newee.append('http://www.computer.org/csdl/proceedings/{}/{}/{}/{}/{}'.format(\
                 cname,
                 o.get('year'),
                 cid,
                 mid,
                 pid))
            else:
                if onelink.find('ieee') > -1:
                    print(C.purple('IEEE'), onelink)
                if verbose:
                    print(C.yellow('Missed opportunity:'), onelink)
                # nothing matches => preserve
                newee.append(onelink)
        if len(newee) == 0:
            del o.json['ee']
        elif len(newee) == 1:
            o.json['ee'] = newee[0]
        else:
            o.json['ee'] = newee
        # post-processing normalisation
        if 'acmid' in o.json.keys() and not isinstance(
                o.json['acmid'], int) and o.json['acmid'].isdigit():
            o.json['acmid'] = int(o.json['acmid'])
    # legacy key spelling: eventuri -> eventurl
    if 'eventuri' in o.json.keys():
        o.json['eventurl'] = o.json['eventuri']
        del o.json['eventuri']
    if 'eventurl' in o.json.keys() and o.json['eventurl'].startswith(
            'https://'):
        o.json['eventurl'] = o.json['eventurl'].replace('https://', 'http://')
    nlines = sorted(json2lines(o.getJSON().split('\n')))
    if flines != plines:
        return 1
    elif plines != nlines:
        f = open(fn, 'w', encoding='utf-8')
        f.write(o.getJSON())
        f.close()
        return 2
    else:
        return 0
	# flatten conferences for easy lookup
	knownConfs = []
	for v in sleigh.venues:
		for c in v.getConfs():
			knownConfs.append(c.getKey())
	# print(knownConfs)
	print(C.purple('BibSLEIGH flattened to {} entities'.format(len(knownConfs))))
	# compressed error output
	dunno = []
	# Conference;Year;First Name;Last Name;Sex;Role
	for line in csv:
		name = (line[2] + ' ' + line[3]).strip()
		if name in established.keys():
			name = established[name]
		if name in renameto.keys():
			print('[', C.yellow('ALIA'), ']', 'Treating', name, 'as', renameto[name])
			established[name] = renameto[name]
			name = renameto[name]
		if name not in peoplekeys:
			# not really needed, but just for the sake of wider applicability in the future
			ndl = nomidnames(nodiaLatin(name)).lower()
			f = None
			for k in peoplekeys:
				if nomidnames(nodiaLatin(k)).lower() == ndl:
					f = k
					break
			if not f:
				if name not in dunno:
					print('[', C.red('PERS'), ']', 'Unacquainted with', name)
					dunno.append(name)
				continue
Exemple #39
0
if __name__ == "__main__":
    # Entry point: stem every paper and brand in the corpus, then dump the
    # approved stems to stems.json and print a summary.
    verbose = sys.argv[-1] == '-v'
    peoplez = glob.glob(ienputdir + '/people/*.json')
    print('{}: {} venues, {} papers by {} people\n{}'.format(\
     C.purple('BibSLEIGH'),
     C.red(len(sleigh.venues)),
     C.red(sleigh.numOfPapers()),
     C.red(len(peoplez)),
     C.purple('='*42)))
    # cx tallies check outcomes per return code: 0 = ok, 1 = failed, 2 = fixed
    cx = {0: 0, 1: 0, 2: 0}
    # stem ALL the papers!
    for v in sleigh.venues:
        for c in v.getConfs():
            for p in c.papers:
                cx[checkreport(p.filename, p, None)] += 1
        for b in v.getBrands():
            cx[checkreport(b.filename, None, b)] += 1
    # write all stems
    # NOTE(review): two(len(w)) presumably pads the length so the sort is by
    # length first, then alphabetically — confirm against two()'s definition
    listOfStems = sorted(filter(ifApproved, ALLSTEMS),
                         key=lambda w: two(len(w)) + w)
    f = open(ienputdir + '/stems.json', 'w', encoding='utf-8')
    f.write('[\n\t"' + '",\n\t"'.join(listOfStems) + '"\n]')
    f.close()
    print(C.red(len(ALLSTEMS)), 'stems found.')
    print('{} files checked, {} ok, {} fixed, {} failed'.format(\
     C.bold(cx[0] + cx[1] + cx[2]),
     C.blue(cx[0]),
     C.yellow(cx[2]),
     C.red(cx[1])))
Exemple #40
0
def checkon(fn, o):
    """Infer the address (town, state, country) of a proceedings entry.

    Looks up the entry's DBLP key in the ``procs`` title dump, parses the
    title's comma-separated tail for a known country or US state, and applies
    a series of venue-specific heuristics. Writes the entry file back when
    the derived address differs from the stored one.
    Returns 0 (address already correct), 2 (written back), 1 (not found).

    NOTE(review): fn is unused here; the target file comes from o.json['FILE'].
    """
    if 'dblpkey' not in o.json.keys():
        print('[ {} ] {}'.format(C.red('DONT'),
                                 'DBLP key not found on the entry'))
        return 1
    mykey = o.get('dblpkey')
    # for the rare case of multiple dblpkeys
    # (can happen as a DBLP error or when same proceedings span over multiple volumes)
    if isinstance(mykey, list):
        mykey = mykey[0]
    if mykey not in procs.keys():
        print('[ {} ] {}'.format(C.red('DONT'),
                                 'DBLP key not found in the dump'))
        return 1
    title = procs[mykey]
    if title.endswith('.'):
        title = title[:-1]
    # split the title into comma-separated words; dashes and opening
    # parentheses are treated as separators too
    ws = title.replace(' - ', ', ').replace(' (', ', ').split(', ')
    country = findOneIn(knownCountries, ws)
    state = findOneIn(usaStateNames, ws)
    found = False
    if country:
        # assume the word before the country is the town, then correct below
        town = ws[ws.index(country) - 1]
        state = '?'
        # what if "town" is an USA state? (full)
        if country == 'USA' and town in usaStateNames:
            state = town
            town = ws[ws.index(town) - 1]
        # what if "town" is an USA state? (abbreviated)
        if country == 'USA' and town in usaStateAB:
            state = usaStateNames[usaStateAB.index(town)]
            town = ws[ws.index(town) - 1]
        # what if "town" is a Canadian state? (full)
        if country == 'Canada' and town in canStateNames:
            state = town
            town = ws[ws.index(town) - 1]
        # what if "town" is a Canadian state? (abbreviated)
        if country == 'Canada' and town in canStateAB:
            state = canStateNames[canStateAB.index(town)]
            town = ws[ws.index(town) - 1]
        # the same can happen in the UK
        if country in ('UK', 'United Kingdom') and town in ('Scotland',
                                                            'Scottland'):
            state = town
            town = ws[ws.index(town) - 1]
        # Georgia the country vs Georgia the state
        if country == 'Georgia' and town == 'Atlanta':
            state = country
            country = 'USA'
        # near Something
        if town.startswith('near '):
            town = ws[ws.index(town) - 1]
        # Luxembourg, Luxembourg
        if country == 'Luxembourg':
            town = 'Luxembourg'
        # Saint-Malo / St. Malo
        if country == 'France' and town == 'St. Malo':
            town = 'Saint-Malo'
        # Florence / Firenze
        if country == 'Italy' and town.find('Firenze') > -1:
            town = 'Florence'
        found = True
    elif state:
        # a US state with no explicit country implies the USA
        country = 'USA'
        town = ws[ws.index(state) - 1]
        found = True
    else:
        # desperate times
        for sol in desperateSolutions.keys():
            if sol in ws:
                town, state, country = desperateSolutions[sol]
                found = True
    # normalise
    if country in countryMap.keys():
        country = countryMap[country]
    if country == 'United Kingdom' and state == '?':
        # fill in the constituent country for well-known towns
        if town.endswith('London') or town in ('Birmingham', 'York',\
        'Coventry', 'Nottingham', 'Lancaster', 'Oxford', 'Manchester',\
        'Southampton', 'Norwich', 'Leicester', 'Canterbury'):
            state = 'England'
        elif town in ('Edinburgh', 'Glasgow'):
            state = 'Scotland'
    # report
    if 'address' in o.json.keys():
        print('[ {} ] {}'.format(C.blue('OLDA'), o.get('address')))
    if 'location' in o.json.keys():
        print('[ {} ] {}'.format(C.blue('OLDL'), o.get('location')))
    if found:
        # print('[ {} ] {}'.format(C.blue('KNOW'), country))
        print('[ {} ] {}'.format(C.blue('AD||'), title))
        print('[ {} ] {:30} || {:30} || {:20}'.format(C.blue('AD->'),
                                                      C.yellow(town),
                                                      C.yellow(state),
                                                      C.yellow(country)))
        # TODO: perhaps later we can act more aggressively
        newaddr = [town, '' if state == '?' else state, country]
        if 'address' not in o.json.keys() or newaddr != o.json['address']:
            o.json['address'] = newaddr
            f = open(o.json['FILE'], 'w', encoding='utf-8')
            f.write(o.getJSON())
            f.close()
            return 2
        # nothing changed
        return 0
    print('[ {} ] {}'.format(C.yellow('AD??'), title))
    return 1