Ejemplo n.º 1
0
	def __init__(self, idir, name2file):
		"""Collect every venue found directly under ``idir``.

		``idir`` is globbed once for top-level ``*.json`` files and once for
		all entries; each entry that is not filtered out is wrapped in a
		``Venue`` and appended to ``self.venues``.  ``name2file`` is stored as
		``self.n2f`` and handed on to every ``Venue``.
		"""
		super(Sleigh, self).__init__('', idir)
		self.venues = []
		self.n2f = name2file
		# names listed here are announced and ignored; nothing is postponed now
		postponed = []
		# name before the first dot -> path, for each top-level JSON file
		topdefs = {}
		for path in glob.glob(idir+'/*.json'):
			stem = path.split('/')[-1].split('.')[0]
			if stem in postponed:
				print(C.red('Skipping') + ' ' + C.purple(path) + ' ' + C.red('for now'))
				continue
			topdefs[stem] = path
		for path in glob.glob(idir+'/*'):
			base = path.split('/')[-1]
			# markdown, JSON and the frem/edif entries never become venues
			ignore = path.endswith(('.md', '.json', '/frem', '/edif'))
			if base in postponed:
				print(C.red('Skipping') + ' ' + C.purple(path) + ' ' + C.red('for now'))
				ignore = True
			if ignore:
				continue
			if base not in topdefs:
				print(C.red('Legacy non-top definition of'), path)
			# with or without a top-level JSON, the entry becomes a venue
			self.venues.append(Venue(path, idir, name2file, self))
Ejemplo n.º 2
0
 def __init__(self, idir, name2file):
     """Collect the venues stored directly under ``idir``.

     Top-level ``*.json`` files are indexed by base name first; afterwards
     every other entry of ``idir`` is turned into a ``Venue`` unless it is
     filtered out.  Entries without a matching top-level JSON are reported
     as legacy and are skipped when they are named ``edif`` or ``frem``.
     """
     super(Sleigh, self).__init__('', idir)
     self.venues = []
     self.n2f = name2file
     # names listed here are announced and ignored; nothing is postponed now
     postponed = []
     # name before the first dot -> path, for each top-level JSON file
     topdefs = {}
     for path in glob.glob(idir + '/*.json'):
         stem = lastSlash(path).split('.')[0]
         if stem in postponed:
             print(
                 C.red('Skipping') + ' ' + C.purple(path) + ' ' +
                 C.red('for now'))
             continue
         topdefs[stem] = path
     for path in glob.glob(idir + '/*'):
         # markdown, JSON and the frem/edif entries never become venues
         ignore = path.endswith(('.md', '.json', '/frem', '/edif'))
         if path.split('/')[-1] in postponed:
             print(
                 C.red('Skipping') + ' ' + C.purple(path) + ' ' +
                 C.red('for now'))
             ignore = True
         if ignore:
             continue
         base = lastSlash(path)
         if base in topdefs:
             self.venues.append(Venue(path, idir, name2file, self))
             continue
         print(C.red('Legacy non-top definition of'), path)
         if base not in ('edif', 'frem'):
             self.venues.append(Venue(path, idir, name2file, self))
Ejemplo n.º 3
0
 def __init__(self, d, hdir, name2file, parent):
     """Load the venue stored in directory ``d``.

     The venue metadata comes from ``d + '.json'`` when that file exists;
     otherwise the first JSON inside ``d`` that parses to something
     non-empty is used.  Every further JSON inside ``d`` becomes a
     ``Brand``, every subdirectory a ``Year``, and each conference of a
     year is offered to all brands.  ``parent`` is kept as ``self.back``.
     """
     super(Venue, self).__init__(d, hdir)
     self.years = []
     self.brands = []
     self.n2f = name2file
     sidecar = d + '.json'
     if not os.path.exists(sidecar):
         # legacy style
         print(C.red(d), 'is legacy style')
         self.json = {}
     else:
         # new style
         self.json = parseJSON(sidecar)
     for jsonfile in glob.glob(d + '/*.json'):
         if self.json:
             self.brands.append(
                 Brand(jsonfile, self.homedir, name2file, self))
         else:
             # no metadata yet: this file describes the venue itself
             self.json = parseJSON(jsonfile)
     for entry in glob.glob(d + '/*'):
         if entry.endswith('.json'):
             # already processed
             continue
         if not os.path.isdir(entry):
             print('File out of place:', entry)
             continue
         year = Year(entry, self.homedir, name2file, self)
         self.years.append(year)
         for brand in self.brands:
             for conf in year.confs:
                 brand.offer(year.year, conf)
     self.back = parent
Ejemplo n.º 4
0
def checkreport(fn, o):
	"""Run ``checkon(fn, o)`` and print its verdict together with its message.

	``checkon`` is expected to return a ``(code, message)`` pair; the code
	selects the PASS/FAIL/WARN label.  The line is printed for every non-zero
	code, or for all codes when the global ``verbose`` is set.  The code is
	returned.
	"""
	outcome, message = checkon(fn, o)
	# non-verbose mode by default: passes stay silent
	if not verbose and outcome == 0:
		return outcome
	labels = (C.blue('PASS'), C.red('FAIL'), C.yellow('WARN'))
	print('[ {} ] {}: {}'.format(labels[outcome], fn, message))
	return outcome
Ejemplo n.º 5
0
def report(fn, r):
    """Print the status line for ``fn`` given result code ``r`` and return ``r``.

    Code 0 is PASS, 1 is FAIL (no crossref found) and 2 is UNEX (illegal
    crossref).  Passes are printed only when the global ``verbose`` is set.
    """
    # non-verbose mode by default
    if not verbose and r == 0:
        return r
    labels = (C.blue('PASS'), C.red('FAIL'), C.yellow('UNEX'))
    notes = ('', '- no crossref found!', '- illegal crossref')
    print('[ {} ] {} {}'.format(labels[r], fn, notes[r]))
    return r
Ejemplo n.º 6
0
def processSortedRel(r):
	"""Render a sorted relation as nested HTML definition lists.

	``r`` is a list of single-key dicts ``[{"x": Y}]`` where ``Y`` is a
	string, a list of strings, or again a sorted relation.  Strings are
	resolved through ``matchfromsleigh`` and listed in ``sortbypages`` order;
	the number of listed items is added to the global counter ``pcx``.

	Returns the HTML fragments of all elements joined by newlines (an empty
	string for an empty relation).  An element whose value has none of the
	supported shapes is reported and rendered with an empty body.
	"""
	global pcx
	acc = []
	for el in r:
		ename = list(el.keys())[0]
		evals = el[ename]
		icon = ename.lower()
		if os.path.isfile(outputdir + '/stuff/' + icon + '.png'):
			img = '<img src="../stuff/{1}.png" alt="{0}" width="30px"/> '.format(ename, icon)
		else:
			img = ''
		# an empty list has no first element to inspect, so it is treated
		# as an unrecognised structure instead of raising IndexError
		nonempty = isinstance(evals, list) and len(evals) > 0
		if isinstance(evals, str) or (nonempty and isinstance(evals[0], str)):
			plst = sorted(matchfromsleigh(sleigh, evals), key=sortbypages)
			pcx += len(plst)
			ptxt = '<dl class="toc">'+'\n'.join([p.getItem() for p in plst])+'</dl>'
		elif nonempty and isinstance(evals[0], dict):
			ptxt = processSortedRel(evals)
		else:
			print(C.red('ERROR:'), 'unrecornised bundle structure', evals)
			# never reuse the body of the previous element (or crash on the
			# first one) when the structure is not understood
			ptxt = ''
		acc.append('<dl><dt>{}{}</dt><dd>{}</dl>'.format(img, ename, ptxt))
	return '\n'.join(acc)
Ejemplo n.º 7
0
def checkon(fn, o):
	"""Compare the JSON file ``fn`` with the serialisation of ``o``.

	``fn`` gets a ``.json`` suffix when it does not exist or is a directory.
	The first and last line of both texts are ignored and the remaining
	lines are compared after ``strictstrip`` and sorting.

	Returns 0 when both sides agree, 1 when they differ (the differing lines
	are printed) or when the file was deleted because it carries a year
	beyond 3000, and 2 when the file was missing and a minimal one was
	created.
	"""
	if not os.path.exists(fn) or os.path.isdir(fn):
		fn = fn + '.json'
	if not os.path.exists(fn):
		# if it still does not exist, let us create a minimal one;
		# the text is built first so a failing findYear leaves no empty file
		base = fn.split('/')[-1]
		minimal = '{{\n\t"title": "{name}",\n\t"type": "proceedings",\n\t"year": {year}\n}}'.format(
			name=base[:-5].replace('-', ' '),
			year=findYear(base))
		with open(fn, 'w', encoding='utf-8') as f:
			f.write(minimal)
		print('[ {} ] {}'.format(C.yellow('MADE'), fn))
		return 2
	with open(fn, 'r', encoding='utf-8') as f:
		lines = f.readlines()[1:-1]
	for line in lines:
		if line.find('"year"') > -1 and findYear(line) > 3000:
			os.remove(fn)
			print('[ {} ] {}'.format(C.red('KILL'), fn))
			return 1
	flines = sorted([strictstrip(s) for s in lines])
	plines = sorted([strictstrip(s) for s in o.getJSON().split('\n')[1:-1]])
	if flines == plines:
		return 0
	f1 = [line for line in flines if line not in plines]
	f2 = [line for line in plines if line not in flines]
	print('∆:', f1, '\nvs', f2)
	return 1
Ejemplo n.º 8
0
def checkon(fn, o):
	"""Compare the JSON file ``fn`` with the serialisation of ``o``.

	``fn`` gets a ``.json`` suffix when it does not exist or is a directory.
	The first and last line of both texts are ignored and the remaining
	lines are compared after ``strictstrip`` and sorting.

	Returns 0 when both sides agree, 1 when they differ (the differing lines
	are printed) or when the file was deleted because it carries a year
	beyond 3000, and 2 when the file was missing and a minimal one was
	created.
	"""
	if not os.path.exists(fn) or os.path.isdir(fn):
		fn = fn + '.json'
	if not os.path.exists(fn):
		# if it still does not exist, let us create a minimal one;
		# the text is built first so a failing findYear leaves no empty file
		base = lastSlash(fn)
		minimal = '{{\n\t"title": "{name}",\n\t"type": "proceedings",\n\t"year": {year}\n}}'.format(
			name=base[:-5].replace('-', ' '),
			year=findYear(base))
		with open(fn, 'w', encoding='utf-8') as f:
			f.write(minimal)
		print('[ {} ] {}'.format(C.yellow('MADE'), fn))
		return 2
	# the context manager closes the handle even when reading fails
	with open(fn, 'r', encoding='utf-8') as f:
		lines = f.readlines()[1:-1]
	for line in lines:
		if line.find('"year"') > -1 and findYear(line) > 3000:
			os.remove(fn)
			print('[ {} ] {}'.format(C.red('KILL'), fn))
			return 1
	flines = sorted([strictstrip(s) for s in lines])
	plines = sorted([strictstrip(s) for s in o.getJSON().split('\n')[1:-1]])
	if flines == plines:
		return 0
	f1 = [line for line in flines if line not in plines]
	f2 = [line for line in plines if line not in flines]
	print('∆:', f1, '\nvs', f2)
	return 1
Ejemplo n.º 9
0
def checkreport(m, o):
	"""Check ``o`` against the model ``m`` via ``checkon`` and print the verdict.

	The status line names ``o.filename`` and is printed for every non-zero
	result, or for all results when the global ``verbose`` is set.  The
	``checkon`` result is returned unchanged.
	"""
	outcome = checkon(m, o)
	# non-verbose mode by default
	if not verbose and outcome == 0:
		return outcome
	labels = (C.blue('PASS'), C.red('FAIL'), C.yellow('FIXD'))
	print('[ {} ] {}'.format(labels[outcome], o.filename))
	return outcome
Ejemplo n.º 10
0
def report(fn, r):
	"""Print the status line for ``fn`` given result code ``r`` and return ``r``.

	Code 0 is PASS, 1 is FAIL (no crossref found) and 2 is UNEX (illegal
	crossref).  Passes are printed only when the global ``verbose`` is set.
	"""
	# non-verbose mode by default
	if not verbose and r == 0:
		return r
	labels = (C.blue('PASS'), C.red('FAIL'), C.yellow('UNEX'))
	notes = ('', '- no crossref found!', '- illegal crossref')
	print('[ {} ] {} {}'.format(labels[r], fn, notes[r]))
	return r
Ejemplo n.º 11
0
def checkreport(fn, o):
	"""Run ``checkon(fn, o)`` and pass the coloured verdict to ``report``.

	``report`` is called with the PASS/FAIL/FIXD label and ``fn`` for every
	non-zero result, or for all results when the global ``verbose`` is set.
	The ``checkon`` result is returned unchanged.
	"""
	outcome = checkon(fn, o)
	# non-verbose mode by default
	if not verbose and outcome == 0:
		return outcome
	labels = (C.blue('PASS'), C.red('FAIL'), C.yellow('FIXD'))
	report(labels[outcome], fn)
	return outcome
Ejemplo n.º 12
0
def checkreport(fn, o):
    """Run ``checkon(fn, o)`` and print a PASS/FAIL/FIXD line for ``fn``.

    The line is printed for every non-zero result, or for all results when
    the global ``verbose`` is set.  The ``checkon`` result is returned.
    """
    outcome = checkon(fn, o)
    # non-verbose mode by default
    if not verbose and outcome == 0:
        return outcome
    labels = (C.blue('PASS'), C.red('FAIL'), C.yellow('FIXD'))
    print('[ {} ] {}'.format(labels[outcome], fn))
    return outcome
Ejemplo n.º 13
0
	def __init__(self, d, hdir, name2file, parent):
		"""Load the venue stored in directory ``d``.

		The venue metadata comes from ``d + '.json'`` when that file exists;
		otherwise the first JSON inside ``d`` that parses to something
		non-empty is used.  Every further JSON inside ``d`` becomes a
		``Brand``, every subdirectory a ``Year``, and each conference of a
		year is offered to all brands.  ``parent`` is kept as ``self.back``.
		"""
		super(Venue, self).__init__(d, hdir)
		self.years = []
		self.brands = []
		self.n2f = name2file
		if os.path.exists(d+'.json'):
			# new style
			# print(C.blue(d), 'is new style')
			self.json = parseJSON(d+'.json')
		else:
			# legacy style
			print(C.red(d), 'is legacy style')
			# an empty dict, not a list: parseJSON yields dicts, so the
			# fallback must offer the same mapping interface when the
			# directory turns out to hold no JSON at all
			self.json = {}
		for f in glob.glob(d+'/*.json'):
			if not self.json:
				# no metadata yet: this file describes the venue itself
				self.json = parseJSON(f)
			else:
				self.brands.append(Brand(f, self.homedir, name2file, self))
		for f in glob.glob(d+'/*'):
			if f.endswith('.json'):
				# already processed
				continue
			elif os.path.isdir(f):
				y = Year(f, self.homedir, name2file, self)
				self.years.append(y)
				for b in self.brands:
					for c in y.confs:
						b.offer(y.year, c)
			else:
				print('File out of place:', f)
		self.back = parent
Ejemplo n.º 14
0
def parseJSON(fn):
	"""Load the JSON file ``fn`` and tag the result with its origin.

	The parsed object gets an extra ``'FILE'`` entry holding ``fn``.  When
	the content cannot be parsed (``ValueError``), a message naming the file
	(with backslashes shown as slashes) is printed and an empty dict is
	returned.
	"""
	# print('Parsing',fn,'...')
	try:
		# the context manager closes the handle even when parsing fails
		with open(fn, 'r', encoding='utf-8') as f:
			j = json.load(f)
	except ValueError:
		print(C.red('JSON parse error'), 'on', fn.replace('\\', '/'))
		return {}
	j['FILE'] = fn
	return j
Ejemplo n.º 15
0
def parseJSON(fn):
	"""Load the JSON file ``fn`` and tag the result with its origin.

	The file is read as UTF-8 regardless of the platform default.  The
	parsed object gets an extra ``'FILE'`` entry holding ``fn``.  When the
	content cannot be parsed (``ValueError``), a message naming the file is
	printed and an empty dict is returned.
	"""
	# print('Parsing',fn,'...')
	try:
		# the context manager closes the handle even when parsing fails
		with open(fn, 'r', encoding='utf-8') as f:
			j = json.load(f)
	except ValueError:
		print(C.red('JSON parse error'), 'on', fn)
		return {}
	j['FILE'] = fn
	return j
Ejemplo n.º 16
0
def checkreport(fn, o):
	"""Print a PASS/FAIL/FIXD line for ``fn`` and return the result code.

	When ``o`` is already an ``int`` it is taken as the result code;
	otherwise the code comes from ``checkon(fn, o)``.  The line is printed
	for every non-zero code, or for all codes when the global ``verbose``
	is set.
	"""
	outcome = o if isinstance(o, int) else checkon(fn, o)
	# non-verbose mode by default
	if not verbose and outcome == 0:
		return outcome
	labels = (C.blue('PASS'), C.red('FAIL'), C.yellow('FIXD'))
	print('[ {} ] {}'.format(labels[outcome], fn))
	return outcome
Ejemplo n.º 17
0
def checkreport(fn, o, br):
	"""Print a PASS/FAIL/FIXD line for ``fn`` and return the result code.

	A truthy ``br`` selects ``checkbrand(fn, br)`` as the check; otherwise
	``checkon(fn, o)`` is used.  The line is printed for every non-zero
	code, or for all codes when the global ``verbose`` is set.
	"""
	outcome = checkbrand(fn, br) if br else checkon(fn, o)
	# non-verbose mode by default
	if not verbose and outcome == 0:
		return outcome
	labels = (C.blue('PASS'), C.red('FAIL'), C.yellow('FIXD'))
	print('[ {} ] {}'.format(labels[outcome], fn))
	return outcome
Ejemplo n.º 18
0
def checkon(m, o):
	"""Reconcile the common model ``m`` with entry ``o`` and rewrite its file.

	``m`` is normalised in place first: paper-level types are lifted to
	their container type, ``crossref`` and ``booktitleshort`` are dropped
	and ``booktitle`` becomes ``title``.  Every key of ``m`` whose value
	differs from ``o.get(key)`` is settled (through ``heurichoose`` when
	``o.json`` already has the key) and stored into ``o.json``.

	Returns 1 when there is no model or when the file on disk differs from
	``o.getJSON()``, 0 when nothing had to change or ``o.filename`` does not
	exist, and 2 after the file has been rewritten.
	"""
	# if no common model found, we failed
	if not m:
		return 1
	if 'type' in m:
		if m['type'] in ('inproceedings', 'article'):
			m['type'] = 'proceedings'
		elif m['type'] == 'incollection':
			m['type'] = 'book'
	m.pop('crossref', None)
	if 'booktitle' in m:
		m['title'] = m.pop('booktitle')
	# TODO: ???
	m.pop('booktitleshort', None)
	# key -> value that has to be written into the entry
	n = {}
	for k in m:
		if o.get(k) == m[k]:
			if verbose:
				print(C.blue('Confirmed:  '), k, 'as', m[k])
			continue
		if verbose:
			print(C.red('Conflicted: '), k, 'as', m[k], 'vs', o.get(k))
		v = heurichoose(k, m[k], o.json[k]) if k in o.json else m[k]
		if verbose:
			print(C.yellow('Settled for:'), v)
		n[k] = v
	if not n:
		# everything confirmed, nothing to fix
		return 0
	if not os.path.exists(o.filename):
		return 0
	fn = o.filename + '.json' if os.path.isdir(o.filename) else o.filename
	if os.path.exists(fn):
		with open(fn, 'r', encoding='utf-8') as f:
			ondisk = f.read()
		if ondisk != o.getJSON():
			# strange, should be equal (run all normalisers first!)
			return 1
	o.json.update(n)
	# serialise before opening, so a failure cannot leave a truncated file
	updated = o.getJSON()
	with open(fn, 'w', encoding='utf-8') as f:
		f.write(updated)
	return 2
Ejemplo n.º 19
0
def checkon(m, o):
	"""Reconcile the common model ``m`` with entry ``o`` and rewrite its file.

	``m`` is normalised in place first: paper-level types are lifted to
	their container type, ``crossref`` and ``booktitleshort`` are dropped
	and ``booktitle`` becomes ``title``.  Every key of ``m`` whose value
	differs from ``o.get(key)`` is settled (through ``heurichoose`` when
	``o.json`` already has the key) and stored into ``o.json``.  Files are
	read and written as UTF-8 regardless of the platform default.

	Returns 1 when there is no model or when the file on disk differs from
	``o.getJSON()``, 0 when nothing had to change or ``o.filename`` does not
	exist, and 2 after the file has been rewritten.
	"""
	# if no common model found, we failed
	if not m:
		return 1
	if 'type' in m:
		if m['type'] in ('inproceedings', 'article'):
			m['type'] = 'proceedings'
		elif m['type'] == 'incollection':
			m['type'] = 'book'
	m.pop('crossref', None)
	if 'booktitle' in m:
		m['title'] = m.pop('booktitle')
	# TODO: ???
	m.pop('booktitleshort', None)
	# key -> value that has to be written into the entry
	n = {}
	for k in m:
		if o.get(k) == m[k]:
			if verbose:
				print(C.blue('Confirmed:  '), k, 'as', m[k])
			continue
		if verbose:
			print(C.red('Conflicted: '), k, 'as', m[k], 'vs', o.get(k))
		v = heurichoose(k, m[k], o.json[k]) if k in o.json else m[k]
		if verbose:
			print(C.yellow('Settled for:'), v)
		n[k] = v
	if not n:
		# everything confirmed, nothing to fix
		return 0
	if not os.path.exists(o.filename):
		return 0
	fn = o.filename + '.json' if os.path.isdir(o.filename) else o.filename
	if os.path.exists(fn):
		with open(fn, 'r', encoding='utf-8') as f:
			ondisk = f.read()
		if ondisk != o.getJSON():
			# strange, should be equal (run all normalisers first!)
			return 1
	o.json.update(n)
	# serialise before opening, so a failure cannot leave a truncated file
	updated = o.getJSON()
	with open(fn, 'w', encoding='utf-8') as f:
		f.write(updated)
	return 2
Ejemplo n.º 20
0
def guessYear(p):
	"""Guess the publication year belonging to the key ``p``.

	When exactly one dash-separated component of ``p`` is a four-digit
	number, that number is the year.  Otherwise the entry is looked up with
	``sleigh.seekByKey`` and its ``year`` JSON field, or failing that its
	``year`` attribute, is returned.  When neither is available, or the
	lookup finds nothing, a YEAR line is printed and 0 is returned.
	"""
	cys = [int(w) for w in p.split('-') if len(w) == 4 and w.isdigit()]
	if len(cys) == 1:
		return cys[0]
	j = sleigh.seekByKey(p)
	# the lookup may come back empty-handed; report that like any other
	# missing year instead of failing on the attribute access
	if j is not None:
		if 'year' in j.json.keys():
			return j.get('year')
		if 'year' in dir(j):
			return j.year
	print('[ {} ] {}'.format(C.red('YEAR'), p))
	return 0
Ejemplo n.º 21
0
def guessYear(P):
    """Guess the publication year belonging to the key ``P``.

    When exactly one dash-separated component of ``P`` is a four-digit
    number, that number is the year.  Otherwise the entry is looked up with
    ``sleigh.seekByKey`` and its ``year`` JSON field, or failing that its
    ``year`` attribute, is returned.  When neither is available, or the
    lookup finds nothing, a YEAR line is printed and 0 is returned.
    """
    cys = [int(w) for w in P.split('-') if len(w) == 4 and w.isdigit()]
    if len(cys) == 1:
        return cys[0]
    j = sleigh.seekByKey(P)
    # the lookup may come back empty-handed; report that like any other
    # missing year instead of failing on the attribute access
    if j is not None:
        if 'year' in j.json.keys():
            return j.get('year')
        if 'year' in dir(j):
            return j.year
    print('[ {} ] {}'.format(C.red('YEAR'), P))
    return 0
Ejemplo n.º 22
0
def sortbypages(z):
    """Return the sort key of entry ``z``: year, volume and first page.

    Entries without a ``pages`` field (reported on stdout) and entries
    whose year is a string sort as 0.  Otherwise the key is the year, plus
    the volume when it is an integer or a digit-only string, plus the first
    page divided by 10000 when that page is non-zero.
    """
    if 'pages' not in z.json.keys():
        print(C.red('No pages at all in ' + z.getKey()))
        return 0
    p1, _ = z.getPagesTuple()
    y = z.get('year')
    if isinstance(y, str):
        # non-correcting robustness
        return 0
    # a trick to have several volumes within one conference
    v = z.get('volume')
    # only integers and digit-only strings shift the year; any other value
    # (None, a list, ...) is ignored instead of raising AttributeError
    if isinstance(v, int) or (isinstance(v, str) and v.isdigit()):
        y += int(v)
    return y + p1 / 10000. if p1 else y
Ejemplo n.º 23
0
def sortbypages(z):
	"""Return the sort key of entry ``z``: year, volume and first page.

	Entries without a ``pages`` field (reported on stdout) and entries whose
	year is a string sort as 0.  Otherwise the key is the year, plus the
	volume when it is an integer or a digit-only string, plus the first page
	divided by 10000 when that page is non-zero.
	"""
	if 'pages' not in z.json.keys():
		print(C.red('No pages at all in '+z.getKey()))
		return 0
	p1, _ = z.getPagesTuple()
	y = z.get('year')
	if isinstance(y, str):
		# non-correcting robustness
		return 0
	# a trick to have several volumes within one conference
	v = z.get('volume')
	# only integers and digit-only strings shift the year; any other value
	# (None, a list, ...) is ignored instead of raising AttributeError
	if isinstance(v, int) or (isinstance(v, str) and v.isdigit()):
		y += int(v)
	return y + p1 / 10000. if p1 else y
Ejemplo n.º 24
0
def checkon(fn, o):
    """Recompute the stemmed title of entry ``o`` and save it when it changed.

    ``fn`` gets a ``.json`` suffix when it does not exist or is a directory.
    Entries tagged ``german``, ``french`` or ``portuguese`` are never
    stemmed; a ``stemmed`` field left over on such an entry is removed.
    Every stem produced is also added to the global ``ALLSTEMS``.

    Returns 1 when the entry has no title, 2 when the file was rewritten
    and 0 when nothing had to change.
    """
    if not os.path.exists(fn) or os.path.isdir(fn):
        fn = fn + '.json'
    if 'title' not in o.json.keys():
        if verbose:
            print('No title in', o.getKey())
        return 1  # no title
    # check for a different language - to avoid stemming altogether
    if o.tags and any(lang in o.tags
                      for lang in ('german', 'french', 'portuguese')):
        if 'stemmed' not in o.json.keys():
            return 0
        # if stemmed before marked foreign, remove this info
        del o.json['stemmed']
        with open(fn, 'w', encoding='utf-8') as F:
            F.write(o.getJSON())
        return 2
    ### champion variant: snowballstemmer - runs in ~13.5s for 96027 titles
    stemmer = snowballstemmer.stemmer('english').stemWords
    ### disregarded variant: snowballstemmer porter - considered outdated
    # stemmer = snowballstemmer.stemmer('porter').stemWords
    ### disregarded variant: stemming - too slow, runs in ~33s for 96027 titles
    # stemmer = lambda xs: [stemming.porter2.stem(x) for x in xs]
    ### disregarded variant: nltk - worse on verbs ending with -ze
    # stemmer3 = lambda xs: [SnowballStemmer("english").stem(x) for x in xs]
    ### end variants
    # tokenise and stem once; the diagnostics below reuse both results
    words = string2words(o.get('title'))
    stemmed = stemmer(words)
    if '' in stemmed:
        print('“{}” is a title of {} and it has an empty word'.format(
            o.get('title'), C.red(o.getKey())))
        print(words)
        print(stemmed)
    ALLSTEMS.update(stemmed)
    if o.get('stemmed') == stemmed:
        return 0
    o.json['stemmed'] = stemmed
    # the context manager closes the handle even when serialising fails
    with open(fn, 'w', encoding='utf-8') as F:
        F.write(o.getJSON())
    return 2
Ejemplo n.º 25
0
def dblpify(s):
	"""Turn the full name ``s`` into its DBLP form ``Surname:Given_Names``.

	Names listed in the global ``dis`` are answered from there.  A name
	without any space is reported, counted in ``cx[1]`` and returned as
	``Name:``.  Otherwise the last word is the surname (the last two words
	when the name ends in ``Jr`` or ``Jr.``); in the remaining part spaces
	become underscores and dots, apostrophes and dashes become ``=``.
	"""
	# http://dblp.uni-trier.de/pers/hd/e/Elbaum:Sebastian_G=
	if s in dis:
		return dis[s]
	if ' ' not in s:
		print('[', C.red('NAME'), ']', 'Unconventional full name:', s)
		cx[1] += 1
		return dblpLatin(s)+':'
	words = s.split(' ')
	# where the surname starts, counted from the end of the name
	cut = -2 if words[-1] in ('Jr', 'Jr.') else -1
	surname = dblpLatin(' '.join(words[cut:]))
	given = dblpLatin(' '.join(words[:cut])).replace(' ', '_')
	for ch in ".'-":
		given = given.replace(ch, '=')
	return surname+':'+given
Ejemplo n.º 26
0
def checkon(fn, o):
	"""Recompute the stemmed title of entry ``o`` and save it when it changed.

	``fn`` gets a ``.json`` suffix when it does not exist or is a directory.
	Entries tagged ``german``, ``french`` or ``portuguese`` are never
	stemmed; a ``stemmed`` field left over on such an entry is removed.
	Every stem produced is also added to the global ``ALLSTEMS``.  Files are
	written as UTF-8 regardless of the platform default.

	Returns 1 when the entry has no title, 2 when the file was rewritten and
	0 when nothing had to change.
	"""
	if not os.path.exists(fn) or os.path.isdir(fn):
		fn = fn + '.json'
	if 'title' not in o.json.keys():
		if verbose:
			print('No title in', o.getKey())
		return 1 # no title
	# check for a different language - to avoid stemming altogether
	if o.tags and any(lang in o.tags for lang in ('german', 'french', 'portuguese')):
		if 'stemmed' not in o.json.keys():
			return 0
		# if stemmed before marked foreign, remove this info
		del o.json['stemmed']
		with open(fn, 'w', encoding='utf-8') as F:
			F.write(o.getJSON())
		return 2
	### champion variant: snowballstemmer - runs in ~13.5s for 96027 titles
	stemmer = snowballstemmer.stemmer('english').stemWords
	### disregarded variant: snowballstemmer porter - considered outdated
	# stemmer = snowballstemmer.stemmer('porter').stemWords
	### disregarded variant: stemming - too slow, runs in ~33s for 96027 titles
	# stemmer = lambda xs: [stemming.porter2.stem(x) for x in xs]
	### disregarded variant: nltk - worse on verbs ending with -ze
	# stemmer3 = lambda xs: [SnowballStemmer("english").stem(x) for x in xs]
	### end variants
	# tokenise and stem once; the diagnostics below reuse both results
	words = string2words(o.get('title'))
	stemmed = stemmer(words)
	if '' in stemmed:
		print('“{}” is a title of {} and it has an empty word'.format(o.get('title'), C.red(o.getKey())))
		print(words)
		print(stemmed)
	ALLSTEMS.update(stemmed)
	if o.get('stemmed') == stemmed:
		return 0
	o.json['stemmed'] = stemmed
	# the context manager closes the handle even when serialising fails
	with open(fn, 'w', encoding='utf-8') as F:
		F.write(o.getJSON())
	return 2
Ejemplo n.º 27
0
 def seekByKey(self, key):
     """Return the object that one of ``self.venues`` finds for ``key``.

     Venues whose own key equals the part of ``key`` before the first
     dash are asked first; after that every venue is asked in turn.  When
     no venue yields a truthy result, a message is printed and the last
     (falsy) answer is returned, or ``None`` when there are no venues.
     """
     found = None
     # trying a shortcut
     prefix = key.split('-')[0]
     for venue in self.venues:
         if venue.getKey() != prefix:
             continue
         found = venue.seekByKey(key)
         if found:
             return found
     # bruteforce search
     for venue in self.venues:
         found = venue.seekByKey(key)
         if found:
             return found
     print(C.red(key), ' not found in BibSLEIGH!')
     return found
Ejemplo n.º 28
0
def main():
    """Generate the static HTML pages below ``outputdir`` from ``sleigh``.

    In order: prints a summary banner; writes ``index.html``; writes one
    page per venue, brand, conference and paper; builds the icon line-up
    from the PNG files in ``outputdir + '/stuff'`` and writes it into
    ``about.html``; builds the DBLP synchronisation table from
    ``ienputdir + '/meta/dblpguide.sync'`` and writes it into ``sync.html``;
    finally prints a closing summary.
    """
    print('{}: {} venues, {} papers\n{}'.format(C.purple('BibSLEIGH'),
                                                C.red(len(sleigh.venues)),
                                                C.red(sleigh.numOfPapers()),
                                                C.purple('=' * 42)))
    # generate the index
    f = open(outputdir + '/index.html', 'w', encoding='utf-8')
    f.write(sleigh.getPage())
    f.close()
    # generate all individual pages
    # if False:
    for v in sleigh.venues:
        # r accumulates the one-line progress report printed per venue
        r = C.blue(v.getKey())
        f = open(outputdir + '/' + v.getKey() + '.html', 'w', encoding='utf-8')
        f.write(v.getPage())
        f.close()
        if v.brands:
            r += '{' + '+'.join([C.blue(b.getKey()) for b in v.brands]) + '}'
            for b in v.brands:
                f = open(outputdir + '/' + b.getKey() + '.brand.html',
                         'w',
                         encoding='utf-8')
                f.write(b.getPage())
                f.close()
        r += ' => '
        for c in v.getConfs():
            f = open(outputdir + '/' + c.getKey() + '.html',
                     'w',
                     encoding='utf-8')
            f.write(c.getPage())
            f.close()
            for p in c.papers:
                f = open(outputdir + '/' + p.getKey() + '.html',
                         'w',
                         encoding='utf-8')
                f.write(p.getPage())
                f.close()
            # conference key with the venue key removed and dashes turned
            # into spaces
            purekey = c.getKey().replace(v.getKey(), '').replace('-',
                                                                 ' ').strip()
            r += '{} [{}], '.format(purekey, C.yellow(len(c.papers)))
        print(r)
    # generate the icon lineup
    icons = []
    linked = []
    pngs = [
        lastSlash(png).split('.')[0]
        for png in glob.glob(outputdir + '/stuff/*.png')
    ]
    # drop the images with the prefixes and names listed below; they are
    # not considered for the line-up
    pngs = [png for png in pngs \
            if not (png.startswith('a-') or png.startswith('p-') or png.startswith('ico-')
                    or png in ('cc-by', 'xhtml', 'css', 'open-knowledge', 'edit'))]
    # first pass: images whose name matches a generated brand page
    for brand in glob.glob(outputdir + '/*.brand.html'):
        pure = lastSlash(brand).split('.')[0]
        img = pure.lower().replace(' ', '')
        if img in pngs:
            pic = '<div class="wider"><a href="{0}.brand.html"><img class="abc" src="{1}" alt="{0}"/></a><span>{0}</span></div>'.format( \
                pure,
                'stuff/' + img + '.png')
            pngs.remove(img)
            icons.append(pic)
        else:
            # print('No image for', pure)
            pass
    # image names whose page name is not simply the upper-cased image name
    corner = {
        'ada': 'TRI-Ada',
        'comparch': 'CompArch',
        'floc': 'FLoC',
        'bibsleigh': 'index'
    }
    # second pass: remaining images are linked to the shortest-named page
    # starting with the venue candidate, if any
    for pure in pngs:
        venueCandidate = corner[pure] if pure in corner else pure.upper()
        canlink = sorted(glob.glob(outputdir + '/' + venueCandidate +
                                   '*.html'),
                         key=len)
        if canlink:
            # NOTE(review): the fourth format argument is never referenced
            # by the template ({0}, {1} and {2} only)
            pic = '<div class="wider"><a href="{0}"><img class="abc" src="stuff/{1}.png" alt="{2}"/></a><span>{2}</span></div>'.format( \
                canlink[0].split('/')[-1],
                pure,
                venueCandidate,
                canlink[0].split('/')[0])
        elif pure == 'twitter':
            pic = '<div class="wider"><a href="https://about.twitter.com/company/brand-assets"><img class="abc" src="stuff/twitter.png" alt="Twitter"/></a><span>Twitter</span></div>'
        elif pure == 'email':
            pic = '<div class="wider"><a href="mailto:[email protected]"><img class="abc" src="stuff/email.png" alt="e-mail"/></a><span>email</span></div>'
        else:
            print('Lonely', pure)
            pic = '<img class="abc" src="stuff/{0}.png" alt="{0}"/>'.format(
                pure)
        icons.append(pic)
    # find last year of each venue
    # for ven in glob.glob(corpusdir + '/*'):
    # 	venname = lastSlash(ven)
    # 	newstuff += '<strong><a href="http://dblp.uni-trier.de/db/conf/{}/">{} {}</a></strong>, '.format(venname.lower(), venname, nextYear(ven))
    # print(lastSlash(ven), ':', lastYear(ven))
    # write "more info" file
    f = open(outputdir + '/about.html', 'w', encoding='utf-8')
    f.write(
        aboutHTML.format(
            len(icons),
            '<div class="minibar">' + '\n'.join(sorted(icons)) + '</div>'))
    f.close()

    # generate the DBLP sync page
    # name -> year -> table cell; every cell starts out as '(no)'
    cell_by_conf_by_year = {}
    Ys = [
        2020, 2019, 2018, 2017, 2016, 2015, 2014, 2013, 2012, 2011, 2010, 2009
    ]
    # name -> third field of its metaline, used as link target in the table
    dblplinks = {}

    with open(ienputdir + '/meta/dblpguide.sync', 'r') as f:
        for line in f:
            # NOTE(review): lines read from the file keep their newline, so
            # a blank line is not caught here and is reported as skipped by
            # the field-count check below
            if not line or line.startswith('#'):
                continue
            # each metaline has three '|'-separated fields
            words = line.split('|')
            if len(words) != 3:
                print('- Metaline {} skipped!'.format(words))
                continue
            name = words[0].strip()
            dome = words[1].strip()
            dblp = words[2].strip()
            cell_by_conf_by_year[name] = {}
            dblplinks[name] = dblp
            for y in Ys:
                cell_by_conf_by_year[name][y] = '(no)'
            v = sleigh.getVenue(dome)
            if v:
                for yy in Ys:
                    y = v.getYear(yy)
                    if y:
                        ckey = '{}-{}'.format(name, yy)
                        c = y.getConf(ckey)
                        if c:
                            cell_by_conf_by_year[name][yy] = c.getIconItem2(
                                '', '')
                        else:
                            # print('- Conference {} of year {} in venue {} not found in the corpus'.format(ckey, yy, name))
                            # fall back to keys with an extra middle part;
                            # the first one that exists wins
                            for alt in 'v1', 'p1', 'c1', '1', 'J':
                                ckey = '{}-{}-{}'.format(name, alt, yy)
                                c = y.getConf(ckey)
                                if c:
                                    cell_by_conf_by_year[name][
                                        yy] = c.getIconItem2('', '')
                                    break
                # else:
                # 	print('- Year {} in venue {} not found in the corpus among {}'.format(yy, name, [z.year for z in v.years]))
        # else:
        # 	print('- Venue {} not found in the corpus'.format(name))

    # one header row with the years, then one row per name in sorted order
    table = '<table>'
    table += '<tr><td></td>'
    for y in Ys:
        table += '<th>{}</th>\n'.format(y)
    table += '</tr>'
    # print (cell_by_conf_by_year)
    for name in sorted(cell_by_conf_by_year.keys()):
        table += '<tr><th><a href="{}.brand.html">[@]</a> <a href="{}">{}</a></th>'.format(
            name, dblplinks[name], name)
        for y in Ys:
            table += '<td>{}</td>\n'.format(cell_by_conf_by_year[name][y])
        table += '</tr>'
    table += '</table>'

    with open(outputdir + '/sync.html', 'w', encoding='utf-8') as f:
        f.write(syncHTML.format(table))

    print('{}\nDone with {} venues, {} papers.'.format(
        C.purple('=' * 42), C.red(len(sleigh.venues)),
        C.red(sleigh.numOfPapers())))
Ejemplo n.º 29
0
    if isinstance(o, int):
        r = o
    else:
        r = checkon(fn, o)
    # non-verbose mode by default
    if verbose or r != 0:
        print('[ {} ] {}'.format(statuses[r], fn))
    return r


if __name__ == "__main__":
    # a trailing -v on the command line switches on verbose reporting
    verbose = sys.argv[-1] == '-v'
    peoplez = glob.glob(ienputdir + '/people/*.json')
    banner = '{}: {} venues, {} papers by {} people\n{}'
    print(banner.format(
        C.purple('BibSLEIGH'),
        C.red(len(sleigh.venues)),
        C.red(sleigh.numOfPapers()),
        C.red(len(peoplez)),
        C.purple('=' * 42)))
    # tally of checkreport results: 0 is ok, 1 is failed, 2 is fixed
    cx = {0: 0, 1: 0, 2: 0}
    for v in sleigh.venues:
        for c in v.getConfs():
            for p in c.papers:
                outcome = checkreport(p.filename, p)
                cx[outcome] += 1
    summary = '{} files checked, {} ok, {} fixed, {} failed'
    print(summary.format(
        C.bold(cx[0] + cx[1] + cx[2]),
        C.blue(cx[0]),
        C.yellow(cx[2]),
        C.red(cx[1])))
    # the collected files are printed as arguments of a ``subl`` command
    print(C.red('{} files to check manually!'.format(len(warnings))))
    print('subl ', ' '.join(warnings))
Ejemplo n.º 30
0
def checkon(fn, o):
    """Assign tags to entry ``o`` from the global ``tags`` rules and save them.

    ``fn`` gets a ``.json`` suffix when it is a directory.  The title of
    ``o`` is matched against every ``match*`` rule of every tag through
    ``matchModes``; ``matchentry`` rules match on the entry key instead.
    A matched tag removes the matched tags it ``relieves`` (counted in the
    global ``relieved``).  The resulting tag set replaces ``o.tags``.

    Returns 1 when the file on disk does not agree with ``o.getJSON()``
    before tagging, 2 when the file was rewritten and 0 when the tags did
    not change the serialisation.
    """
    if os.path.isdir(fn):
        fn = fn + '.json'
    # the context manager closes the handle even when reading fails
    with open(fn, 'r', encoding='utf-8') as f:
        lines = f.readlines()[1:-1]
    # NOTE(review): the file lines stay in file order while the serialised
    # lines are sorted, so a file with another line order counts as changed
    flines = [strictstrip(s) for s in lines]
    plines = sorted([strictstrip(s) for s in o.getJSON().split('\n')[1:-1]])
    if flines != plines:
        return 1
    ts = []
    # precise case-sensitive match
    mcs = o.get('title')
    # precise match for substrings
    mes = baretext(mcs)
    # precise match for words
    mew = mes.split(' ')
    # imprecise match for substrings
    mis = superbaretext(mes)
    # imprecise match for words
    miw = mis.split(' ')
    # now match!
    for t in tags:
        # print('Check',t,'vs',mes)
        if 'name' not in t.keys():
            print(C.red('ERROR:'), 'no name for tag from file', t['FILE'])
            continue
        if not any(k.startswith('match') for k in t.keys()):
            print(C.red('ERROR:'), 'no match rules for tag', t['name'])
            continue
        for k in t.keys():
            if k == 'matchentry':
                if o.getKey() in t[k]:
                    ts.append(t['name'])
            elif k.startswith('match'):
                # one occurrence of the tag name per matching pattern
                ts.extend([
                    t['name'] for s in listify(t[k])
                    if matchModes[k](s, mcs, mes, mew, mis, miw)
                ])
    # second pass: check reliefs
    for t in tags:
        if 'relieves' in t.keys():
            for r in listify(t['relieves']):
                if t['name'] in ts and r in ts:
                    ts.remove(r)
                    if t['name'] not in relieved.keys():
                        relieved[t['name']] = 0
                    relieved[t['name']] += 1
    if ts:
        if not o.tags:
            o.tags = []
        for t in ts:
            if t not in o.tags:
                o.tags.append(t)
    # NOTE(review): the assignment below discards the merge done just above;
    # comment it out to keep the existing tags instead of overwriting them
    o.tags = uniq(ts)
    # let’s keep tags clean and sorted
    o.tags = sorted(o.tags)
    updated = o.getJSON()
    nlines = sorted([strictstrip(s) for s in updated.split('\n')[1:-1]])
    if plines == nlines:
        return 0
    with open(fn, 'w', encoding='utf-8') as f:
        f.write(updated)
    return 2
Ejemplo n.º 31
0
			paperAuths = paperAuths[:-1]
			paperAuths.extend(auths)
		paperLnk = li.get('id')
		hope = li.find_all('a')
		if hope and hope[0].get('href').endswith('.pdf'):
			paperPdf = urlstart + hope[0].get('href')
		else:
			paperPdf = ''
		paperEntry = {'type': 'inproceedings', 'series': 'CEUR Workshop Proceedings',\
			'publisher': 'CEUR-WS.org', 'year': volYear, 'booktitle': volTitles[-1],\
			'editor': volEds, 'volume': volNr.split('-')[-1], 'title': paperTitle,\
			'author': paperAuths, 'pages': paperPages, 'venue': volVenue}
		if paperPdf:
			paperEntry['openpdf'] = paperPdf
		if paperLnk:
			paperEntry['url'] = urlstart + '#' + paperLnk
		paperFilename = outputdir.split('/')[-1] + '-' + paperAuths[0].split(' ')[-1]
		for a in paperAuths[1:]:
			paperFilename += a.split(' ')[-1][0]
		if paperFilename in done:
			paperFilename += 'a'
			while paperFilename in done:
				paperFilename = paperFilename[:-1] + chr(ord(paperFilename[-1])+1)
		# print(jsonify(paperEntry), '-->', outputdir+'/'+paperFilename+'.json')
		f = open(outputdir+'/'+paperFilename+'.json', 'w')
		f.write(jsonify(paperEntry))
		f.close()
		cx += 1
		done.append(paperFilename)
	print(C.red(volVenue), '-', C.yellow(volTitles[-1]), '-', C.blue(cx), 'papers.')
Ejemplo n.º 32
0
	else:
		name = sys.argv[1]
		path = ienputdir + '/corpus/' + name
		namem = name.split('/')[-1]
	cx = {0: 0, 1: 0, 2: 0}
	if not os.path.exists(path):
		report(name, name, 1)
		sys.exit(1)
	# for all papers...
	for fn in glob.glob(path + '/*.json'):
		pureold = fn.split(namem+'/')[1]
		if pureold.endswith('.json'):
			pureold = pureold[:-5]
		purenew = pureold
		if purenew[-2:] == namem[-2:]:
			purenew = purenew[:-2]
		if pureold == purenew:
			cx[report(pureold, purenew, 0)] += 1
		elif not os.path.exists(ienputdir + '/corpus/' + name + '/' + pureold + '.json')\
			 and os.path.exists(ienputdir + '/corpus/' + name + '/' + purenew + '.json'):
			cx[report(pureold, purenew, 1)] += 1
		else:
			cx[report(pureold, purenew, 2)] += 1
			os.rename(ienputdir + '/corpus/' + name + '/' + pureold + '.json', \
					  ienputdir + '/corpus/' + name + '/' + purenew + '.json')
	print('{} files checked, {} ok, {} fixed, {} failed'.format(\
		C.bold(cx[0] + cx[1] + cx[2]),
		C.blue(cx[0]),
		C.yellow(cx[2]),
		C.red(cx[1])))
Ejemplo n.º 33
0
def checkon(fn, o):
	"""Derive a [town, state, country] address for ``o`` from its DBLP title.

	The title registered in ``procs`` under the entry's ``dblpkey`` is cut
	into fragments; the fragment preceding a known country (or, failing
	that, a USA state name) is taken as the town, and a list of special
	cases is then applied in order. ``fn`` is not used: the file that gets
	rewritten is the one named by ``o.json['FILE']``.

	Returns 1 if the entry has no usable DBLP key or no location could be
	recognised, 2 if the address changed and the entry was written back,
	and 0 if the computed address equals the stored one.
	"""
	if 'dblpkey' not in o.json:
		print('[ {} ] {}'.format(C.red('DONT'), 'DBLP key not found on the entry'))
		return 1
	mykey = o.get('dblpkey')
	# for the rare case of multiple dblpkeys
	# (can happen as a DBLP error or when same proceedings span over multiple volumes)
	if isinstance(mykey, list):
		mykey = mykey[0]
	if mykey not in procs:
		print('[ {} ] {}'.format(C.red('DONT'), 'DBLP key not found in the dump'))
		return 1
	title = procs[mykey]
	if title.endswith('.'):
		title = title[:-1]
	# " - " and " (" are treated as fragment separators just like ", "
	ws = title.replace(' - ', ', ').replace(' (', ', ').split(', ')
	country = findOneIn(knownCountries, ws)
	state = findOneIn(usaStateNames, ws)
	found = False
	if country:
		# the town is taken to be the fragment right before the country
		# NOTE(review): if the country is the first fragment this wraps
		# around to the last one - confirm that cannot happen in the dump
		town = ws[ws.index(country)-1]
		state = '?'
		# what if "town" is an USA state? (full)
		if country == 'USA' and town in usaStateNames:
			state = town
			town = ws[ws.index(town)-1]
		# what if "town" is an USA state? (abbreviated)
		if country == 'USA' and town in usaStateAB:
			state = usaStateNames[usaStateAB.index(town)]
			town = ws[ws.index(town)-1]
		# what if "town" is a Canadian state? (full)
		if country == 'Canada' and town in canStateNames:
			state = town
			town = ws[ws.index(town)-1]
		# what if "town" is a Canadian state? (abbreviated)
		if country == 'Canada' and town in canStateAB:
			state = canStateNames[canStateAB.index(town)]
			town = ws[ws.index(town)-1]
		# the same can happen in the UK
		if country in ('UK', 'United Kingdom') and town in ('Scotland', 'Scottland'):
			state = town
			town = ws[ws.index(town)-1]
		# Georgia the country vs Georgia the state
		if country == 'Georgia' and town == 'Atlanta':
			state = country
			country = 'USA'
		# near Something
		if town.startswith('near '):
			town = ws[ws.index(town)-1]
		# Luxembourg, Luxembourg
		if country == 'Luxembourg':
			town = 'Luxembourg'
		# Saint-Malo / St. Malo
		if country == 'France' and town == 'St. Malo':
			town = 'Saint-Malo'
		# Florence / Firenze
		if country == 'Italy' and town.find('Firenze') > -1:
			town = 'Florence'
		found = True
	elif state:
		# no country in the title, but a USA state name was recognised
		country = 'USA'
		town = ws[ws.index(state)-1]
		found = True
	else:
		# desperate times: hard-coded answers keyed by a title fragment
		# (no break: when several keys match, the last one wins)
		for sol in desperateSolutions:
			if sol in ws:
				town, state, country = desperateSolutions[sol]
				found = True
	# normalise
	if country in countryMap:
		country = countryMap[country]
	if country == 'United Kingdom' and state == '?':
		if town.endswith('London') or town in ('Birmingham', 'York',\
		'Coventry', 'Nottingham', 'Lancaster', 'Oxford', 'Manchester',\
		'Southampton', 'Norwich', 'Leicester', 'Canterbury'):
			state = 'England'
		elif town in ('Edinburgh', 'Glasgow'):
			state = 'Scotland'
	# report
	if 'address' in o.json:
		print('[ {} ] {}'.format(C.blue('OLDA'), o.get('address')))
	if 'location' in o.json:
		print('[ {} ] {}'.format(C.blue('OLDL'), o.get('location')))
	if found:
		# print('[ {} ] {}'.format(C.blue('KNOW'), country))
		print('[ {} ] {}'.format(C.blue('AD||'), title))
		print('[ {} ] {:30} || {:30} || {:20}'.format(C.blue('AD->'), C.yellow(town), C.yellow(state), C.yellow(country)))
		# TODO: perhaps later we can act more aggressively
		newaddr = [town, '' if state=='?' else state, country]
		if 'address' not in o.json or newaddr != o.json['address']:
			o.json['address'] = newaddr
			# serialise before opening, so that a failure in getJSON()
			# cannot leave a truncated file behind
			text = o.getJSON()
			# explicit UTF-8: the platform default encoding may not be able
			# to represent non-ASCII characters in the entry
			with open(o.json['FILE'], 'w', encoding='utf-8') as f:
				f.write(text)
			return 2
		# nothing changed
		return 0
	print('[ {} ] {}'.format(C.yellow('AD??'), title))
	return 1
Ejemplo n.º 34
0
	statuses = (C.blue('PASS'), C.red('FAIL'), C.yellow('FIXD'))
	if isinstance(o, int):
		r = o
	else:
		r = checkon(fn, o)
	# non-verbose mode by default
	if verbose or r != 0:
		print('[ {} ] {}'.format(statuses[r], fn))
	return r

if __name__ == "__main__":
	# a trailing "-v" on the command line switches on verbose reporting
	verbose = '-v' == sys.argv[-1]
	peoplez = glob.glob(ienputdir + '/people/*.json')
	# banner: corpus statistics followed by a separator line
	banner = '{}: {} venues, {} papers by {} people\n{}'
	print(banner.format(
		C.purple('BibSLEIGH'),
		C.red(len(sleigh.venues)),
		C.red(sleigh.numOfPapers()),
		C.red(len(peoplez)),
		C.purple('=' * 42)))
	# outcome counters, keyed by the code checkreport() returns:
	# 0 is reported as ok, 1 as failed, 2 as fixed
	cx = dict.fromkeys((0, 1, 2), 0)
	# run the check on every paper of every conference of every venue
	for v in sleigh.venues:
		for c in v.getConfs():
			for p in c.papers:
				cx[checkreport(p.filename, p)] += 1
	summary = '{} files checked, {} ok, {} fixed, {} failed'
	print(summary.format(
		C.bold(sum(cx.values())),
		C.blue(cx[0]),
		C.yellow(cx[2]),
		C.red(cx[1])))
	# files collected in ``warnings`` are listed as one "subl" command line
	print(C.red('{} files to check manually!'.format(len(warnings))))
	print('subl ', ' '.join(warnings))
Ejemplo n.º 35
0
def sdistance(x1, x2):
    """Return ``distance(x1, x2)`` as text, with ',' in place of every '.'."""
    rendered = str(distance(x1, x2))
    return ','.join(rendered.split('.'))


def distance(x1, x2):
    """Return the Euclidean distance between the vectors ``x1`` and ``x2``.

    Only the first ``len(x1)`` components are compared; ``x2`` must be at
    least that long.
    """
    total = 0
    for pos in range(len(x1)):
        total += (x1[pos] - x2[pos]) ** 2
    return sqrt(total)


# NB: some clustering/visualisation code based on http://brandonrose.org/clustering
if __name__ == "__main__":
    verbose = sys.argv[-1] == '-v'
    peoplez = glob.glob(ienputdir + '/people/*.json')
    print('{}: {} venues, {} papers by {} people\n{}'.format(\
     C.purple('BibSLEIGH'),
     C.red(len(sleigh.venues)),
     C.red(sleigh.numOfPapers()),
     C.red(len(peoplez)),
     C.purple('='*42)))
    cx = {0: 0, 1: 0, 2: 0}
    # we need to know all the words we have
    UberDict = set()
    UberCols = set()
    vocs = {b.getKey():b.json['vocabulary'] \
     for v in sleigh.venues \
     for b in v.getBrands() \
     if 'vocabulary' in b.json \
     if len(b.json['vocabulary']) > 10}
    for vkey in vocs:
        UberDict.update(vocs[vkey].keys())
    # collocations are not quantified!
Ejemplo n.º 36
0
def report(s, r):
    """Print the outcome ``r`` for item ``s`` and return ``r`` unchanged.

    ``r`` selects one of the PASS / FAIL / FIXD labels (0, 1, 2). A PASS
    line is only printed when the module-level ``verbose`` flag is set.
    """
    labels = (C.blue('PASS'), C.red('FAIL'), C.yellow('FIXD'))
    # non-verbose mode by default: successful checks stay silent
    if not verbose and r == 0:
        return r
    print('[ {} ] {}'.format(labels[r], simpleLatin(s)))
    return r
Ejemplo n.º 37
0
    # non-verbose mode by default
    if verbose or r != 0:
        print('[ {} ] {}'.format(statuses[r], s))
    return r


if __name__ == "__main__":
    verbose = sys.argv[-1] == '-v'
    # Load all contributors
    people = {}
    for fn in glob.glob(ienputdir + '/people/*.json'):
        p = parseJSON(fn)
        people[p['name']] = p
    print('{}: {} people\n{}'.format(\
     C.purple('BibSLEIGH'),
     C.red(len(people)),
     C.purple('='*42)))
    # check for duplicates
    bysurname = {}
    for name in people.keys():
        byword = name.split(' ')
        j = -1
        # move the surname boundary leftwards over lowercase words and
        # name particles; 'le' and 'la' must be separated by a comma,
        # otherwise Python concatenates the adjacent literals into the
        # single string 'lela' and neither particle is recognised
        while -j < len(byword) and (byword[j - 1][0].islower()
                                    or byword[j - 1].lower()
                                    in ('de', 'di', 'du', 'van', 'von', 'le',
                                        'la')):
            j -= 1
        surname = ' '.join(byword[j:])
        firstnames = ' '.join(byword[:j])
        if verbose:
            print('Thinking “{}” is “{}” + “{}”'.format(
Ejemplo n.º 38
0
def checkon(fn, o):
    """Derive a [town, state, country] address for ``o`` from its DBLP title.

    The title registered in ``procs`` under the entry's ``dblpkey`` is cut
    into fragments; the fragment preceding a known country (or, failing
    that, a USA state name) is taken as the town, and a list of special
    cases is then applied in order. ``fn`` is not used: the file that gets
    rewritten is the one named by ``o.json['FILE']``.

    Returns 1 if the entry has no usable DBLP key or no location could be
    recognised, 2 if the address changed and the entry was written back,
    and 0 if the computed address equals the stored one.
    """
    if 'dblpkey' not in o.json:
        print('[ {} ] {}'.format(C.red('DONT'),
                                 'DBLP key not found on the entry'))
        return 1
    mykey = o.get('dblpkey')
    # for the rare case of multiple dblpkeys
    # (can happen as a DBLP error or when same proceedings span over multiple volumes)
    if isinstance(mykey, list):
        mykey = mykey[0]
    if mykey not in procs:
        print('[ {} ] {}'.format(C.red('DONT'),
                                 'DBLP key not found in the dump'))
        return 1
    title = procs[mykey]
    if title.endswith('.'):
        title = title[:-1]
    # " - " and " (" are treated as fragment separators just like ", "
    ws = title.replace(' - ', ', ').replace(' (', ', ').split(', ')
    country = findOneIn(knownCountries, ws)
    state = findOneIn(usaStateNames, ws)
    found = False
    if country:
        # the town is taken to be the fragment right before the country
        # NOTE(review): if the country is the first fragment this wraps
        # around to the last one - confirm that cannot happen in the dump
        town = ws[ws.index(country) - 1]
        state = '?'
        # what if "town" is an USA state? (full)
        if country == 'USA' and town in usaStateNames:
            state = town
            town = ws[ws.index(town) - 1]
        # what if "town" is an USA state? (abbreviated)
        if country == 'USA' and town in usaStateAB:
            state = usaStateNames[usaStateAB.index(town)]
            town = ws[ws.index(town) - 1]
        # what if "town" is a Canadian state? (full)
        if country == 'Canada' and town in canStateNames:
            state = town
            town = ws[ws.index(town) - 1]
        # what if "town" is a Canadian state? (abbreviated)
        if country == 'Canada' and town in canStateAB:
            state = canStateNames[canStateAB.index(town)]
            town = ws[ws.index(town) - 1]
        # the same can happen in the UK
        if country in ('UK', 'United Kingdom') and town in ('Scotland',
                                                            'Scottland'):
            state = town
            town = ws[ws.index(town) - 1]
        # Georgia the country vs Georgia the state
        if country == 'Georgia' and town == 'Atlanta':
            state = country
            country = 'USA'
        # near Something
        if town.startswith('near '):
            town = ws[ws.index(town) - 1]
        # Luxembourg, Luxembourg
        if country == 'Luxembourg':
            town = 'Luxembourg'
        # Saint-Malo / St. Malo
        if country == 'France' and town == 'St. Malo':
            town = 'Saint-Malo'
        # Florence / Firenze
        if country == 'Italy' and town.find('Firenze') > -1:
            town = 'Florence'
        found = True
    elif state:
        # no country in the title, but a USA state name was recognised
        country = 'USA'
        town = ws[ws.index(state) - 1]
        found = True
    else:
        # desperate times: hard-coded answers keyed by a title fragment
        # (no break: when several keys match, the last one wins)
        for sol in desperateSolutions:
            if sol in ws:
                town, state, country = desperateSolutions[sol]
                found = True
    # normalise
    if country in countryMap:
        country = countryMap[country]
    if country == 'United Kingdom' and state == '?':
        if town.endswith('London') or town in ('Birmingham', 'York',\
        'Coventry', 'Nottingham', 'Lancaster', 'Oxford', 'Manchester',\
        'Southampton', 'Norwich', 'Leicester', 'Canterbury'):
            state = 'England'
        elif town in ('Edinburgh', 'Glasgow'):
            state = 'Scotland'
    # report
    if 'address' in o.json:
        print('[ {} ] {}'.format(C.blue('OLDA'), o.get('address')))
    if 'location' in o.json:
        print('[ {} ] {}'.format(C.blue('OLDL'), o.get('location')))
    if found:
        # print('[ {} ] {}'.format(C.blue('KNOW'), country))
        print('[ {} ] {}'.format(C.blue('AD||'), title))
        print('[ {} ] {:30} || {:30} || {:20}'.format(C.blue('AD->'),
                                                      C.yellow(town),
                                                      C.yellow(state),
                                                      C.yellow(country)))
        # TODO: perhaps later we can act more aggressively
        newaddr = [town, '' if state == '?' else state, country]
        if 'address' not in o.json or newaddr != o.json['address']:
            o.json['address'] = newaddr
            # serialise before opening, so that a failure in getJSON()
            # cannot leave a truncated file behind; the with-block closes
            # the handle even when the write itself fails
            text = o.getJSON()
            with open(o.json['FILE'], 'w', encoding='utf-8') as f:
                f.write(text)
            return 2
        # nothing changed
        return 0
    print('[ {} ] {}'.format(C.yellow('AD??'), title))
    return 1
Ejemplo n.º 39
0
        else:
            y = v.replace('http://', '').replace('https://', '')
        r = '<a href="{0}">{1}</a>'.format(v, y)
    elif k == 'aka':
        ico = ''
        r = '<br/>'.join(['a.k.a.: “{}”'.format(z) for z in listify(v)])
    else:
        ico = ''
        r = '?{}?{}?'.format(k, v)
    return ico + ' ' + r + '<br/>'


if __name__ == "__main__":
    print('{}: {} venues, {} papers\n{}'.format(\
     C.purple('BibSLEIGH'),
     C.red(len(sleigh.venues)),
     C.red(sleigh.numOfPapers()),
     C.purple('='*42)))
    ts = sleigh.getTags()
    tagged = []
    for key in ts.keys():
        f = open('{}/tag/{}.html'.format(outputdir, key),
                 'w',
                 encoding='utf-8')
        # papers are displayed in reverse chronological order
        lst = [x.getRestrictedItem(key) for x in \
         sorted(ts[key], key=lambda z: -z.json['year'] if 'year' in z.json.keys() else 0)]
        # no comprehension possible for this case
        for x in ts[key]:
            if x not in tagged:
                tagged.append(x)
Ejemplo n.º 40
0
def report(s, r):
	"""Print the outcome ``r`` for item ``s`` and return ``r`` unchanged.

	``r`` selects one of the PASS / FAIL / FIXD labels (0, 1, 2). A PASS
	line is only printed when the module-level ``verbose`` flag is set.
	"""
	labels = (C.blue('PASS'), C.red('FAIL'), C.yellow('FIXD'))
	# non-verbose mode by default: successful checks stay silent
	if not verbose and r == 0:
		return r
	print('[ {} ] {}'.format(labels[r], s))
	return r
Ejemplo n.º 41
0
			v = 'http://'+v
		else:
			y = v.replace('http://', '').replace('https://', '')
		r = '<a href="{0}">{1}</a>'.format(v, y)
	elif k == 'aka':
		ico = ''
		r = '<br/>'.join(['a.k.a.: “{}”'.format(z) for z in listify(v)])
	else:
		ico = ''
		r = '?{}?{}?'.format(k, v)
	return ico + ' ' + r + '<br/>'

if __name__ == "__main__":
	print('{}: {} venues, {} papers\n{}'.format(\
		C.purple('BibSLEIGH'),
		C.red(len(sleigh.venues)),
		C.red(sleigh.numOfPapers()),
		C.purple('='*42)))
	ts = sleigh.getTags()
	tagged = []
	for key in ts.keys():
		f = open('{}/tag/{}.html'.format(outputdir, key), 'w')
		# papers are displayed in reverse chronological order
		lst = [x.getRestrictedItem(key) for x in \
			sorted(ts[key], key=lambda z: -z.json['year'] if 'year' in z.json.keys() else 0)]
		# no comprehension possible for this case
		for x in ts[key]:
			if x not in tagged:
				tagged.append(x)
		# read tag definition
		tagdef = parseJSON(ienputdir + '/tags/{}.json'.format(key))
Ejemplo n.º 42
0
def _clean_person_name(name):
    """Return one author/editor name with DBLP disambiguation removed."""
    # double spaces
    if '  ' in name:
        name = name.replace('  ', ' ').strip()
    words = name.split(' ')
    # a trailing all-digit word is the DBLP disambiguation suffix
    if words[-1].isdigit():
        words = words[:-1]
    name = ' '.join(words)
    # the case of "Jr" vs "Jr."
    return name + '.' if name.endswith(' Jr') else name


def checkon(fn, o):
    """Normalise the field values of entry ``o`` and rewrite its file.

    String values get typographic clean-up (dashes, quotes, apostrophes,
    double spaces), all-digit strings become integers, trivial lists are
    inlined and author/editor names lose their DBLP disambiguation number.
    Values that still contain a plain quote character are printed as LOOK
    and the entry's filename is appended to ``lookat``.

    ``fn`` is the path of the entry; '.json' is appended when ``fn`` does
    not exist or is a directory.

    Returns 2 if the serialised entry changed and was written to ``fn``,
    0 if nothing changed, and 1 (without touching the file) if the entry
    has no 'type' field.
    """
    if not os.path.exists(fn) or os.path.isdir(fn):
        fn = fn + '.json'
    plines = sorted([strictstrip(s) for s in o.getJSON().split('\n')[1:-1]])
    if 'type' not in o.json:
        # report and fail this entry instead of dying with a KeyError below
        print('TERRIBLE', o.getKey())
        return 1
    # values are replaced while looping, so iterate over a snapshot of keys
    for k in list(o.json.keys()):
        # the guard on str keeps list-valued titles out of the text fix
        if isinstance(o.json[k], str) and (
                (o.json['type'] == 'proceedings' and k == 'title') or
                (o.json['type'] == 'inproceedings' and k == 'booktitle')):
            # fix numbers
            for nr in nrs.keys():
                o.json[k] = o.json[k].replace(' ' + nr + ' ',
                                              ' ' + nrs[nr] + ' ')
        if isinstance(o.json[k], str):
            s = o.json[k]
            # add emdashes for fancier titles
            if k in ('title', 'booktitle'):
                s = s.replace(' - ', ' — ').replace(' -- ', ' — ')
                # Nice heuristic to run from time to time, but reports too much
                # on stuff like “eXtreme” and “jPET”
                # if o.json[k][0].islower():
                # 	print('[ {} ] {}: {} {}'.format(C.red('LOOK'), o.getKey(), 'title is', o.get('title')))
            # normalised pages
            if k == 'pages':
                s = s.replace('–', '-').replace('--', '-').replace('−', '-')
            # double spaces
            if '  ' in s:
                s = s.replace('  ', ' ').strip()
            # find numeric values, turn them into proper integers
            if s.isdigit():
                o.json[k] = int(s)
                continue
            # at most ONE of the following rewrites is applied per run
            # remove confix curlies
            elif s.startswith('{') and s.endswith('}'):
                s = s[1:-1]
            # single quotes to double quotes
            elif " '" in s and "' " in s:
                s = s.replace(" '", ' "').replace("' ", '" ')
            elif " '" in s and s.endswith("'"):
                s = s.replace(" '", ' "').replace("'", '"')
            elif "' " in s and s.startswith("'"):
                s = s.replace("' ", '" ').replace("'", '"')
            # fancify bland quotes
            elif ' "' in s and '" ' in s:
                s = s.replace(' "', ' “').replace('" ', '” ')
            elif ' "' in s and s.endswith('"'):
                s = s.replace(' "', ' “').replace('"', '”')
            elif '" ' in s and s.startswith('"'):
                s = s.replace('" ', '” ').replace('"', '“')
            # fancify LaTeX quotes
            elif ' ``' in s and "'' " in s:
                s = s.replace("'' ", '” ').replace(' ``', ' “')
            elif ' ``' in s and s.endswith("''"):
                s = s.replace("''", '”').replace(' ``', ' “')
            elif "'' " in s and s.startswith('``'):
                s = s.replace("'' ", '” ').replace('``', '“')
            elif s.startswith('``') and s.endswith("''"):
                s = '“' + s[2:-2] + '”'
            # plural possessive
            elif "'s" in s:
                s = s.replace("'s", '’s')
            elif "s' " in s:
                s = s.replace("s'", 's’')
            # contractions
            elif "n't" in s:
                s = s.replace("n't", 'n’t')
            # the case of "Jr" vs "Jr."
            if k in ('author', 'editor') and s.endswith('Jr'):
                s += '.'
            o.json[k] = s
            # TODO: report remaining suspicious activity
            for c in '`"\'':  # ’ is ok
                if c in s and k not in ('author', 'editor'):
                    print('[ {} ] {}: {} is “{}”'.format(
                        C.red('LOOK'), o.getKey(), k, o.json[k]))
                    lookat.append(o.filename)
        elif isinstance(o.json[k], list):
            value = o.json[k]
            # inline trivial lists
            if len(value) == 1:
                value = value[0]
            # inline hidden trivial lists; the isinstance guard stops this
            # from running on a string that was just inlined, which used
            # to shorten values such as "11" to "1"
            if isinstance(value, list) and len(value) == 2 \
            and value[0] == value[1] \
            and k not in ('stemmed', 'tag', 'tagged'):
                value = value[0]
            # unless it’s 'tagged': that one is kept as a list of lists
            if k == 'tagged' and not (isinstance(value, list)
                                      and isinstance(value[0], list)):
                value = [value]
            # remove DBLP disambiguation: we might later regret it
            # but the information can be always re-retrieved
            if k in ('author', 'editor'):
                if isinstance(value, list):
                    value = [_clean_person_name(a) for a in value]
                elif isinstance(value, str):
                    # a sole author inlined above: clean the name itself
                    # rather than looping over its characters
                    value = _clean_person_name(value)
            o.json[k] = value
    text = o.getJSON()
    nlines = sorted([strictstrip(s) for s in text.split('\n')[1:-1]])
    if plines != nlines:
        with open(fn, 'w', encoding='utf-8') as f:
            f.write(text)
        return 2
    else:
        return 0
Ejemplo n.º 43
0
	if verbose or r != 0:
		print('[ {} ] {}'.format(statuses[r], fn))
	return r

def two(n):
	"""Format ``n`` as text, prefixed with '0' when ``n`` is below 10."""
	padding = '0' if n < 10 else ''
	return padding + '{}'.format(n)

if __name__ == "__main__":
	verbose = sys.argv[-1] == '-v'
	peoplez = glob.glob(ienputdir + '/people/*.json')
	print('{}: {} venues, {} papers by {} people\n{}'.format(\
		C.purple('BibSLEIGH'),
		C.red(len(sleigh.venues)),
		C.red(sleigh.numOfPapers()),
		C.red(len(peoplez)),
		C.purple('='*42)))
	cx = {0: 0, 1: 0, 2: 0}
	# stem ALL the papers!
	for v in sleigh.venues:
		for c in v.getConfs():
			for p in c.papers:
				cx[checkreport(p.filename, p, None)] += 1
		for b in v.getBrands():
			cx[checkreport(b.filename, None, b)] += 1
	# write all stems
	listOfStems = sorted(filter(ifApproved, ALLSTEMS), key=lambda w: two(len(w)) + w)
	f = open(ienputdir + '/stems.json', 'w')
	f.write('[\n\t"' + '",\n\t"'.join(listOfStems) + '"\n]')
Ejemplo n.º 44
0
def _clean_person_name(name):
	"""Return one author/editor name with DBLP disambiguation removed."""
	# double spaces
	if '  ' in name:
		name = name.replace('  ', ' ').strip()
	words = name.split(' ')
	# a trailing all-digit word is the DBLP disambiguation suffix
	if words[-1].isdigit():
		words = words[:-1]
	name = ' '.join(words)
	# the case of "Jr" vs "Jr."
	return name + '.' if name.endswith(' Jr') else name

def checkon(fn, o):
	"""Normalise the field values of entry ``o`` and rewrite its file.

	String values get typographic clean-up (dashes, quotes, apostrophes,
	double spaces), all-digit strings become integers, trivial lists are
	inlined and author/editor names lose their DBLP disambiguation number.
	Values that still contain a plain quote character are printed as LOOK
	and the entry's filename is appended to ``lookat``.

	``fn`` is the path of the entry; '.json' is appended when ``fn`` does
	not exist or is a directory.

	Returns 2 if the serialised entry changed and was written to ``fn``,
	0 if nothing changed, and 1 (without touching the file) if the entry
	has no 'type' field.
	"""
	if not os.path.exists(fn) or os.path.isdir(fn):
		fn = fn + '.json'
	plines = sorted([strictstrip(s) for s in o.getJSON().split('\n')[1:-1]])
	if 'type' not in o.json:
		# report and fail this entry instead of dying with a KeyError below
		print('TERRIBLE',o.getKey())
		return 1
	# values are replaced while looping, so iterate over a snapshot of keys
	for k in list(o.json.keys()):
		# the guard on str keeps list-valued titles out of the text fix
		if isinstance(o.json[k], str) and (
				(o.json['type'] == 'proceedings' and k == 'title') or
				(o.json['type'] == 'inproceedings' and k == 'booktitle')):
			# fix numbers
			for nr in nrs.keys():
				o.json[k] = o.json[k].replace(' '+nr+' ', ' '+nrs[nr]+' ')
		if isinstance(o.json[k], str):
			s = o.json[k]
			# add emdashes for fancier titles
			if k in ('title', 'booktitle'):
				s = s.replace(' - ', ' — ').replace(' -- ', ' — ')
				# Nice heuristic to run from time to time, but reports too much
				# on stuff like “eXtreme” and “jPET”
				# if o.json[k][0].islower():
				# 	print('[ {} ] {}: {} {}'.format(C.red('LOOK'), o.getKey(), 'title is', o.get('title')))
			# normalised pages
			if k == 'pages':
				s = s.replace('–', '-').replace('--', '-').replace('−', '-')
			# double spaces
			if '  ' in s:
				s = s.replace('  ', ' ').strip()
			# find numeric values, turn them into proper integers
			if s.isdigit():
				o.json[k] = int(s)
				continue
			# at most ONE of the following rewrites is applied per run
			# remove confix curlies
			elif s.startswith('{') and s.endswith('}'):
				s = s[1:-1]
			# single quotes to double quotes
			elif " '" in s and "' " in s:
				s = s.replace(" '", ' "').replace("' ", '" ')
			elif " '" in s and s.endswith("'"):
				s = s.replace(" '", ' "').replace("'", '"')
			elif "' " in s and s.startswith("'"):
				s = s.replace("' ", '" ').replace("'", '"')
			# fancify bland quotes
			elif ' "' in s and '" ' in s:
				s = s.replace(' "', ' “').replace('" ', '” ')
			elif ' "' in s and s.endswith('"'):
				s = s.replace(' "', ' “').replace('"', '”')
			elif '" ' in s and s.startswith('"'):
				s = s.replace('" ', '” ').replace('"', '“')
			# fancify LaTeX quotes
			elif ' ``' in s and "'' " in s:
				s = s.replace("'' ", '” ').replace(' ``', ' “')
			elif ' ``' in s and s.endswith("''"):
				s = s.replace("''", '”').replace(' ``', ' “')
			elif "'' " in s and s.startswith('``'):
				s = s.replace("'' ", '” ').replace('``', '“')
			elif s.startswith('``') and s.endswith("''"):
				s = '“' + s[2:-2] + '”'
			# plural possessive
			elif "'s" in s:
				s = s.replace("'s", '’s')
			elif "s' " in s:
				s = s.replace("s'", 's’')
			# contractions
			elif "n't" in s:
				s = s.replace("n't", 'n’t')
			# the case of "Jr" vs "Jr."
			if k in ('author', 'editor') and s.endswith('Jr'):
				s += '.'
			o.json[k] = s
			# TODO: report remaining suspicious activity
			for c in '`"\'': # ’ is ok
				if c in s and k not in ('author', 'editor'):
					print('[ {} ] {}: {} is “{}”'.format(C.red('LOOK'), o.getKey(), k, o.json[k]))
					lookat.append(o.filename)
		elif isinstance(o.json[k], list):
			value = o.json[k]
			# inline trivial lists
			if len(value) == 1:
				value = value[0]
			# inline hidden trivial lists; the isinstance guard stops this
			# from running on a string that was just inlined, which used
			# to shorten values such as "11" to "1"
			if isinstance(value, list) and len(value) == 2 \
			and value[0] == value[1] \
			and k not in ('stemmed', 'tag', 'tagged'):
				value = value[0]
			# unless it’s 'tagged': that one is kept as a list of lists
			if k == 'tagged' and not (isinstance(value, list)
					and isinstance(value[0], list)):
				value = [value]
			# remove DBLP disambiguation: we might later regret it
			# but the information can be always re-retrieved
			if k in ('author', 'editor'):
				if isinstance(value, list):
					value = [_clean_person_name(a) for a in value]
				elif isinstance(value, str):
					# a sole author inlined above: clean the name itself
					# rather than looping over its characters
					value = _clean_person_name(value)
			o.json[k] = value
	text = o.getJSON()
	nlines = sorted([strictstrip(s) for s in text.split('\n')[1:-1]])
	if plines != nlines:
		# explicit UTF-8: the rewrites above insert non-ASCII punctuation,
		# which the platform default encoding may not be able to represent
		with open(fn, 'w', encoding='utf-8') as f:
			f.write(text)
		return 2
	else:
		return 0
Ejemplo n.º 45
0
from fancy.Templates import wordlistHTML, wordHTML
from lib.AST import Sleigh, escape
from lib.JSON import parseJSON
from lib.NLP import ifApproved
from collections import Counter

ienputdir = '../json'
outputdir = '../frontend'
n2f_name = '_name2file.json'
name2file = parseJSON(n2f_name) if os.path.exists(n2f_name) else {}
sleigh = Sleigh(ienputdir + '/corpus', name2file)

if __name__ == "__main__":
	print('{}: {} venues, {} papers\n{}'.format(\
		C.purple('BibSLEIGH'),
		C.red(len(sleigh.venues)),
		C.red(sleigh.numOfPapers()),
		C.purple('='*42)))
	stems = sleigh.getStems()
	tagged = []
	for k in stems.keys():
		f = open('{}/word/{}.html'.format(outputdir, k), 'w', encoding='utf-8')
		# papers are displayed in reverse chronological order
		lst = [x.getIItem() for x in \
			sorted(stems[k], key=lambda z: -z.json['year'] if 'year' in z.json.keys() else 0)]
		# collect other stems
		# NB: do not use the following code, slows everything down from 1 minute to 161 minutes
		# allstems = []
		# for x in stems[k]:
		# 	allstems += x.getBareStems()
		# siblings = {stem:allstems.count(stem) for stem in allstems if stem != k and ifApproved(stem)}
Ejemplo n.º 46
0
def checkreport(fn, o):
    """Run ``checkon(fn, o)``, print its outcome and return the result code.

    The code selects one of the PASS / FAIL / FIXD labels (0, 1, 2). A PASS
    line is only printed when the module-level ``verbose`` flag is set.
    """
    labels = (C.blue('PASS'), C.red('FAIL'), C.yellow('FIXD'))
    outcome = checkon(fn, o)
    # non-verbose mode by default: successful checks stay silent
    if not verbose and outcome == 0:
        return outcome
    print('[ {} ] {}'.format(labels[outcome], fn))
    return outcome


if __name__ == "__main__":
    if len(sys.argv) > 1:
        verbose = sys.argv[1] == '-v'
    print('{}: {} venues, {} papers\n{}'.format(\
     C.purple('BibSLEIGH'),
     C.red(len(sleigh.venues)),
     C.red(sleigh.numOfPapers()),
     C.purple('='*42)))
    # read the CSV
    f = open('scrap-committees/scraped-by-grammarware.csv',
             'r',
             encoding='utf-8')
    # CBSE;2001;Heinz;Schmidt;;Organising Committee
    for line in f.readlines():
        vs = line.strip().split(';')
        if len(vs) == 0:
            continue
        v = vs[0] + '-' + vs[1]
        n = vs[2] + ' ' + vs[3]
        # normalise!
        if n in renameto.keys():
Ejemplo n.º 47
0
        else:
            paperPdf = ''
        paperEntry = {'type': 'inproceedings', 'series': 'CEUR Workshop Proceedings',\
         'publisher': 'CEUR-WS.org', 'year': volYear, 'booktitle': volTitles[-1],\
         'editor': volEds, 'volume': volNr.split('-')[-1], 'title': paperTitle,\
         'author': paperAuths, 'pages': paperPages, 'venue': volVenue}
        if paperPdf:
            paperEntry['openpdf'] = paperPdf
        if paperLnk:
            paperEntry['url'] = urlstart + '#' + paperLnk
        paperFilename = lastSlash(outputdir) + '-' + paperAuths[0].split(
            ' ')[-1]
        for a in paperAuths[1:]:
            print(a)
            paperFilename += a.split(' ')[-1][0]
        if paperFilename in done:
            paperFilename += 'a'
            while paperFilename in done:
                paperFilename = paperFilename[:-1] + chr(
                    ord(paperFilename[-1]) + 1)
        # print(jsonify(paperEntry), '-->', outputdir+'/'+paperFilename+'.json')
        f = open(outputdir + '/' + paperFilename + '.json',
                 'w',
                 encoding='utf-8')
        f.write(jsonify(paperEntry))
        f.close()
        cx += 1
        done.append(paperFilename)
    print(C.red(volVenue), '-', C.yellow(volTitles[-1]), '-', C.blue(cx),
          'papers.')
Ejemplo n.º 48
0
def report(one, two):
	"""Print ``one`` inside square brackets, followed by ``two``."""
	line = '[ {} ] {}'.format(one, two)
	print(line)

def checkreport(fn, o):
	"""Run ``checkon(fn, o)``, report its outcome and return the result code.

	The code selects one of the PASS / FAIL / FIXD labels (0, 1, 2). A PASS
	line is only reported when the module-level ``verbose`` flag is set.
	"""
	labels = (C.blue('PASS'), C.red('FAIL'), C.yellow('FIXD'))
	outcome = checkon(fn, o)
	# non-verbose mode by default: successful checks stay silent
	if not verbose and outcome == 0:
		return outcome
	report(labels[outcome], fn)
	return outcome

if __name__ == "__main__":
	# the first command-line argument, when present, decides verbosity;
	# without arguments ``verbose`` is not assigned here
	if len(sys.argv) > 1:
		verbose = '-v' == sys.argv[1]
	banner = '{}: {} venues, {} papers\n{}'
	print(banner.format(
		C.purple('BibSLEIGH'),
		C.red(len(sleigh.venues)),
		C.red(sleigh.numOfPapers()),
		C.purple('=' * 42)))
	# outcome counters, keyed by the code checkreport() returns:
	# 0 is reported as ok, 1 as failed, 2 as fixed
	cx = dict.fromkeys((0, 1, 2), 0)
	for v in sleigh.venues:
		for c in v.getConfs():
			# the conference itself is checked before its papers
			cx[checkreport(c.filename, c)] += 1
			for p in c.papers:
				cx[checkreport(p.filename, p)] += 1
	summary = '{} files checked, {} ok, {} fixed, {} failed'
	print(summary.format(
		C.bold(sum(cx.values())),
		C.blue(cx[0]),
		C.yellow(cx[2]),
		C.red(cx[1])))
Ejemplo n.º 49
0
def report(fn1, fn2, r):
	"""Print the outcome ``r`` of mapping ``fn1`` to ``fn2``; return ``r``.

	``r`` selects one of the PASS / FAIL / RENAME labels (0, 1, 2). A PASS
	line is only printed when the module-level ``verbose`` flag is set.
	"""
	labels = (C.blue(' PASS '), C.red(' FAIL '), C.yellow('RENAME'))
	# non-verbose mode by default: successful checks stay silent
	if not verbose and r == 0:
		return r
	print('[ {} ] {} → {}'.format(labels[r], fn1, fn2))
	return r
Ejemplo n.º 50
0
def checkon(fn, o):
    """Normalise the link fields of one entry and rewrite its JSON file if needed.

    Args:
        fn: path of the entry's file; '.json' is appended when the path does
            not exist or is a directory.
        o: entry object. ``o.json`` is read and modified in place (even when
            1 is returned); ``o.getJSON()`` and ``o.get('year')`` are called.

    Returns:
        1 if the file content differs from the entry as serialised before any
        normalisation (the file is left untouched), 2 if normalisation changed
        the serialisation and the file was rewritten, 0 otherwise.

    Normalisation steps, in order: drop DBLP-relative "url" values and force
    http:// on the remaining ones; when there is an "ee" but no "doi", turn
    recognised "ee" links into "doi" / "acmid" / "ieeearid" / "ieeepuid" /
    "ieeeisid" fields or rewrite them to another address; rename "eventuri"
    to "eventurl"; force http:// on "eventurl".
    """
    if not os.path.exists(fn) or os.path.isdir(fn):
        fn = fn + '.json'
    # the context manager closes the file even if reading raises
    with open(fn, 'r', encoding='utf-8') as f:
        # the first and the last line of the file are left out of the comparison
        lines = f.readlines()[1:-1]
    flines = json2lines(lines)
    plines = sorted(json2lines(o.getJSON().split('\n')))
    # "url" from DBLP are useless
    if 'url' in o.json.keys():
        dblp_prefixes = ('db/conf/', 'db/series/', 'db/books/', 'db/journals/')
        o.json['url'] = [link.replace('https://', 'http://')
                         for link in listify(o.json['url'])
                         if not link.startswith(dblp_prefixes)]
        # collapse: no links left => no field, one link => plain value
        if not o.json['url']:
            del o.json['url']
        elif len(o.json['url']) == 1:
            o.json['url'] = o.json['url'][0]
    if 'ee' in o.json.keys() and 'doi' not in o.json.keys():
        if isinstance(o.json['ee'], list):
            if verbose:
                print(C.red('Manylink:'), o.json['ee'])
        # links that stay in "ee" after the recognised ones are moved elsewhere
        newee = []
        for onelink in listify(o.json['ee']):
            # the slice offsets below equal the length of the matched prefix
            if onelink.startswith('http://dx.doi.org/'):
                o.json['doi'] = onelink[18:]
            elif onelink.startswith('http://doi.acm.org/'):
                o.json['doi'] = onelink[19:]
            elif onelink.startswith('http://doi.ieeecomputersociety.org/'):
                o.json['doi'] = onelink[35:]
            elif onelink.startswith('http://dl.acm.org/citation.cfm?id='):
                o.json['acmid'] = onelink[34:]
            elif onelink.startswith('http://portal.acm.org/citation.cfm?id='):
                o.json['acmid'] = onelink[38:]
            elif onelink.startswith((
                    'http://ieeexplore.ieee.org/xpl/freeabs_all.jsp?arnumber=',
                    'http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=')):
                o.json['ieeearid'] = onelink.split('=')[-1]
            elif onelink.startswith('http://ieeexplore.ieee.org/xpls/abs_all.jsp?isnumber=') \
                    and 'arnumber' in onelink:
                o.json['ieeearid'] = onelink.split('arnumber=')[-1].split(
                    '&')[0]
            elif onelink.startswith(
                    'http://ieeexplore.ieee.org/xpl/mostRecentIssue.jsp?punumber='
            ):
                o.json['ieeepuid'] = onelink.split('=')[-1]
            elif onelink.startswith(
                    'http://ieeexplore.ieee.org/xpl/tocresult.jsp?isnumber='):
                o.json['ieeeisid'] = onelink.split('=')[-1]
            elif onelink.startswith(
                    'http://eceasst.cs.tu-berlin.de/index.php/eceasst/article/view/'
            ):
                # same article id, different host and path
                newee.append(
                    'http://journal.ub.tu-berlin.de/eceasst/article/view/' +
                    onelink.split('/')[-1])
            elif onelink.endswith('.pdf') and \
                    onelink.startswith(('http://computer.org/proceedings/',
                                        'http://csdl.computer.org/')):
                # Bad: http://computer.org/proceedings/icsm/1189/11890007.pdf
                # Bad: http://csdl.computer.org/comp/proceedings/date/2003/1870/02/187020040.pdf
                # Good: http://www.computer.org/csdl/proceedings/icsm/2001/1189/00/11890004.pdf
                # NOTE(review): the unpacking below raises ValueError when the
                # link has fewer path segments than the samples above
                if onelink.startswith('http://csdl'):
                    cname, _, cid, mid, pid = onelink.split('/')[5:10]
                else:
                    cname, cid, pid = onelink.split('/')[4:7]
                    # heuristic: the two characters after the conference id
                    # in the file name are taken as the middle path segment
                    if pid.startswith(cid):
                        mid = pid[len(cid):len(cid) + 2]
                    else:
                        mid = '00'
                newee.append('http://www.computer.org/csdl/proceedings/{}/{}/{}/{}/{}'.format(
                    cname,
                    o.get('year'),
                    cid,
                    mid,
                    pid))
            else:
                if 'ieee' in onelink:
                    print(C.purple('IEEE'), onelink)
                if verbose:
                    print(C.yellow('Missed opportunity:'), onelink)
                # nothing matches => preserve
                newee.append(onelink)
        # collapse: no links left => no field, one link => plain value
        if len(newee) == 0:
            del o.json['ee']
        elif len(newee) == 1:
            o.json['ee'] = newee[0]
        else:
            o.json['ee'] = newee
        # post-processing normalisation: an all-digit ACM id is stored as int
        if 'acmid' in o.json.keys() and not isinstance(
                o.json['acmid'], int) and o.json['acmid'].isdigit():
            o.json['acmid'] = int(o.json['acmid'])
    if 'eventuri' in o.json.keys():
        o.json['eventurl'] = o.json['eventuri']
        del o.json['eventuri']
    if 'eventurl' in o.json.keys() and o.json['eventurl'].startswith(
            'https://'):
        o.json['eventurl'] = o.json['eventurl'].replace('https://', 'http://')
    nlines = sorted(json2lines(o.getJSON().split('\n')))
    if flines != plines:
        # the file and the loaded entry disagree: report, do not overwrite
        return 1
    elif plines != nlines:
        # normalisation changed something: persist the new serialisation
        with open(fn, 'w', encoding='utf-8') as f:
            f.write(o.getJSON())
        return 2
    else:
        return 0
Ejemplo n.º 51
0
		return 0

def checkreport(fn, o):
	"""Check one entry with ``checkon`` and print its status when warranted.

	Returns the code produced by ``checkon``; it indexes the PASS / FAIL /
	FIXD labels. A line is printed when the global ``verbose`` is set or the
	code is non-zero.
	"""
	labels = (C.blue('PASS'), C.red('FAIL'), C.yellow('FIXD'))
	code = checkon(fn, o)
	# non-verbose mode by default
	wanted = verbose or code != 0
	if wanted:
		message = '[ {} ] {}'.format(labels[code], fn)
		print(message)
	return code

if __name__ == "__main__":
	# an optional first argument '-v' switches verbose reporting on
	if len(sys.argv) > 1:
		verbose = sys.argv[1] == '-v'
	print('{}: {} venues, {} papers\n{}'.format(\
		C.purple('BibSLEIGH'),
		C.red(len(sleigh.venues)),
		C.red(sleigh.numOfPapers()),
		C.purple('='*42)))
	# read the CSV; the context manager closes it once all rows are consumed
	with open('scrap-committees/scraped-by-grammarware.csv', 'r') as f:
		# sample row: CBSE;2001;Heinz;Schmidt;;Organising Committee
		for line in f:
			vs = line.strip().split(';')
			# ''.split(';') yields [''] and never an empty list, so a blank
			# line has to be recognised by that single empty field
			if vs == ['']:
				continue
			# venue-year key and full name built from the first four fields
			v = vs[0] + '-' + vs[1]
			n = vs[2] + ' ' + vs[3]
			# normalise!
			if n in renameto.keys():
				print('[', C.yellow('ALIA'), ']', 'Treating', n, 'as', renameto[n])
				n = renameto[n]
Ejemplo n.º 52
0
def linkto(n):
	"""Return an HTML anchor for ``n`` if ``name2file`` knows it, otherwise ``n`` itself.

	The anchor points at ``name2file[n]`` and is labelled with ``shorten(n)``.
	"""
	if n not in name2file:
		return n
	target = name2file[n]
	label = shorten(n)
	return '<a href="{}">{}</a>'.format(target, label)

def pad(n):
	"""Return ``str(n)`` left-padded with '0' to at least four characters."""
	return str(n).rjust(4, '0')

if __name__ == "__main__":
	# banner: corpus size, then a ruler of 42 '=' characters
	banner = '{}: {} venues, {} papers\n{}'
	print(banner.format(
		C.purple('BibSLEIGH'),
		C.red(len(sleigh.venues)),
		C.red(sleigh.numOfPapers()),
		C.purple('=' * 42)))
	ps = []
	# flatten the sleigh: venues, conferences and papers share one key -> object map
	bykey = {}
	for v in sleigh.venues:
		key = v.getKey()
		bykey[key] = v
		for c in v.getConfs():
			key = c.getKey()
			bykey[key] = c
			for p in c.papers:
				key = p.getKey()
				bykey[key] = p
	flattened = 'BibSLEIGH flattened to {} entries'.format(len(bykey))
	print(C.purple(flattened))
	# tagged = []
	# for k in ts.keys():
	peoples = {}
Ejemplo n.º 53
0
def checkon(fn, o):
	"""Recompute the tags of one entry from its title and rewrite its file if they changed.

	Args:
		fn: path of the entry's JSON file; '.json' is appended when ``fn`` is a directory.
		o: entry object; ``o.get('title')``, ``o.getKey()`` and ``o.getJSON()`` are
			called and ``o.tags`` is replaced.

	Returns:
		1 if the lines of the file (first and last excluded) differ from the sorted
		lines of ``o.getJSON()`` -- nothing is changed in that case; 2 if the new
		tags changed the serialisation and the file was rewritten; 0 otherwise.

	Relies on the globals ``tags`` (tag definitions), ``matchModes`` (one matcher
	per "match..." rule name) and ``relieved`` (per-tag counter updated here).
	"""
	if os.path.isdir(fn):
		fn = fn + '.json'
	# explicit UTF-8 so the result does not depend on the platform default;
	# the context manager closes the file even if reading raises
	with open(fn, 'r', encoding='utf-8') as f:
		lines = f.readlines()[1:-1]
	flines = [strictstrip(s) for s in lines]
	plines = sorted([strictstrip(s) for s in o.getJSON().split('\n')[1:-1]])
	if flines != plines:
		return 1
	ts = []
	# precise case-sensitive match
	mcs = o.get('title')
	# precise match for substrings
	mes = baretext(mcs)
	# precise match for words
	mew = mes.split(' ')
	# imprecise match for substrings
	mis = superbaretext(mes)
	# imprecise match for words
	miw = mis.split(' ')
	# now match!
	for t in tags:
		# print('Check',t,'vs',mes)
		if 'name' not in t.keys():
			print(C.red('ERROR:'), 'no name for tag from file', t['FILE'])
			continue
		if all([not k.startswith('match') for k in t.keys()]):
			print(C.red('ERROR:'), 'no match rules for tag', t['name'])
			continue
		for k in t.keys():
			if k == 'matchentry':
				# explicit list of entry keys that carry this tag
				if o.getKey() in t[k]:
					ts += [t['name']]
			elif k.startswith('match'):
				# one occurrence of the tag name is added per matching pattern
				ts += [t['name'] for s in listify(t[k]) if matchModes[k](s, mcs, mes, mew, mis, miw)]
				# ts += [t['name'] for s in listify(t[k]) if fmm(t, k, s, mcs, mes, mew, mis, miw)]
	# second pass: check reliefs
	# NOTE(review): ts.remove drops a single occurrence, so a relieved tag that
	# matched several times survives the uniq() below -- confirm this is intended
	for t in tags:
		if 'relieves' in t.keys():
			for r in listify(t['relieves']):
				if t['name'] in ts and r in ts:
					ts.remove(r)
					if t['name'] not in relieved.keys():
						relieved[t['name']] = 0
					relieved[t['name']] += 1
	# merge the matched tags into whatever the entry already had
	if ts:
		if not o.tags:
			o.tags = []
		for t in ts:
			if t not in o.tags:
				o.tags.append(t)
	# overwrite mode is active: the merged list built above is replaced by the
	# freshly matched tags; comment out the next line to keep stored tags instead
	o.tags = uniq(ts)
	# let’s keep tags clean and sorted
	o.tags = sorted(o.tags)
	nlines = sorted([strictstrip(s) for s in o.getJSON().split('\n')[1:-1]])
	if plines != nlines:
		with open(fn, 'w', encoding='utf-8') as f:
			f.write(o.getJSON())
		return 2
	else:
		return 0
Ejemplo n.º 54
0
        return '<a href="{}">{}</a>'.format(name2file[n], shorten(n))
    else:
        return n


def pad(n):
    """Return the text of ``n``, right-aligned in four columns filled with '0'."""
    text = str(n)
    return '{:0>4}'.format(text)


if __name__ == "__main__":
    # banner: corpus size, then a ruler of 42 '=' characters
    print('{}: {} venues, {} papers\n{}'.format(
        C.purple('BibSLEIGH'),
        C.red(len(sleigh.venues)),
        C.red(sleigh.numOfPapers()),
        C.purple('=' * 42),
    ))
    ps = []
    # flatten the sleigh into a single key -> object map
    bykey = {}
    for v in sleigh.venues:
        bykey.update({v.getKey(): v})
        for c in v.getConfs():
            bykey.update({c.getKey(): c})
            for p in c.papers:
                bykey.update({p.getKey(): p})
    total = len(bykey)
    print(C.purple('BibSLEIGH flattened to {} entries'.format(total)))
    # tagged = []
    # for k in ts.keys():
    peoples = {}
Ejemplo n.º 55
0

def checkreport(fn, o):
    """Run ``checkon`` on one entry, print a status line if needed, return the code.

    The code indexes the PASS / FAIL / FIXD labels. Output is suppressed for
    code 0 unless the global ``verbose`` is set.
    """
    labels = (C.blue('PASS'), C.red('FAIL'), C.yellow('FIXD'))
    result = checkon(fn, o)
    # non-verbose mode by default
    if not verbose and result == 0:
        return result
    print('[ {} ] {}'.format(labels[result], fn))
    return result


if __name__ == "__main__":
    # verbose reporting is requested by '-v' as the last argument
    verbose = (sys.argv[-1] == '-v')
    banner = '{}: {} venues, {} papers\n{}'
    print(banner.format(
        C.purple('BibSLEIGH'),
        C.red(len(sleigh.venues)),
        C.red(sleigh.numOfPapers()),
        C.purple('=' * 42)))
    # tally of checkreport codes; the summary below labels 0 "ok", 2 "fixed", 1 "failed"
    cx = dict.fromkeys((0, 1, 2), 0)
    for v in sleigh.venues:
        # tags per venue: its conferences, then its brands, then the venue itself
        for c in v.getConfs():
            status = checkreport(c.filename, c)
            cx[status] += 1
        for b in v.brands:
            status = checkreport(b.filename, b)
            cx[status] += 1
        status = checkreport(v.filename, v)
        cx[status] += 1
    summary = '{} files checked, {} ok, {} fixed, {} failed'
    print(summary.format(
        C.bold(sum(cx.values())),
        C.blue(cx[0]),
        C.yellow(cx[2]),
        C.red(cx[1])))
Ejemplo n.º 56
0
				people.update(listify(c.json['editor']))
			for p in c.papers:
				if 'author' in p.json:
					people.update(listify(p.json['author']))
	for a in people:
		for na in (nodiaLatin(a), simpleLatin(a)):
			if na != a:
				aka.setdefault(a, [])
				aka[a].append(na)
	# invert aliasing
	for akey in aka:
		if akey in ('ZZZZZZZZZZ', 'FILE'):
			continue
		for aval in aka[akey]:
			renameto[aval] = akey
	f = open('_renameto.json', 'w', encoding='utf-8')
	f.write(json.dumps(renameto, sort_keys=True, separators=(',\n\t', ': '), ensure_ascii=False))
	f.close()
	cx = {0: 0, 1: 0, 2: 0}
	for v in sleigh.venues:
		for c in v.getConfs():
			cx[checkreport(c.filename, c)] += 1
			for p in c.papers:
				cx[checkreport(p.filename, p)] += 1
	print('{} aliasing rules, {} of them manual.'.format(len(renameto), CX))
	print('{} files checked, {} ok, {} fixed, {} failed'.format(\
		C.bold(cx[0] + cx[1] + cx[2]),
		C.blue(cx[0]),
		C.yellow(cx[2]),
		C.red(cx[1])))
Ejemplo n.º 57
0
    return r


def two(n):
    """Format ``n`` as text, with a single leading '0' when ``n`` is below 10."""
    template = '0{}' if n < 10 else '{}'
    return template.format(n)


if __name__ == "__main__":
    # verbose reporting is requested by '-v' as the last argument
    verbose = (sys.argv[-1] == '-v')
    peoplez = glob.glob(ienputdir + '/people/*.json')
    banner = '{}: {} venues, {} papers by {} people\n{}'
    print(banner.format(
        C.purple('BibSLEIGH'),
        C.red(len(sleigh.venues)),
        C.red(sleigh.numOfPapers()),
        C.red(len(peoplez)),
        C.purple('=' * 42)))
    # tally of checkreport codes, keyed 0, 1 and 2
    cx = dict.fromkeys((0, 1, 2), 0)
    # stem ALL the papers!
    for v in sleigh.venues:
        for c in v.getConfs():
            for p in c.papers:
                status = checkreport(p.filename, p, None)
                cx[status] += 1
        # brands go through the same check, passed in the third position
        for b in v.getBrands():
            status = checkreport(b.filename, None, b)
            cx[status] += 1
    # write all stems: approved ones only, ordered by the two-digit length
    # prefix produced by two() and then by the stem itself
    approved = [w for w in ALLSTEMS if ifApproved(w)]
    listOfStems = sorted(approved, key=lambda w: two(len(w)) + w)
    f = open(ienputdir + '/stems.json', 'w', encoding='utf-8')
Ejemplo n.º 58
0
	f = open('../conferenceMetrics/data/SE-conf-roles.csv', 'r')
	for line in f.readlines():
		# Conference;Year;First Name;Last Name;Sex;Role
		csv.append(line.strip().split(';'))
	f.close()
	f = open('scrap-committees/scraped-by-grammarware.csv', 'r')
	for line in f.readlines():
		csv.append(line.strip().split(';'))
	f.close()
	# All known contributors
	people = {}
	for fn in glob.glob(ienputdir + '/people/*.json'):
		p = parseJSON(fn)
		# people.append(p)
		if 'name' not in p.keys():
			print('[', C.red('NOGO'), ']', 'No name in', fn)
			continue
		people[p['name']] = p
	print('{}: {} venues, {} papers\n{}'.format(\
		C.purple('BibSLEIGH'),
		C.red(len(sleigh.venues)),
		C.red(sleigh.numOfPapers()),
		C.purple('='*42)))
	# All people who ever contributed
	names = []
	for v in sleigh.venues:
		for c in v.getConfs():
			for p in c.papers:
				for k in ('author', 'editor'):
					if k in p.json.keys():
						names += [a for a in listify(p.json[k]) if a not in names]
Ejemplo n.º 59
0
    statuses = (C.blue('PASS'), C.red('FAIL'), C.yellow('FIXD'))
    # non-verbose mode by default
    if verbose or r != 0:
        print('[ {} ] {}'.format(statuses[r], simpleLatin(s)))
    return r


if __name__ == "__main__":
    verbose = sys.argv[-1] == '-v'
    # All known contributors
    cx = {0: 0, 1: 0, 2: 0}
    people = {}
    for fn in glob.glob(ienputdir + '/people/*.json'):
        p = parseJSON(fn)
        if p['name'] in people.keys():
            cx[report(C.red('duplicate') + ' ' + C.yellow(p), 1)] += 1
            continue
        people[p['name']] = p
    print('{}: {} venues, {} papers written by {} people\n{}'.format(\
     C.purple('BibSLEIGH'),
     C.red(len(sleigh.venues)),
     C.red(sleigh.numOfPapers()),
     C.red(len(people)),
     C.purple('='*42)))
    # traverse ALL the papers!
    for v in sleigh.venues:
        for c in v.getConfs():
            for p in c.papers:
                if 'author' in p.json.keys():
                    for a in listify(p.json['author']):
                        if a in people.keys():
Ejemplo n.º 60
0
def report(fn1, fn2, r):
    """Report the status ``r`` of a ``fn1 → fn2`` pair and hand ``r`` back.

    ``r`` selects one of the PASS / FAIL / RENAME labels. The line is printed
    when the global ``verbose`` is set or ``r`` is non-zero.
    """
    labels = (C.blue(' PASS '), C.red(' FAIL '), C.yellow('RENAME'))
    # non-verbose mode by default
    if not verbose and r == 0:
        return r
    print('[ {} ] {} → {}'.format(labels[r], fn1, fn2))
    return r