Esempio n. 1
0
def report(fn, r):
    """Print a one-line verdict for ``fn`` and return the status code ``r``.

    ``r`` selects both the coloured label (0 PASS, 1 FAIL, 2 UNEX) and the
    explanatory suffix. A passing result (``r == 0``) is printed only when
    the module-level ``verbose`` flag is truthy.
    """
    labels = (C.blue('PASS'), C.red('FAIL'), C.yellow('UNEX'))
    notes = ('', '- no crossref found!', '- illegal crossref')
    # quiet by default: successes are skipped unless verbose is on
    if not verbose and r == 0:
        return r
    print('[ {0} ] {1} {2}'.format(labels[r], fn, notes[r]))
    return r
Esempio n. 2
0
def checkon(fn, o):
	"""Compare the JSON file behind ``fn`` with the in-memory entry ``o``.

	If ``fn`` does not exist or is a directory, ``'.json'`` is appended to it.
	A file that still does not exist is created as a minimal "proceedings"
	stub whose title and year are derived from the file name.

	Returns:
		0 if the stored lines equal those of ``o.getJSON()`` (compared after
		``strictstrip`` and sorting, first and last line excluded),
		1 if they differ, or if the file had a year above 3000 and was removed,
		2 if a stub file was just created.
	"""
	if not os.path.exists(fn) or os.path.isdir(fn):
		fn = fn + '.json'
	if not os.path.exists(fn):
		# if it still does not exist, let us create a minimal one;
		# the content is built first so a failure cannot leave an empty file
		base = fn.split('/')[-1]
		stub = '{{\n\t"title": "{name}",\n\t"type": "proceedings",\n\t"year": {year}\n}}'.format(
			name=base[:-5].replace('-', ' '),
			year=findYear(base))
		with open(fn, 'w', encoding='utf-8') as f:
			f.write(stub)
		print('[ {} ] {}'.format(C.yellow('MADE'), fn))
		return 2
	with open(fn, 'r', encoding='utf-8') as f:
		lines = f.readlines()[1:-1]
	for line in lines:
		# an implausible year means the file is junk: delete it
		if '"year"' in line and findYear(line) > 3000:
			os.remove(fn)
			print('[ {} ] {}'.format(C.red('KILL'), fn))
			return 1
	flines = sorted(strictstrip(s) for s in lines)
	plines = sorted(strictstrip(s) for s in o.getJSON().split('\n')[1:-1])
	if flines == plines:
		return 0
	# show what is only on disk versus only in memory
	f1 = [line for line in flines if line not in plines]
	f2 = [line for line in plines if line not in flines]
	print('∆:', f1, '\nvs', f2)
	return 1
Esempio n. 3
0
def report(fn, r):
	"""Announce the outcome ``r`` for ``fn`` and pass ``r`` through.

	Status 0 maps to PASS, 1 to FAIL (no crossref found) and 2 to UNEX
	(illegal crossref). Nothing is printed for status 0 unless the global
	``verbose`` flag is set.
	"""
	statuses = (C.blue('PASS'), C.red('FAIL'), C.yellow('UNEX'))
	special = ('', '- no crossref found!', '- illegal crossref')
	# non-verbose mode by default
	wanted = verbose or r != 0
	if wanted:
		line = '[ {} ] {} {}'.format(statuses[r], fn, special[r])
		print(line)
	return r
Esempio n. 4
0
def checkreport(m, o):
	"""Check ``o`` against the model ``m`` and print the verdict.

	The status returned by ``checkon(m, o)`` (0 PASS, 1 FAIL, 2 FIXD) is
	printed next to ``o.filename`` and returned. Status 0 is printed only
	when the global ``verbose`` flag is set.
	"""
	statuses = (C.blue('PASS'), C.red('FAIL'), C.yellow('FIXD'))
	r = checkon(m, o)
	# non-verbose mode by default
	if not verbose and r == 0:
		return r
	print('[ {0} ] {1}'.format(statuses[r], o.filename))
	return r
Esempio n. 5
0
def checkreport(fn, o):
	"""Run ``checkon(fn, o)`` and forward the verdict to ``report``.

	``report`` receives the coloured label for the status (0 PASS, 1 FAIL,
	2 FIXD) and ``fn``. It is skipped for status 0 unless the global
	``verbose`` flag is set. The status is returned.
	"""
	statuses = (C.blue('PASS'), C.red('FAIL'), C.yellow('FIXD'))
	r = checkon(fn, o)
	# non-verbose mode by default
	if not verbose and r == 0:
		return r
	label = statuses[r]
	report(label, fn)
	return r
Esempio n. 6
0
def checkon(fn, o):
	"""Compare the JSON file behind ``fn`` with the in-memory entry ``o``.

	If ``fn`` does not exist or is a directory, ``'.json'`` is appended to it.
	A file that still does not exist is created as a minimal "proceedings"
	stub whose title and year are derived from ``lastSlash(fn)``.

	Returns:
		0 if the stored lines equal those of ``o.getJSON()`` (compared after
		``strictstrip`` and sorting, first and last line excluded),
		1 if they differ, or if the file had a year above 3000 and was removed,
		2 if a stub file was just created.
	"""
	if not os.path.exists(fn) or os.path.isdir(fn):
		fn = fn + '.json'
	if not os.path.exists(fn):
		# if it still does not exist, let us create a minimal one;
		# the content is built first so a failure cannot leave an empty file
		base = lastSlash(fn)
		stub = '{{\n\t"title": "{name}",\n\t"type": "proceedings",\n\t"year": {year}\n}}'.format(
			name=base[:-5].replace('-', ' '),
			year=findYear(base))
		with open(fn, 'w', encoding='utf-8') as f:
			f.write(stub)
		print('[ {} ] {}'.format(C.yellow('MADE'), fn))
		return 2
	with open(fn, 'r', encoding='utf-8') as f:
		lines = f.readlines()[1:-1]
	for line in lines:
		# an implausible year means the file is junk: delete it
		if '"year"' in line and findYear(line) > 3000:
			os.remove(fn)
			print('[ {} ] {}'.format(C.red('KILL'), fn))
			return 1
	flines = sorted(strictstrip(s) for s in lines)
	plines = sorted(strictstrip(s) for s in o.getJSON().split('\n')[1:-1])
	if flines == plines:
		return 0
	# show what is only on disk versus only in memory
	f1 = [line for line in flines if line not in plines]
	f2 = [line for line in plines if line not in flines]
	print('∆:', f1, '\nvs', f2)
	return 1
Esempio n. 7
0
def checkreport(fn, o):
    """Run ``checkon`` on ``fn``/``o``, print the verdict and return its code.

    Codes map to labels as 0 PASS, 1 FAIL, 2 FIXD. A PASS is printed only
    when the global ``verbose`` flag is set.
    """
    r = checkon(fn, o)
    palette = (C.blue('PASS'), C.red('FAIL'), C.yellow('FIXD'))
    # quiet by default: only problems and fixes are shown
    if verbose or r != 0:
        message = '[ {0} ] {1}'.format(palette[r], fn)
        print(message)
    return r
Esempio n. 8
0
def checkreport(fn, o):
	"""Run ``checkon`` and print its status together with its message.

	``checkon(fn, o)`` is expected to yield a ``(status, message)`` pair;
	status 0 is PASS, 1 FAIL, 2 WARN. A PASS is printed only when the global
	``verbose`` flag is set. Only the status is returned.
	"""
	statuses = (C.blue('PASS'), C.red('FAIL'), C.yellow('WARN'))
	outcome = checkon(fn, o)
	r, msg = outcome
	# non-verbose mode by default
	if not verbose and r == 0:
		return r
	print('[ {0} ] {1}: {2}'.format(statuses[r], fn, msg))
	return r
Esempio n. 9
0
def checkreport(fn, o):
	"""Print and return the status for ``fn``.

	When ``o`` is an ``int`` it is taken as an already computed status;
	otherwise the status comes from ``checkon(fn, o)``. Labels are 0 PASS,
	1 FAIL, 2 FIXD, and a PASS is printed only when the global ``verbose``
	flag is set.
	"""
	statuses = (C.blue('PASS'), C.red('FAIL'), C.yellow('FIXD'))
	r = o if isinstance(o, int) else checkon(fn, o)
	# non-verbose mode by default
	if not verbose and r == 0:
		return r
	print('[ {0} ] {1}'.format(statuses[r], fn))
	return r
Esempio n. 10
0
def checkreport(fn, o, br):
	"""Check either a brand or a regular entry and print the verdict.

	A truthy ``br`` is checked with ``checkbrand(fn, br)``; otherwise ``o``
	is checked with ``checkon(fn, o)``. Labels are 0 PASS, 1 FAIL, 2 FIXD,
	and a PASS is printed only when the global ``verbose`` flag is set.
	The status is returned.
	"""
	statuses = (C.blue('PASS'), C.red('FAIL'), C.yellow('FIXD'))
	r = checkbrand(fn, br) if br else checkon(fn, o)
	# non-verbose mode by default
	if not verbose and r == 0:
		return r
	print('[ {0} ] {1}'.format(statuses[r], fn))
	return r
Esempio n. 11
0
def checkon(m, o):
	"""Reconcile entry ``o`` with the common model ``m`` and save the result.

	``m`` is adjusted in place first: paper-level types are lifted to their
	container type, ``crossref`` and ``booktitleshort`` are dropped and
	``booktitle`` becomes ``title``. Every key of ``m`` whose value differs
	from ``o.get(key)`` is then settled (via ``heurichoose`` when ``o``
	already has the key, otherwise the model value wins).

	Returns:
		0 if nothing differs, or if ``o.filename`` does not exist,
		1 if ``m`` is empty or the file on disk differs from ``o.getJSON()``,
		2 if ``o.json`` was updated and written back.
	"""
	# if no common model found, we failed
	if not m:
		return 1
	if 'type' in m and m['type'] in ('inproceedings', 'article'):
		m['type'] = 'proceedings'
	if 'type' in m and m['type'] == 'incollection':
		m['type'] = 'book'
	if 'crossref' in m:
		del m['crossref']
	if 'booktitle' in m:
		m['title'] = m['booktitle']
		del m['booktitle']
	if 'booktitleshort' in m:
		# TODO: ???
		del m['booktitleshort']
	# n collects the settled value of every conflicting key
	n = {}
	for k in m:
		if o.get(k) == m[k]:
			if verbose:
				print(C.blue('Confirmed:  '), k, 'as', m[k])
			continue
		if verbose:
			print(C.red('Conflicted: '), k, 'as', m[k], 'vs', o.get(k))
		v = heurichoose(k, m[k], o.json[k]) if k in o.json else m[k]
		if verbose:
			print(C.yellow('Settled for:'), v)
		n[k] = v
	if not n:
		# everything confirmed, nothing to fix
		return 0
	if not os.path.exists(o.filename):
		return 0
	fn = o.filename + '.json' if os.path.isdir(o.filename) else o.filename
	if os.path.exists(fn):
		with open(fn, 'r', encoding='utf-8') as f:
			ondisk = f.read()
		if ondisk != o.getJSON():
			# strange, should be equal (run all normalisers first!)
			return 1
	o.json.update(n)
	# serialise before opening so a failure cannot leave a truncated file
	text = o.getJSON()
	with open(fn, 'w', encoding='utf-8') as f:
		f.write(text)
	return 2
Esempio n. 12
0
def checkon(m, o):
	"""Reconcile entry ``o`` with the common model ``m`` and save the result.

	``m`` is adjusted in place first: paper-level types are lifted to their
	container type, ``crossref`` and ``booktitleshort`` are dropped and
	``booktitle`` becomes ``title``. Every key of ``m`` whose value differs
	from ``o.get(key)`` is then settled (via ``heurichoose`` when ``o``
	already has the key, otherwise the model value wins).

	Files are read and written as UTF-8 rather than in the platform's
	default encoding.

	Returns:
		0 if nothing differs, or if ``o.filename`` does not exist,
		1 if ``m`` is empty or the file on disk differs from ``o.getJSON()``,
		2 if ``o.json`` was updated and written back.
	"""
	# if no common model found, we failed
	if not m:
		return 1
	if 'type' in m and m['type'] in ('inproceedings', 'article'):
		m['type'] = 'proceedings'
	if 'type' in m and m['type'] == 'incollection':
		m['type'] = 'book'
	if 'crossref' in m:
		del m['crossref']
	if 'booktitle' in m:
		m['title'] = m['booktitle']
		del m['booktitle']
	if 'booktitleshort' in m:
		# TODO: ???
		del m['booktitleshort']
	# n collects the settled value of every conflicting key
	n = {}
	for k in m:
		if o.get(k) == m[k]:
			if verbose:
				print(C.blue('Confirmed:  '), k, 'as', m[k])
			continue
		if verbose:
			print(C.red('Conflicted: '), k, 'as', m[k], 'vs', o.get(k))
		v = heurichoose(k, m[k], o.json[k]) if k in o.json else m[k]
		if verbose:
			print(C.yellow('Settled for:'), v)
		n[k] = v
	if not n:
		# everything confirmed, nothing to fix
		return 0
	if not os.path.exists(o.filename):
		return 0
	fn = o.filename + '.json' if os.path.isdir(o.filename) else o.filename
	if os.path.exists(fn):
		with open(fn, 'r', encoding='utf-8') as f:
			ondisk = f.read()
		if ondisk != o.getJSON():
			# strange, should be equal (run all normalisers first!)
			return 1
	o.json.update(n)
	# serialise before opening so a failure cannot leave a truncated file
	text = o.getJSON()
	with open(fn, 'w', encoding='utf-8') as f:
		f.write(text)
	return 2
Esempio n. 13
0
	def __init__(self, d, hdir, name2file, parent):
		"""Load a venue from directory ``d`` together with its years and brands.

		``d + '.json'`` is the new-style venue description. Without it the
		venue is reported as legacy style and the first ``*.json`` found
		inside ``d`` is used instead. Every further ``*.json`` inside ``d``
		becomes a ``Brand``, every subdirectory a ``Year``, and each
		conference of each year is offered to all brands. Other files are
		reported as out of place.
		"""
		super(Venue, self).__init__(d, hdir)
		self.years = []
		self.brands = []
		self.n2f = name2file
		if os.path.exists(d+'.json'):
			# new style
			self.json = parseJSON(d+'.json')
		else:
			# legacy style
			print(C.red(d), 'is legacy style')
			# an empty dict, not a list: parseJSON yields dicts, and it is
			# still falsy for the "not loaded yet" test below
			self.json = {}
		for f in glob.glob(d+'/*.json'):
			if not self.json:
				self.json = parseJSON(f)
			else:
				self.brands.append(Brand(f, self.homedir, name2file, self))
		for f in glob.glob(d+'/*'):
			if f.endswith('.json'):
				# already processed
				continue
			elif os.path.isdir(f):
				y = Year(f, self.homedir, name2file, self)
				self.years.append(y)
				for b in self.brands:
					for c in y.confs:
						b.offer(y.year, c)
			else:
				print('File out of place:', f)
		self.back = parent
Esempio n. 14
0
 def __init__(self, d, hdir, name2file, parent):
     """Load a venue from directory ``d`` together with its years and brands.

     ``d + '.json'`` is the new-style venue description. Without it the
     venue is reported as legacy style and the first ``*.json`` found
     inside ``d`` is used instead. Every further ``*.json`` inside ``d``
     becomes a ``Brand``, every subdirectory a ``Year``, and each
     conference of each year is offered to all brands.
     """
     super(Venue, self).__init__(d, hdir)
     self.years = []
     self.brands = []
     self.n2f = name2file
     top_json = d + '.json'
     if os.path.exists(top_json):
         # new style
         self.json = parseJSON(top_json)
     else:
         # legacy style
         print(C.red(d), 'is legacy style')
         self.json = {}
     for path in glob.glob(d + '/*.json'):
         if self.json:
             self.brands.append(Brand(path, self.homedir, name2file, self))
         else:
             self.json = parseJSON(path)
     for path in glob.glob(d + '/*'):
         if path.endswith('.json'):
             # already processed
             continue
         if not os.path.isdir(path):
             print('File out of place:', path)
             continue
         year = Year(path, self.homedir, name2file, self)
         self.years.append(year)
         for brand in self.brands:
             for conf in year.confs:
                 brand.offer(year.year, conf)
     self.back = parent
Esempio n. 15
0
def processSortedRel(r):
	"""Render a sorted relation as nested HTML definition lists.

	``r`` is a list of single-key dicts ``[{"x": Y}]`` where ``Y`` is a
	string, a non-empty list of strings, or another sorted relation (a
	non-empty list of dicts). Strings are resolved to papers through
	``matchfromsleigh`` and counted in the global ``pcx``; nested relations
	are rendered recursively. Any other value, including an empty list, is
	reported and rendered with an empty body.

	Returns the rendered entries joined by newlines ('' for an empty ``r``).
	"""
	global pcx
	acc = []
	for el in r:
		ename = next(iter(el))
		evals = el[ename]
		if os.path.isfile(outputdir + '/stuff/' + ename.lower() + '.png'):
			img = '<img src="../stuff/{1}.png" alt="{0}" width="30px"/> '.format(ename, ename.lower())
		else:
			img = ''
		# guard the evals[0] probes below against an empty list
		filled = isinstance(evals, list) and bool(evals)
		if isinstance(evals, str) or (filled and isinstance(evals[0], str)):
			plst = sorted(matchfromsleigh(sleigh, evals), key=sortbypages)
			pcx += len(plst)
			ptxt = '<dl class="toc">'+'\n'.join([p.getItem() for p in plst])+'</dl>'
		elif filled and isinstance(evals[0], dict):
			ptxt = processSortedRel(evals)
		else:
			print(C.red('ERROR:'), 'unrecornised bundle structure', evals)
			# never reuse the previous entry's body (or hit an unbound name)
			ptxt = ''
		acc.append('<dl><dt>{}{}</dt><dd>{}</dl>'.format(img, ename, ptxt))
	return '\n'.join(acc)
Esempio n. 16
0
	def __init__(self, idir, name2file):
		"""Build the corpus root from ``idir``, one ``Venue`` per entry.

		Entries of ``idir`` ending in ``.md``, ``.json``, ``/frem`` or
		``/edif``, and names listed in ``skip4Now``, are ignored. An entry
		without a same-named top-level ``*.json`` is reported as a legacy
		definition but is still loaded.
		"""
		super(Sleigh, self).__init__('', idir)
		self.venues = []
		self.n2f = name2file
		jsons = {}
		skip4Now = []
		for path in glob.glob(idir+'/*.json'):
			stem = path.split('/')[-1].split('.')[0]
			if stem in skip4Now:
				print(C.red('Skipping') + ' ' + C.purple(path) + ' ' + C.red('for now'))
				continue
			jsons[stem] = path
		ignored = ('.md', '.json', '/frem', '/edif')
		for d in glob.glob(idir+'/*'):
			base = d.split('/')[-1]
			skip = d.endswith(ignored)
			if base in skip4Now:
				print(C.red('Skipping') + ' ' + C.purple(d) + ' ' + C.red('for now'))
				skip = True
			if skip:
				continue
			if base not in jsons:
				print(C.red('Legacy non-top definition of'), d)
			# legacy or not, the venue is loaded the same way
			self.venues.append(Venue(d, idir, name2file, self))
Esempio n. 17
0
 def __init__(self, idir, name2file):
     """Build the corpus root from ``idir``, one ``Venue`` per entry.

     Entries of ``idir`` ending in ``.md``, ``.json``, ``/frem`` or
     ``/edif``, and names listed in ``skip4Now``, are ignored. An entry
     without a same-named top-level ``*.json`` is reported as a legacy
     definition; it is still loaded unless it is named ``edif`` or ``frem``.
     """
     super(Sleigh, self).__init__('', idir)
     self.venues = []
     self.n2f = name2file
     jsons = {}
     skip4Now = []
     for path in glob.glob(idir + '/*.json'):
         stem = lastSlash(path).split('.')[0]
         if stem in skip4Now:
             print(C.red('Skipping') + ' ' + C.purple(path) + ' ' + C.red('for now'))
             continue
         jsons[stem] = path
     ignored = ('.md', '.json', '/frem', '/edif')
     for d in glob.glob(idir + '/*'):
         skip = d.endswith(ignored)
         if d.split('/')[-1] in skip4Now:
             print(C.red('Skipping') + ' ' + C.purple(d) + ' ' + C.red('for now'))
             skip = True
         if skip:
             continue
         name = lastSlash(d)
         known = name in jsons
         if not known:
             print(C.red('Legacy non-top definition of'), d)
         if known or name not in ('edif', 'frem'):
             self.venues.append(Venue(d, idir, name2file, self))
Esempio n. 18
0
def parseJSON(fn):
	"""Load the JSON object stored in ``fn`` and tag it with its origin.

	Returns the parsed object with an extra ``'FILE'`` key holding ``fn``.
	On a ``ValueError`` (malformed JSON) the problem is printed, with
	backslashes in the path shown as slashes, and ``{}`` is returned.
	"""
	try:
		# the context manager closes the handle even when parsing fails
		with open(fn, 'r', encoding='utf-8') as f:
			j = json.load(f)
	except ValueError:
		print(C.red('JSON parse error'), 'on', fn.replace('\\', '/'))
		return {}
	j['FILE'] = fn
	return j
Esempio n. 19
0
def parseJSON(fn):
	"""Load the JSON object stored in ``fn`` and tag it with its origin.

	The file is read as UTF-8 rather than in the platform's default
	encoding. Returns the parsed object with an extra ``'FILE'`` key holding
	``fn``. On a ``ValueError`` (malformed JSON) the problem is printed and
	``{}`` is returned.
	"""
	try:
		# the context manager closes the handle even when parsing fails
		with open(fn, 'r', encoding='utf-8') as f:
			j = json.load(f)
	except ValueError:
		print(C.red('JSON parse error'), 'on', fn)
		return {}
	j['FILE'] = fn
	return j
Esempio n. 20
0
def guessYear(P):
    """Guess the publication year for the key ``P``.

    If exactly one dash-separated part of ``P`` is a four-digit number, that
    number is the year. Otherwise the entry is looked up with
    ``sleigh.seekByKey`` and its ``'year'`` JSON field, or failing that its
    ``year`` attribute, is used.

    Returns the year, or 0 (after printing a YEAR notice) when the entry
    cannot be found or carries no year.
    """
    cys = [int(w) for w in P.split('-') if len(w) == 4 and w.isdigit()]
    if len(cys) == 1:
        return cys[0]
    j = sleigh.seekByKey(P)
    # the lookup yields None for an unknown key; treat that as "no year"
    if j is not None:
        if 'year' in j.json.keys():
            return j.get('year')
        if 'year' in dir(j):
            return j.year
    print('[ {} ] {}'.format(C.red('YEAR'), P))
    return 0
Esempio n. 21
0
def guessYear(p):
	"""Guess the publication year for the key ``p``.

	If exactly one dash-separated part of ``p`` is a four-digit number, that
	number is the year. Otherwise the entry is looked up with
	``sleigh.seekByKey`` and its ``'year'`` JSON field, or failing that its
	``year`` attribute, is used.

	Returns the year, or 0 (after printing a YEAR notice) when the entry
	cannot be found or carries no year.
	"""
	cys = [int(w) for w in p.split('-') if len(w) == 4 and w.isdigit()]
	if len(cys) == 1:
		return cys[0]
	j = sleigh.seekByKey(p)
	# the lookup yields None for an unknown key; treat that as "no year"
	if j is not None:
		if 'year' in j.json.keys():
			return j.get('year')
		if 'year' in dir(j):
			return j.year
	print('[ {} ] {}'.format(C.red('YEAR'), p))
	return 0
Esempio n. 22
0
def sortbypages(z):
	"""Sort key that orders papers by year, volume and first page.

	Returns 0 when ``z`` has no ``'pages'`` field (after printing a notice)
	or when its year is a string. Otherwise the result is the year, plus the
	volume when that is an int or a digit string, plus ``first page / 10000``
	when the first page is truthy.

	A volume of any other kind (for instance ``None``) is ignored instead of
	raising ``AttributeError``.
	"""
	if 'pages' not in z.json.keys():
		print(C.red('No pages at all in '+z.getKey()))
		return 0
	p1, _ = z.getPagesTuple()
	y = z.get('year')
	if isinstance(y, str):
		# non-correcting robustness
		return 0
	# a trick to have several volumes within one conference
	v = z.get('volume')
	if isinstance(v, int) or (isinstance(v, str) and v.isdigit()):
		y += int(v)
	return y + p1 / 10000. if p1 else y
Esempio n. 23
0
def sortbypages(z):
    """Sort key that orders papers by year, volume and first page.

    Returns 0 when ``z`` has no ``'pages'`` field (after printing a notice)
    or when its year is a string. Otherwise the result is the year, plus the
    volume when that is an int or a digit string, plus ``first page / 10000``
    when the first page is truthy.

    A volume of any other kind (for instance ``None``) is ignored instead of
    raising ``AttributeError``.
    """
    if 'pages' not in z.json.keys():
        print(C.red('No pages at all in ' + z.getKey()))
        return 0
    p1, _ = z.getPagesTuple()
    y = z.get('year')
    if isinstance(y, str):
        # non-correcting robustness
        return 0
    # a trick to have several volumes within one conference
    v = z.get('volume')
    if isinstance(v, int) or (isinstance(v, str) and v.isdigit()):
        y += int(v)
    return y + p1 / 10000. if p1 else y
Esempio n. 24
0
def checkon(fn, o):
    """Refresh the stemmed title words stored on entry ``o``.

    If ``fn`` does not exist or is a directory, ``'.json'`` is appended to it.
    Entries tagged german, french or portuguese are not stemmed; a leftover
    ``'stemmed'`` field on such an entry is removed. For all other entries
    the title is split with ``string2words``, stemmed with the English
    snowball stemmer, added to the global ``ALLSTEMS`` and stored under
    ``'stemmed'`` when it differs from the current value.

    Returns:
        0 if nothing changed, 1 if the entry has no title,
        2 if the entry was modified and written to ``fn``.
    """
    if not os.path.exists(fn) or os.path.isdir(fn):
        fn = fn + '.json'
    if 'title' not in o.json:
        if verbose:
            print('No title in', o.getKey())
        return 1  # no title
    # check for a different language - to avoid stemming altogether
    if o.tags and any(lang in o.tags for lang in ('german', 'french', 'portuguese')):
        if 'stemmed' not in o.json:
            return 0
        # if stemmed before marked foreign, remove this info
        del o.json['stemmed']
        # serialise before opening so a failure cannot leave a truncated file
        text = o.getJSON()
        with open(fn, 'w', encoding='utf-8') as F:
            F.write(text)
        return 2
    ### champion variant: snowballstemmer - runs in ~13.5s for 96027 titles
    stemmer = snowballstemmer.stemmer('english').stemWords
    ### disregarded variant: snowballstemmer porter - considered outdated
    # stemmer = snowballstemmer.stemmer('porter').stemWords
    ### disregarded variant: stemming - too slow, runs in ~33s for 96027 titles
    # stemmer = lambda xs: [stemming.porter2.stem(x) for x in xs]
    ### disregarded variant: nltk - worse on verbs ending with -ze
    # stemmer3 = lambda xs: [SnowballStemmer("english").stem(x) for x in xs]
    ### end variants
    words = string2words(o.get('title'))
    stemmed = stemmer(words)
    if '' in stemmed:
        print('“{}” is a title of {} and it has an empty word'.format(
            o.get('title'), C.red(o.getKey())))
        print(words)
        print(stemmed)
    ALLSTEMS.update(stemmed)
    if o.get('stemmed') == stemmed:
        return 0
    o.json['stemmed'] = stemmed
    text = o.getJSON()
    with open(fn, 'w', encoding='utf-8') as F:
        F.write(text)
    return 2
Esempio n. 25
0
def dblpify(s):
	"""Turn a full name into the ``Surname:Given_Names`` form used by DBLP.

	Names listed in the global ``dis`` are returned from there. A name
	without a space is reported, counted in ``cx[1]`` and returned as
	``dblpLatin(s) + ':'``. Otherwise the last word (the last two when the
	name ends in ``Jr`` or ``Jr.``) is the surname; in the rest, spaces
	become ``_`` and each of ``.``, ``'`` and ``-`` becomes ``=``.
	"""
	# http://dblp.uni-trier.de/pers/hd/e/Elbaum:Sebastian_G=
	if s in dis.keys():
		return dis[s]
	if ' ' not in s:
		print('[', C.red('NAME'), ']', 'Unconventional full name:', s)
		cx[1] += 1
		return dblpLatin(s)+':'
	words = s.split(' ')
	# a trailing "Jr" belongs to the surname
	cut = -2 if words[-1] in ('Jr', 'Jr.') else -1
	surname = dblpLatin(' '.join(words[cut:]))
	given = dblpLatin(' '.join(words[:cut])).replace(' ', '_')
	given = given.translate(str.maketrans(".'-", '==='))
	return surname+':'+given
Esempio n. 26
0
def checkon(fn, o):
	"""Refresh the stemmed title words stored on entry ``o``.

	If ``fn`` does not exist or is a directory, ``'.json'`` is appended to it.
	Entries tagged german, french or portuguese are not stemmed; a leftover
	``'stemmed'`` field on such an entry is removed. For all other entries
	the title is split with ``string2words``, stemmed with the English
	snowball stemmer, added to the global ``ALLSTEMS`` and stored under
	``'stemmed'`` when it differs from the current value.

	The file is written as UTF-8 rather than in the platform's default
	encoding.

	Returns:
		0 if nothing changed, 1 if the entry has no title,
		2 if the entry was modified and written to ``fn``.
	"""
	if not os.path.exists(fn) or os.path.isdir(fn):
		fn = fn + '.json'
	if 'title' not in o.json:
		if verbose:
			print('No title in', o.getKey())
		return 1 # no title
	# check for a different language - to avoid stemming altogether
	if o.tags and any(lang in o.tags for lang in ('german', 'french', 'portuguese')):
		if 'stemmed' not in o.json:
			return 0
		# if stemmed before marked foreign, remove this info
		del o.json['stemmed']
		# serialise before opening so a failure cannot leave a truncated file
		text = o.getJSON()
		with open(fn, 'w', encoding='utf-8') as F:
			F.write(text)
		return 2
	### champion variant: snowballstemmer - runs in ~13.5s for 96027 titles
	stemmer = snowballstemmer.stemmer('english').stemWords
	### disregarded variant: snowballstemmer porter - considered outdated
	# stemmer = snowballstemmer.stemmer('porter').stemWords
	### disregarded variant: stemming - too slow, runs in ~33s for 96027 titles
	# stemmer = lambda xs: [stemming.porter2.stem(x) for x in xs]
	### disregarded variant: nltk - worse on verbs ending with -ze
	# stemmer3 = lambda xs: [SnowballStemmer("english").stem(x) for x in xs]
	### end variants
	words = string2words(o.get('title'))
	stemmed = stemmer(words)
	if '' in stemmed:
		print('“{}” is a title of {} and it has an empty word'.format(o.get('title'), C.red(o.getKey())))
		print(words)
		print(stemmed)
	ALLSTEMS.update(stemmed)
	if o.get('stemmed') == stemmed:
		return 0
	o.json['stemmed'] = stemmed
	text = o.getJSON()
	with open(fn, 'w', encoding='utf-8') as F:
		F.write(text)
	return 2
Esempio n. 27
0
 def seekByKey(self, key):
     """Find the entry with the given ``key`` among ``self.venues``.

     The venues whose own key equals the part of ``key`` before the first
     dash are asked first; if none of them finds it, every venue is asked.
     The first truthy answer is returned. When nothing is found a notice
     is printed and the last (falsy) answer is returned, or ``None`` if
     there are no venues.
     """
     found = None
     head = key.split('-')[0]
     # pass 1 is the shortcut over matching venues, pass 2 the brute force
     for shortcut in (True, False):
         for venue in self.venues:
             if shortcut and not venue.getKey() == head:
                 continue
             found = venue.seekByKey(key)
             if found:
                 return found
     print(C.red(key), ' not found in BibSLEIGH!')
     return found
Esempio n. 28
0
def checkon(fn, o):
    """Normalise the link-like fields of entry ``o`` and save any change.

    If ``fn`` does not exist or is a directory, ``'.json'`` is appended to it.
    The following is applied to ``o.json`` (even when 1 is returned):

    * ``url``: DBLP-relative links are dropped and https becomes http;
    * ``ee`` (only when there is no ``doi`` yet): known link shapes are
      turned into ``doi``, ``acmid``, ``ieeearid``, ``ieeepuid`` or
      ``ieeeisid`` fields, some links are rewritten, the rest are kept;
    * ``eventuri`` is renamed to ``eventurl``, and an https ``eventurl``
      becomes http.

    Returns:
        0 if nothing changed,
        1 if the file on disk does not match ``o`` as it was on entry,
        2 if ``o`` changed and was written to ``fn``.
    """
    if not os.path.exists(fn) or os.path.isdir(fn):
        fn = fn + '.json'
    with open(fn, 'r', encoding='utf-8') as f:
        lines = f.readlines()[1:-1]
    flines = json2lines(lines)
    plines = sorted(json2lines(o.getJSON().split('\n')))
    # "url" from DBLP are useless
    if 'url' in o.json:
        dblp_prefixes = ('db/conf/', 'db/series/', 'db/books/', 'db/journals/')
        urls = [link.replace('https://', 'http://')
                for link in listify(o.json['url'])
                if not link.startswith(dblp_prefixes)]
        if not urls:
            del o.json['url']
        elif len(urls) == 1:
            o.json['url'] = urls[0]
        else:
            o.json['url'] = urls
    if 'ee' in o.json and 'doi' not in o.json:
        if isinstance(o.json['ee'], list):
            if verbose:
                print(C.red('Manylink:'), o.json['ee'])
        # links that survive as links; recognised identifiers become fields
        newee = []
        for onelink in listify(o.json['ee']):
            if onelink.startswith('http://dx.doi.org/'):
                o.json['doi'] = onelink[18:]
            elif onelink.startswith('http://doi.acm.org/'):
                o.json['doi'] = onelink[19:]
            elif onelink.startswith('http://doi.ieeecomputersociety.org/'):
                o.json['doi'] = onelink[35:]
            elif onelink.startswith('http://dl.acm.org/citation.cfm?id='):
                o.json['acmid'] = onelink[34:]
            elif onelink.startswith('http://portal.acm.org/citation.cfm?id='):
                o.json['acmid'] = onelink[38:]
            elif onelink.startswith((
                    'http://ieeexplore.ieee.org/xpl/freeabs_all.jsp?arnumber=',
                    'http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=')):
                o.json['ieeearid'] = onelink.split('=')[-1]
            elif onelink.startswith('http://ieeexplore.ieee.org/xpls/abs_all.jsp?isnumber=') \
                    and 'arnumber' in onelink:
                o.json['ieeearid'] = onelink.split('arnumber=')[-1].split('&')[0]
            elif onelink.startswith(
                    'http://ieeexplore.ieee.org/xpl/mostRecentIssue.jsp?punumber='):
                o.json['ieeepuid'] = onelink.split('=')[-1]
            elif onelink.startswith(
                    'http://ieeexplore.ieee.org/xpl/tocresult.jsp?isnumber='):
                o.json['ieeeisid'] = onelink.split('=')[-1]
            elif onelink.startswith(
                    'http://eceasst.cs.tu-berlin.de/index.php/eceasst/article/view/'):
                newee.append(
                    'http://journal.ub.tu-berlin.de/eceasst/article/view/' +
                    onelink.split('/')[-1])
            elif onelink.endswith('.pdf') and onelink.startswith((
                    'http://computer.org/proceedings/',
                    'http://csdl.computer.org/')):
                # Bad: http://computer.org/proceedings/icsm/1189/11890007.pdf
                # Bad: http://csdl.computer.org/comp/proceedings/date/2003/1870/02/187020040.pdf
                # Good: http://www.computer.org/csdl/proceedings/icsm/2001/1189/00/11890004.pdf
                if onelink.startswith('http://csdl'):
                    cname, _, cid, mid, pid = onelink.split('/')[5:10]
                else:
                    cname, cid, pid = onelink.split('/')[4:7]
                    # heuristic
                    if pid.startswith(cid):
                        mid = pid[len(cid):len(cid) + 2]
                    else:
                        mid = '00'
                newee.append('http://www.computer.org/csdl/proceedings/{}/{}/{}/{}/{}'.format(
                    cname,
                    o.get('year'),
                    cid,
                    mid,
                    pid))
            else:
                if 'ieee' in onelink:
                    print(C.purple('IEEE'), onelink)
                if verbose:
                    print(C.yellow('Missed opportunity:'), onelink)
                # nothing matches => preserve
                newee.append(onelink)
        if len(newee) == 0:
            del o.json['ee']
        elif len(newee) == 1:
            o.json['ee'] = newee[0]
        else:
            o.json['ee'] = newee
        # post-processing normalisation
        if 'acmid' in o.json and not isinstance(
                o.json['acmid'], int) and o.json['acmid'].isdigit():
            o.json['acmid'] = int(o.json['acmid'])
    if 'eventuri' in o.json:
        o.json['eventurl'] = o.json['eventuri']
        del o.json['eventuri']
    if 'eventurl' in o.json and o.json['eventurl'].startswith('https://'):
        o.json['eventurl'] = o.json['eventurl'].replace('https://', 'http://')
    nlines = sorted(json2lines(o.getJSON().split('\n')))
    if flines != plines:
        return 1
    if plines != nlines:
        # serialise before opening so a failure cannot leave a truncated file
        text = o.getJSON()
        with open(fn, 'w', encoding='utf-8') as f:
            f.write(text)
        return 2
    return 0
Esempio n. 29
0
	# non-verbose mode by default
	if verbose or r != 0:
		print('[ {} ] {}'.format(statuses[r], fn))
	return r

def two(n):
	"""Format ``n`` as text, prefixed with ``'0'`` when it is below ten."""
	prefix = '0' if n < 10 else ''
	return prefix + '{}'.format(n)

if __name__ == "__main__":
	verbose = sys.argv[-1] == '-v'
	peoplez = glob.glob(ienputdir + '/people/*.json')
	print('{}: {} venues, {} papers by {} people\n{}'.format(\
		C.purple('BibSLEIGH'),
		C.red(len(sleigh.venues)),
		C.red(sleigh.numOfPapers()),
		C.red(len(peoplez)),
		C.purple('='*42)))
	cx = {0: 0, 1: 0, 2: 0}
	# stem ALL the papers!
	for v in sleigh.venues:
		for c in v.getConfs():
			for p in c.papers:
				cx[checkreport(p.filename, p, None)] += 1
		for b in v.getBrands():
			cx[checkreport(b.filename, None, b)] += 1
	# write all stems
	listOfStems = sorted(filter(ifApproved, ALLSTEMS), key=lambda w: two(len(w)) + w)
	f = open(ienputdir + '/stems.json', 'w')
Esempio n. 30
0
	else:
		return 0

def checkreport(fn, o):
	"""Check ``o`` via ``checkon(fn, o)``, print the verdict, return the code.

	Codes map to labels as 0 PASS, 1 FAIL, 2 FIXD. A PASS is printed only
	when the global ``verbose`` flag is set.
	"""
	statuses = (C.blue('PASS'), C.red('FAIL'), C.yellow('FIXD'))
	r = checkon(fn, o)
	# non-verbose mode by default
	if not verbose and r == 0:
		return r
	print('[ {0} ] {1}'.format(statuses[r], fn))
	return r

if __name__ == "__main__":
	if len(sys.argv) > 1:
		verbose = sys.argv[1] == '-v'
	print('{}: {} venues, {} papers\n{}'.format(\
		C.purple('BibSLEIGH'),
		C.red(len(sleigh.venues)),
		C.red(sleigh.numOfPapers()),
		C.purple('='*42)))
	# read the CSV
	f = open('scrap-committees/scraped-by-grammarware.csv', 'r')
	# CBSE;2001;Heinz;Schmidt;;Organising Committee
	for line in f.readlines():
		vs = line.strip().split(';')
		if len(vs) == 0:
			continue
		v = vs[0] + '-' + vs[1]
		n = vs[2] + ' ' + vs[3]
		# normalise!
		if n in renameto.keys():
			print('[', C.yellow('ALIA'), ']', 'Treating', n, 'as', renameto[n])
Esempio n. 31
0
def checkon(fn, o):
	"""Normalise the name-like fields of entry ``o`` and save any change.

	If ``fn`` does not exist or is a directory, ``'.json'`` is appended to it.
	For the keys listed in the global ``wheretolook`` the entry is adjusted
	(even when 1 is returned): known bad variants are unfolded, contractions
	are expanded with the short form kept under ``<key>short``, trailing dots
	are removed, suspicious dots are reported and short forms equal to the
	long form are dropped. In addition a ``series`` equal to the ``journal``
	is removed, orphaned ``*short`` keys are deleted and Springer publisher
	names are set according to the year.

	The file is read and written as UTF-8 rather than in the platform's
	default encoding.

	Returns:
		0 if nothing changed,
		1 if the file on disk does not match ``o`` as it was on entry,
		2 if ``o`` changed and was written to ``fn``.
	"""
	if not os.path.exists(fn) or os.path.isdir(fn):
		fn = fn + '.json'
	with open(fn, 'r', encoding='utf-8') as f:
		lines = f.readlines()[1:-1]
	flines = json2lines(lines)
	plines = sorted(json2lines(o.getJSON().split('\n')))
	# bad variants
	for bad in unfoldName:
		for key in wheretolook:
			if o.get(key) == bad:
				o.json[key] = unfoldName[bad]
	# contractions
	for short in short2long:
		for key in wheretolook:
			if o.get(key) == short:
				o.json[key] = short2long[short]
			if o.get(key) == short2long[short]:
				o.json[key+'short'] = short
	# a heuristic contraction for conference names
	if o.get('type') == 'inproceedings' \
	and 'booktitleshort' not in o.json.keys() \
	and 'booktitle' in o.up().json.keys() \
	and len(o.get('booktitle')) > len(o.up().get('booktitle')):
		o.json['booktitleshort'] = o.up().get('booktitle')
	# a heuristic expansion of conference names
	# if o.get('type') == 'proceedings' \
	# and 'booktitleshort' not in o.json.keys() \
	# and 'booktitle' in o.up().json.keys() \
	# and len(o.get('booktitle')) > len(o.up().get('booktitle')):
	# 	o.json['booktitleshort'] = o.up().get('booktitle')
	# remove faulty series: journal wins
	if 'series' in o.json and 'journal' in o.json and o.get('series') == o.get('journal'):
		del o.json['series']
	# *short legacy while no longer version present
	for key in [k for k in o.json.keys() if k.endswith('short') and k[:-5] not in o.json.keys()]:
		del o.json[key]
	# Springer name change
	if o.get('publisher').find('Springer') > -1 and 'year' in o.json.keys():
		if int(o.get('year')) < 2002:
			o.json['publisher'] = 'Springer-Verlag'
			o.json['publishershort'] = 'Springer'
		else:
			o.json['publisher'] = 'Springer International Publishing'
			o.json['publishershort'] = 'Springer'
	for key in wheretolook:
		if key not in o.json:
			continue
		val = o.get(key)
		# ends with a dot
		if val.endswith('.'):
			o.json[key] = o.json[key][:-1]
			continue
		# suspiciousness
		if val.find('.') > -1:
			problem = True
			for ok in ('. Volume', 'CEUR-WS.org', 'icml.cc', 'JMLR.org', 'Vol. ', '. Part', \
				' Inc. ', 'WG2.8'):
				if val.find(ok) > -1:
					problem = False
					break
			if problem:
				report(C.yellow('LOOK'), key + ' of ' + o.getKey() + ' is “' + o.get(key) + '”')
		# superfluousness
		if key+'short' in o.json.keys() and val == o.get(key+'short'):
			del o.json[key+'short']
	nlines = sorted(json2lines(o.getJSON().split('\n')))
	if flines != plines:
		return 1
	if plines != nlines:
		# serialise before opening so a failure cannot leave a truncated file
		text = o.getJSON()
		with open(fn, 'w', encoding='utf-8') as f:
			f.write(text)
		return 2
	return 0
Esempio n. 32
0
        else:
            paperPdf = ''
        paperEntry = {'type': 'inproceedings', 'series': 'CEUR Workshop Proceedings',\
         'publisher': 'CEUR-WS.org', 'year': volYear, 'booktitle': volTitles[-1],\
         'editor': volEds, 'volume': volNr.split('-')[-1], 'title': paperTitle,\
         'author': paperAuths, 'pages': paperPages, 'venue': volVenue}
        if paperPdf:
            paperEntry['openpdf'] = paperPdf
        if paperLnk:
            paperEntry['url'] = urlstart + '#' + paperLnk
        paperFilename = lastSlash(outputdir) + '-' + paperAuths[0].split(
            ' ')[-1]
        for a in paperAuths[1:]:
            print(a)
            paperFilename += a.split(' ')[-1][0]
        if paperFilename in done:
            paperFilename += 'a'
            while paperFilename in done:
                paperFilename = paperFilename[:-1] + chr(
                    ord(paperFilename[-1]) + 1)
        # print(jsonify(paperEntry), '-->', outputdir+'/'+paperFilename+'.json')
        f = open(outputdir + '/' + paperFilename + '.json',
                 'w',
                 encoding='utf-8')
        f.write(jsonify(paperEntry))
        f.close()
        cx += 1
        done.append(paperFilename)
    print(C.red(volVenue), '-', C.yellow(volTitles[-1]), '-', C.blue(cx),
          'papers.')
Esempio n. 33
0
def report(fn1, fn2, r):
	"""Print the verdict for moving ``fn1`` to ``fn2`` and return ``r``.

	Codes map to labels as 0 PASS, 1 FAIL, 2 RENAME. A PASS is printed only
	when the global ``verbose`` flag is set.
	"""
	statuses = (C.blue(' PASS '), C.red(' FAIL '), C.yellow('RENAME'))
	# non-verbose mode by default
	if not verbose and r == 0:
		return r
	print('[ {0} ] {1} → {2}'.format(statuses[r], fn1, fn2))
	return r
Esempio n. 34
0
		return 2
	else:
		return 0

def checkreport(fn, o):
	"""Print and return the status that ``checkon(fn, o)`` yields.

	Codes map to labels as 0 PASS, 1 FAIL, 2 FIXD. A PASS is printed only
	when the global ``verbose`` flag is set.
	"""
	r = checkon(fn, o)
	palette = (C.blue('PASS'), C.red('FAIL'), C.yellow('FIXD'))
	# quiet by default: only problems and fixes are shown
	if verbose or r != 0:
		message = '[ {0} ] {1}'.format(palette[r], fn)
		print(message)
	return r

if __name__ == "__main__":
	verbose = sys.argv[-1] == '-v'
	print('{}: {} venues, {} papers\n{}'.format(\
		C.purple('BibSLEIGH'),
		C.red(len(sleigh.venues)),
		C.red(sleigh.numOfPapers()),
		C.purple('='*42)))
	aka = parseJSON(ienputdir + '/aliases.json')
	CX = sum([len(aka[a]) for a in aka])
	# self-adaptation heuristic:
	#  if a manual rule does the same as the other heuristic, it’s dumb
	for a in sorted(aka.keys()):
		if len(aka[a]) == 1 and aka[a][0] in (nodiaLatin(a), simpleLatin(a)):
			print('[ {} ]'.format(C.blue('DUMB')), a, 'aliasing was unnecessary manual work')
		elif len(aka[a]) == 2 and (aka[a] == [nodiaLatin(a), simpleLatin(a)] \
							    or aka[a] == [simpleLatin(a), nodiaLatin(a)]):
			print('[ {} ]'.format(C.blue('DUMB')), a, 'aliasing was a lot of unnecessary manual work')
		elif nodiaLatin(a) in aka[a] or simpleLatin(a) in aka[a]:
			print('[ {} ]'.format(C.blue('DUMB')), a, 'aliasing contains some unnecessary manual work')
Esempio n. 35
0
from fancy.ANSI import C
from fancy.Templates import wordlistHTML, wordHTML
from lib.AST import Sleigh, escape
from lib.JSON import parseJSON
from lib.NLP import ifApproved
from collections import Counter

# Root of the JSON data; the corpus is read from its 'corpus' subdirectory.
ienputdir = '../json'
# Root under which the generated HTML pages are written.
outputdir = '../frontend'
# Optional mapping file, looked up relative to the current working directory
# (presumably maps names to files -- TODO confirm against lib.AST).
n2f_name = '_name2file.json'
# Falls back to an empty mapping when the file is absent.
name2file = parseJSON(n2f_name) if os.path.exists(n2f_name) else {}
# Built at import time from the corpus directory.
sleigh = Sleigh(ienputdir + '/corpus', name2file)

if __name__ == "__main__":
	print('{}: {} venues, {} papers\n{}'.format(\
		C.purple('BibSLEIGH'),
		C.red(len(sleigh.venues)),
		C.red(sleigh.numOfPapers()),
		C.purple('='*42)))
	stems = sleigh.getStems()
	tagged = []
	for k in stems.keys():
		f = open('{}/word/{}.html'.format(outputdir, k), 'w', encoding='utf-8')
		# papers are displayed in reverse chronological order
		lst = [x.getIItem() for x in \
			sorted(stems[k], key=lambda z: -z.json['year'] if 'year' in z.json.keys() else 0)]
		# collect other stems
		# NB: do not use the following code, slows everything down from 1 minute to 161 minutes
		# allstems = []
		# for x in stems[k]:
		# 	allstems += x.getBareStems()
Esempio n. 36
0
def _clean_person_name(name):
    """Normalise a single author/editor name that came from a list value."""
    # double spaces
    if '  ' in name:
        name = name.replace('  ', ' ').strip()
    # remove DBLP disambiguation: we might later regret it
    # but the information can be always re-retrieved
    parts = name.split(' ')
    if parts[-1].isdigit():
        parts = parts[:-1]
    name = ' '.join(parts)
    # the case of "Jr" vs "Jr."
    if name.endswith(' Jr'):
        name += '.'
    return name


def checkon(fn, o):
    """Normalise the values of entry ``o`` and save it to ``fn`` if it changed.

    String values get typographic clean-up (em dashes in titles, plain
    hyphens in pages, collapsed double spaces, digits turned into ints,
    curly quotes and apostrophes); list values are inlined when trivial and
    author/editor names lose their DBLP disambiguation numbers.  Strings
    that still contain a backtick or a straight quote are reported and the
    entry's filename is appended to the module-level ``lookat`` list.

    ``fn`` gets a '.json' suffix when it does not exist or is a directory.

    Returns 2 when the serialised entry changed and was written to ``fn``,
    0 when nothing changed.
    """
    if not os.path.exists(fn) or os.path.isdir(fn):
        fn = fn + '.json'
    plines = sorted([strictstrip(s) for s in o.getJSON().split('\n')[1:-1]])
    # An entry without a type used to print this message and then crash with
    # a KeyError on o.json['type']; report it once and keep normalising.
    if o.json and 'type' not in o.json:
        print('TERRIBLE', o.getKey())
    kind = o.json['type'] if 'type' in o.json else None
    for k in o.json.keys():
        if isinstance(o.json[k], str):
            v = o.json[k]
            if (kind == 'proceedings' and k == 'title') or \
               (kind == 'inproceedings' and k == 'booktitle'):
                # fix numbers
                for nr in nrs.keys():
                    v = v.replace(' ' + nr + ' ', ' ' + nrs[nr] + ' ')
            # add emdashes for fancier titles
            if k in ('title', 'booktitle'):
                v = v.replace(' - ', ' — ').replace(' -- ', ' — ')
                # Nice heuristic to run from time to time, but reports too much
                # on stuff like “eXtreme” and “jPET”
                # if o.json[k][0].islower():
                # 	print('[ {} ] {}: {} {}'.format(C.red('LOOK'), o.getKey(), 'title is', o.get('title')))
            # normalised pages
            if k == 'pages':
                v = v.replace('–', '-').replace('--', '-').replace('−', '-')
            # double spaces
            if '  ' in v:
                v = v.replace('  ', ' ').strip()
            # find numeric values, turn them into proper integers
            if v.isdigit():
                o.json[k] = int(v)
                continue
            # at most one of the following rewrites is applied per value
            # remove confix curlies
            if v.startswith('{') and v.endswith('}'):
                v = v[1:-1]
            # single quotes to double quotes
            elif " '" in v and "' " in v:
                v = v.replace(" '", ' "').replace("' ", '" ')
            elif " '" in v and v.endswith("'"):
                v = v.replace(" '", ' "').replace("'", '"')
            elif "' " in v and v.startswith("'"):
                v = v.replace("' ", '" ').replace("'", '"')
            # fancify bland quotes
            elif ' "' in v and '" ' in v:
                v = v.replace(' "', ' “').replace('" ', '” ')
            elif ' "' in v and v.endswith('"'):
                v = v.replace(' "', ' “').replace('"', '”')
            elif '" ' in v and v.startswith('"'):
                v = v.replace('" ', '” ').replace('"', '“')
            # fancify LaTeX quotes
            elif ' ``' in v and "'' " in v:
                v = v.replace("'' ", '” ').replace(' ``', ' “')
            elif ' ``' in v and v.endswith("''"):
                v = v.replace("''", '”').replace(' ``', ' “')
            elif "'' " in v and v.startswith('``'):
                v = v.replace("'' ", '” ').replace('``', '“')
            elif v.startswith('``') and v.endswith("''"):
                v = '“' + v[2:-2] + '”'
            # plural possessive
            elif "'s" in v:
                v = v.replace("'s", '’s')
            elif "s' " in v:
                v = v.replace("s'", 's’')
            # contractions
            elif "n't" in v:
                v = v.replace("n't", 'n’t')
            # the case of "Jr" vs "Jr."
            if k in ('author', 'editor') and v.endswith('Jr'):
                v += '.'
            o.json[k] = v
            # TODO: report remaining suspicious activity
            for c in '`"\'':  # ’ is ok
                if c in v and k not in ('author', 'editor'):
                    print('[ {} ] {}: {} is “{}”'.format(
                        C.red('LOOK'), o.getKey(), k, v))
                    lookat.append(o.filename)
        elif isinstance(o.json[k], list):
            v = o.json[k]
            # inline trivial lists
            if len(v) == 1:
                v = v[0]
            # inline hidden trivial lists; the value may already have been
            # inlined to a scalar above, in which case it must not be
            # measured or indexed as if it were still a list
            if isinstance(v, list) and len(v) == 2 and v[0] == v[1] \
            and k not in ('stemmed', 'tag', 'tagged'):
                v = v[0]
            # unless it’s 'tagged', which is wrapped back into a list
            if k == 'tagged' and (not isinstance(v, list)
                                  or (v and not isinstance(v[0], list))):
                v = [v]
            if k in ('author', 'editor'):
                if isinstance(v, list):
                    v = [_clean_person_name(a) for a in v]
                elif isinstance(v, str):
                    # a single name inlined above: clean it as one name
                    # instead of iterating over its characters
                    v = _clean_person_name(v)
            o.json[k] = v
    nlines = sorted([strictstrip(s) for s in o.getJSON().split('\n')[1:-1]])
    if plines == nlines:
        return 0
    with open(fn, 'w', encoding='utf-8') as f:
        f.write(o.getJSON())
    return 2
Esempio n. 37
0
			y = v
			v = 'http://'+v
		else:
			y = v.replace('http://', '').replace('https://', '')
		r = '<a href="{0}">{1}</a>'.format(v, y)
	elif k == 'aka':
		ico = ''
		r = '<br/>'.join(['a.k.a.: “{}”'.format(z) for z in listify(v)])
	else:
		ico = ''
		r = '?{}?{}?'.format(k, v)
	return ico + ' ' + r + '<br/>'

if __name__ == "__main__":
	print('{}: {} venues, {} papers\n{}'.format(\
		C.purple('BibSLEIGH'),
		C.red(len(sleigh.venues)),
		C.red(sleigh.numOfPapers()),
		C.purple('='*42)))
	ts = sleigh.getTags()
	tagged = []
	for key in ts.keys():
		f = open('{}/tag/{}.html'.format(outputdir, key), 'w')
		# papers are displayed in reverse chronological order
		lst = [x.getRestrictedItem(key) for x in \
			sorted(ts[key], key=lambda z: -z.json['year'] if 'year' in z.json.keys() else 0)]
		# no comprehension possible for this case
		for x in ts[key]:
			if x not in tagged:
				tagged.append(x)
		# read tag definition
Esempio n. 38
0
CONFZ = {\
 'FOSE': 'FoSE',
'MODELS': 'MoDELS'\
}

BLANK = '     '
lines = []
cur = ''
for fn in sys.argv[1:-1]:
    if cur != fn.split('-')[0]:
        if cur != '':
            print()
        name = fn.split('-')[0].upper()
        if name in CONFZ:
            name = CONFZ[name]
        print('[{}]'.format(C.green(name)), end=': ')
        cur = fn.split('-')[0]
    print("'{}".format(fn.split('-')[-1][-6:-4]), end=' ')
    f = open(fn, 'r', encoding='utf-8')
    lines += [(fn, line[:10], line[10:].strip()) for line in f.readlines()\
     if line.strip() \
     and line[:10] != '          ' \
     and not line.startswith('##########')]
    f.close()
print()

succ = fail = 0
males = set(
    line.strip()
    for line in open('../naming/male.txt', 'r', encoding='utf-8').readlines())
femes = set(line.strip() for line in open(
Esempio n. 39
0
# Root of the JSON data; conferences are looked up under its 'corpus' subdirectory.
ienputdir = '../json'
# Overwritten in the main block: True when the last command-line argument is '-v'.
verbose = False


def report(fn1, fn2, r):
    """Print the outcome of renaming ``fn1`` to ``fn2`` and return ``r``.

    ``r`` selects the label: 0 is PASS, 1 is FAIL, 2 is RENAME.  A PASS is
    shown only when the module-level ``verbose`` flag is set.
    """
    labels = (C.blue(' PASS '), C.red(' FAIL '), C.yellow('RENAME'))
    silent_pass = r == 0 and not verbose
    if not silent_pass:
        line = '[ {} ] {} → {}'.format(labels[r], fn1, fn2)
        print(line)
    return r


if __name__ == "__main__":
    print('{} conference renamer\n{}'.format(\
     C.purple('BibSLEIGH'),
     C.purple('='*42)))
    if len(sys.argv) < 2:
        print('Usage:\n\t{} [<DIR>]'.format(sys.argv[0]))
        sys.exit(1)
    verbose = sys.argv[-1] == '-v'
    if sys.argv[1].startswith(ienputdir):
        path = sys.argv[1]
        name = path.replace(ienputdir + '/corpus/', '')
        namem = lastSlash(name)
    else:
        name = sys.argv[1]
        path = ienputdir + '/corpus/' + name
        namem = lastSlash(name)
    cx = {0: 0, 1: 0, 2: 0}
    if not os.path.exists(path):
Esempio n. 40
0
def report(s, r):
    """Print ``s`` (passed through ``simpleLatin``) with a status label; return ``r``.

    ``r`` selects the label: 0 is PASS, 1 is FAIL, 2 is FIXD.  A PASS is
    shown only when the module-level ``verbose`` flag is set.
    """
    labels = (C.blue('PASS'), C.red('FAIL'), C.yellow('FIXD'))
    if r == 0 and not verbose:
        # quiet by default: passing items are not listed
        return r
    print('[ {} ] {}'.format(labels[r], simpleLatin(s)))
    return r
Esempio n. 41
0
    statuses = (C.blue('PASS'), C.red('FAIL'), C.yellow('FIXD'))
    # non-verbose mode by default
    if verbose or r != 0:
        print('[ {} ] {}'.format(statuses[r], simpleLatin(s)))
    return r


if __name__ == "__main__":
    verbose = sys.argv[-1] == '-v'
    # All known contributors
    cx = {0: 0, 1: 0, 2: 0}
    people = {}
    for fn in glob.glob(ienputdir + '/people/*.json'):
        p = parseJSON(fn)
        if p['name'] in people.keys():
            cx[report(C.red('duplicate') + ' ' + C.yellow(p), 1)] += 1
            continue
        people[p['name']] = p
    print('{}: {} venues, {} papers written by {} people\n{}'.format(\
     C.purple('BibSLEIGH'),
     C.red(len(sleigh.venues)),
     C.red(sleigh.numOfPapers()),
     C.red(len(people)),
     C.purple('='*42)))
    # traverse ALL the papers!
    for v in sleigh.venues:
        for c in v.getConfs():
            for p in c.papers:
                if 'author' in p.json.keys():
                    for a in listify(p.json['author']):
                        if a in people.keys():
Esempio n. 42
0
    else:
        return 0


def checkreport(fn, o):
    """Run ``checkon(fn, o)``, print its outcome for ``fn`` and return it.

    The result code selects the label: 0 is PASS, 1 is FAIL, 2 is FIXD.
    A PASS is shown only when the module-level ``verbose`` flag is set.
    """
    labels = (C.blue('PASS'), C.red('FAIL'), C.yellow('FIXD'))
    outcome = checkon(fn, o)
    if outcome == 0 and not verbose:
        # quiet by default: passing files are not listed
        return outcome
    print('[ {} ] {}'.format(labels[outcome], fn))
    return outcome


if __name__ == "__main__":
    if len(sys.argv) < 2:
        print(C.purple('BibSLEIGH'), sys.argv[0], 'requires a limit to work.')
        sys.exit(1)
    verbose = sys.argv[-1] == '-v'
    d2r = sys.argv[1]
    print('{}: {} venues, {} papers\n{}'.format(\
     C.purple('BibSLEIGH'),
     C.red(len(sleigh.venues)),
     C.red(sleigh.numOfPapers()),
     C.purple('='*42)))
    cx = {0: 0, 1: 0, 2: 0}
    for v in sleigh.venues:
        for c in v.getConfs():
            cx[checkreport(c.filename, c)] += 1
            for p in c.papers:
                cx[checkreport(p.filename, p)] += 1
    print('{} files checked, {} ok, {} fixed, {} failed'.format(\
Esempio n. 43
0

def checkreport(fn, o):
    """Check ``o`` with ``checkon`` and report the result for ``fn``.

    Returns the code produced by ``checkon``; 0 is labelled PASS, 1 FAIL and
    2 FIXD.  Unless ``verbose`` is set, only non-zero results are printed.
    """
    labels = (C.blue('PASS'), C.red('FAIL'), C.yellow('FIXD'))
    code = checkon(fn, o)
    worth_showing = verbose or code != 0
    if worth_showing:
        message = '[ {} ] {}'.format(labels[code], fn)
        print(message)
    return code


if __name__ == "__main__":
    if len(sys.argv) > 1:
        verbose = sys.argv[1] == '-v'
    print('{}: {} venues, {} papers\n{}'.format(\
     C.purple('BibSLEIGH'),
     C.red(len(sleigh.venues)),
     C.red(sleigh.numOfPapers()),
     C.purple('='*42)))
    # read the CSV
    f = open('scrap-committees/scraped-by-grammarware.csv',
             'r',
             encoding='utf-8')
    # CBSE;2001;Heinz;Schmidt;;Organising Committee
    for line in f.readlines():
        vs = line.strip().split(';')
        if len(vs) == 0:
            continue
        v = vs[0] + '-' + vs[1]
        n = vs[2] + ' ' + vs[3]
        # normalise!
Esempio n. 44
0
def checkon(fn, o):
    """Work out the address of entry ``o`` from its DBLP title and store it.

    The title is taken from ``procs`` under the entry's ``dblpkey``, split
    into parts on ', ' (after ' - ' and ' (' are turned into ', ') and
    searched for a known country or USA state name; the part right before
    the match is taken as the town.  When an address is found and differs
    from ``o.json['address']``, it is stored there as [town, state, country]
    and the entry is written to the file named by ``o.json['FILE']``.

    ``fn`` is not used by this function.

    Returns 1 when the DBLP key is missing or unknown or no address could be
    determined, 2 when the entry was updated and saved, 0 when the address
    was already up to date.
    """
    if 'dblpkey' not in o.json.keys():
        print('[ {} ] {}'.format(C.red('DONT'),
                                 'DBLP key not found on the entry'))
        return 1
    mykey = o.get('dblpkey')
    # for the rare case of multiple dblpkeys
    # (can happen as a DBLP error or when same proceedings span over multiple volumes)
    if isinstance(mykey, list):
        mykey = mykey[0]
    if mykey not in procs.keys():
        print('[ {} ] {}'.format(C.red('DONT'),
                                 'DBLP key not found in the dump'))
        return 1
    title = procs[mykey]
    if title.endswith('.'):
        title = title[:-1]
    # dashes and opening parentheses count as separators, just like commas
    ws = title.replace(' - ', ', ').replace(' (', ', ').split(', ')
    country = findOneIn(knownCountries, ws)
    state = findOneIn(usaStateNames, ws)
    found = False
    if country:
        # the town is taken to be the part right before the country
        # NOTE(review): when the match is the first part, index -1 wraps
        # around to the last part of the title
        town = ws[ws.index(country) - 1]
        # '?' stands for "no state known"; it becomes '' in the stored address
        state = '?'
        # what if "town" is an USA state? (full)
        if country == 'USA' and town in usaStateNames:
            state = town
            town = ws[ws.index(town) - 1]
        # what if "town" is an USA state? (abbreviated)
        if country == 'USA' and town in usaStateAB:
            state = usaStateNames[usaStateAB.index(town)]
            town = ws[ws.index(town) - 1]
        # what if "town" is a Canadian state? (full)
        if country == 'Canada' and town in canStateNames:
            state = town
            town = ws[ws.index(town) - 1]
        # what if "town" is a Canadian state? (abbreviated)
        if country == 'Canada' and town in canStateAB:
            state = canStateNames[canStateAB.index(town)]
            town = ws[ws.index(town) - 1]
        # the same can happen in the UK
        if country in ('UK', 'United Kingdom') and town in ('Scotland',
                                                            'Scottland'):
            state = town
            town = ws[ws.index(town) - 1]
        # Georgia the country vs Georgia the state
        if country == 'Georgia' and town == 'Atlanta':
            state = country
            country = 'USA'
        # near Something
        if town.startswith('near '):
            town = ws[ws.index(town) - 1]
        # Luxembourg, Luxembourg
        if country == 'Luxembourg':
            town = 'Luxembourg'
        # Saint-Malo / St. Malo
        if country == 'France' and town == 'St. Malo':
            town = 'Saint-Malo'
        # Florence / Firenze
        if country == 'Italy' and town.find('Firenze') > -1:
            town = 'Florence'
        found = True
    elif state:
        # no country in the title, but a USA state name is
        country = 'USA'
        town = ws[ws.index(state) - 1]
        found = True
    else:
        # desperate times
        # (no break: if several keys match, the last one wins)
        for sol in desperateSolutions.keys():
            if sol in ws:
                town, state, country = desperateSolutions[sol]
                found = True
    # normalise
    if country in countryMap.keys():
        country = countryMap[country]
    if country == 'United Kingdom' and state == '?':
        if town.endswith('London') or town in ('Birmingham', 'York',\
        'Coventry', 'Nottingham', 'Lancaster', 'Oxford', 'Manchester',\
        'Southampton', 'Norwich', 'Leicester', 'Canterbury'):
            state = 'England'
        elif town in ('Edinburgh', 'Glasgow'):
            state = 'Scotland'
    # report
    if 'address' in o.json.keys():
        print('[ {} ] {}'.format(C.blue('OLDA'), o.get('address')))
    if 'location' in o.json.keys():
        print('[ {} ] {}'.format(C.blue('OLDL'), o.get('location')))
    if found:
        # print('[ {} ] {}'.format(C.blue('KNOW'), country))
        print('[ {} ] {}'.format(C.blue('AD||'), title))
        print('[ {} ] {:30} || {:30} || {:20}'.format(C.blue('AD->'),
                                                      C.yellow(town),
                                                      C.yellow(state),
                                                      C.yellow(country)))
        # TODO: perhaps later we can act more aggressively
        newaddr = [town, '' if state == '?' else state, country]
        if 'address' not in o.json.keys() or newaddr != o.json['address']:
            o.json['address'] = newaddr
            f = open(o.json['FILE'], 'w', encoding='utf-8')
            f.write(o.getJSON())
            f.close()
            return 2
        # nothing changed
        return 0
    print('[ {} ] {}'.format(C.yellow('AD??'), title))
    return 1
Esempio n. 45
0
def checkon(fn, o):
	"""Work out the address of entry ``o`` from its DBLP title and store it.

	The title is taken from ``procs`` under the entry's ``dblpkey``, split
	into parts on ', ' (after ' - ' and ' (' are turned into ', ') and
	searched for a known country or USA state name; the part right before
	the match is taken as the town.  When an address is found and differs
	from ``o.json['address']``, it is stored there as [town, state, country]
	and the entry is written, as UTF-8, to the file named by
	``o.json['FILE']``.

	``fn`` is not used by this function.

	Returns 1 when the DBLP key is missing or unknown or no address could be
	determined, 2 when the entry was updated and saved, 0 when the address
	was already up to date.
	"""
	if 'dblpkey' not in o.json.keys():
		print('[ {} ] {}'.format(C.red('DONT'), 'DBLP key not found on the entry'))
		return 1
	mykey = o.get('dblpkey')
	# for the rare case of multiple dblpkeys
	# (can happen as a DBLP error or when same proceedings span over multiple volumes)
	if isinstance(mykey, list):
		mykey = mykey[0]
	if mykey not in procs.keys():
		print('[ {} ] {}'.format(C.red('DONT'), 'DBLP key not found in the dump'))
		return 1
	title = procs[mykey]
	if title.endswith('.'):
		title = title[:-1]
	# dashes and opening parentheses count as separators, just like commas
	ws = title.replace(' - ', ', ').replace(' (', ', ').split(', ')
	country = findOneIn(knownCountries, ws)
	state = findOneIn(usaStateNames, ws)
	found = False
	if country:
		# the town is taken to be the part right before the country
		# NOTE(review): when the match is the first part, index -1 wraps
		# around to the last part of the title
		town = ws[ws.index(country)-1]
		# '?' stands for "no state known"; it becomes '' in the stored address
		state = '?'
		# what if "town" is an USA state? (full)
		if country == 'USA' and town in usaStateNames:
			state = town
			town = ws[ws.index(town)-1]
		# what if "town" is an USA state? (abbreviated)
		if country == 'USA' and town in usaStateAB:
			state = usaStateNames[usaStateAB.index(town)]
			town = ws[ws.index(town)-1]
		# what if "town" is a Canadian state? (full)
		if country == 'Canada' and town in canStateNames:
			state = town
			town = ws[ws.index(town)-1]
		# what if "town" is a Canadian state? (abbreviated)
		if country == 'Canada' and town in canStateAB:
			state = canStateNames[canStateAB.index(town)]
			town = ws[ws.index(town)-1]
		# the same can happen in the UK
		if country in ('UK', 'United Kingdom') and town in ('Scotland', 'Scottland'):
			state = town
			town = ws[ws.index(town)-1]
		# Georgia the country vs Georgia the state
		if country == 'Georgia' and town == 'Atlanta':
			state = country
			country = 'USA'
		# near Something
		if town.startswith('near '):
			town = ws[ws.index(town)-1]
		# Luxembourg, Luxembourg
		if country == 'Luxembourg':
			town = 'Luxembourg'
		# Saint-Malo / St. Malo
		if country == 'France' and town == 'St. Malo':
			town = 'Saint-Malo'
		# Florence / Firenze
		if country == 'Italy' and town.find('Firenze') > -1:
			town = 'Florence'
		found = True
	elif state:
		# no country in the title, but a USA state name is
		country = 'USA'
		town = ws[ws.index(state)-1]
		found = True
	else:
		# desperate times
		# (no break: if several keys match, the last one wins)
		for sol in desperateSolutions.keys():
			if sol in ws:
				town, state, country = desperateSolutions[sol]
				found = True
	# normalise
	if country in countryMap.keys():
		country = countryMap[country]
	if country == 'United Kingdom' and state == '?':
		if town.endswith('London') or town in ('Birmingham', 'York',\
		'Coventry', 'Nottingham', 'Lancaster', 'Oxford', 'Manchester',\
		'Southampton', 'Norwich', 'Leicester', 'Canterbury'):
			state = 'England'
		elif town in ('Edinburgh', 'Glasgow'):
			state = 'Scotland'
	# report
	if 'address' in o.json.keys():
		print('[ {} ] {}'.format(C.blue('OLDA'), o.get('address')))
	if 'location' in o.json.keys():
		print('[ {} ] {}'.format(C.blue('OLDL'), o.get('location')))
	if found:
		# print('[ {} ] {}'.format(C.blue('KNOW'), country))
		print('[ {} ] {}'.format(C.blue('AD||'), title))
		print('[ {} ] {:30} || {:30} || {:20}'.format(C.blue('AD->'), C.yellow(town), C.yellow(state), C.yellow(country)))
		# TODO: perhaps later we can act more aggressively
		newaddr = [town, '' if state=='?' else state, country]
		if 'address' not in o.json.keys() or newaddr != o.json['address']:
			o.json['address'] = newaddr
			# Write explicitly as UTF-8: without an encoding the platform
			# default is used, which can fail or garble non-ASCII text.
			# The with-block also closes the file if the write raises.
			with open(o.json['FILE'], 'w', encoding='utf-8') as f:
				f.write(o.getJSON())
			return 2
		# nothing changed
		return 0
	print('[ {} ] {}'.format(C.yellow('AD??'), title))
	return 1
Esempio n. 46
0
    if n in name2file:
        return '<a href="{}">{}</a>'.format(name2file[n], shorten(n))
    else:
        return n


def pad(n):
    """Return ``str(n)`` left-padded with zeros to at least four characters."""
    return str(n).rjust(4, '0')


if __name__ == "__main__":
    print('{}: {} venues, {} papers\n{}'.format(\
     C.purple('BibSLEIGH'),
     C.red(len(sleigh.venues)),
     C.red(sleigh.numOfPapers()),
     C.purple('='*42)))
    ps = []
    # flatten the sleigh
    bykey = {}
    for v in sleigh.venues:
        bykey[v.getKey()] = v
        for c in v.getConfs():
            bykey[c.getKey()] = c
            for p in c.papers:
                bykey[p.getKey()] = p
    print(C.purple('BibSLEIGH flattened to {} entries'.format(len(bykey))))
    # tagged = []
    # for k in ts.keys():
Esempio n. 47
0
import sys, os.path, glob
from fancy.ANSI import C

# Root of the JSON data; conferences are looked up under its 'corpus' subdirectory.
ienputdir = '../json'
# Overwritten in the main block: True when the last command-line argument is '-v'.
verbose = False

def report(fn1, fn2, r):
	"""Print the outcome of renaming ``fn1`` to ``fn2`` and return ``r``.

	``r`` selects the label: 0 is PASS, 1 is FAIL, 2 is RENAME.  A PASS is
	shown only when the module-level ``verbose`` flag is set.
	"""
	labels = (C.blue(' PASS '), C.red(' FAIL '), C.yellow('RENAME'))
	if r == 0 and not verbose:
		# quiet by default: passing renames are not listed
		return r
	print('[ {} ] {} → {}'.format(labels[r], fn1, fn2))
	return r

if __name__ == "__main__":
	print('{} conference renamer\n{}'.format(\
		C.purple('BibSLEIGH'),
		C.purple('='*42)))
	if len(sys.argv) < 2:
		print('Usage:\n\t{} [<DIR>]'.format(sys.argv[0]))
		sys.exit(1)
	verbose = sys.argv[-1] == '-v'
	if sys.argv[1].startswith(ienputdir):
		path = sys.argv[1]
		name = path.replace(ienputdir + '/corpus/', '')
		namem = name.split('/')[-1]
	else:
		name = sys.argv[1]
		path = ienputdir + '/corpus/' + name
		namem = name.split('/')[-1]
	cx = {0: 0, 1: 0, 2: 0}
	if not os.path.exists(path):
Esempio n. 48
0
def checkon(fn, o):
	"""Normalise the names stored under the ``wheretolook`` keys of entry ``o``.

	Known bad variants are replaced (``unfoldName``), contractions are
	expanded and remembered under ``<key>short`` (``short2long``), stale or
	redundant ``*short`` keys are removed, Springer's publisher name is set
	depending on the year, and trailing dots are cut off.  Values that still
	contain a dot outside a few accepted patterns are reported for a manual
	look.

	``fn`` gets a '.json' suffix when it does not exist or is a directory.

	Returns 1 when the file on disk already differed from the serialised
	entry before normalisation (nothing is written, although ``o.json`` has
	been modified in memory by then), 2 when normalisation changed the entry
	and it was written to ``fn``, 0 when nothing changed.
	"""
	if not os.path.exists(fn) or os.path.isdir(fn):
		fn = fn + '.json'
	f = open(fn, 'r', encoding='utf-8')
	# the first and the last line of the file are dropped
	lines = f.readlines()[1:-1]
	f.close()
	# NOTE(review): flines is compared with plines below, but only plines is
	# sorted here — presumably json2lines output is already comparable; verify
	flines = json2lines(lines)
	plines = sorted(json2lines(o.getJSON().split('\n')))
	# bad variants
	for bad in unfoldName:
		for key in wheretolook:
			if o.get(key) == bad:
				o.json[key] = unfoldName[bad]
	# contractions
	for short in short2long:
		for key in wheretolook:
			if o.get(key) == short:
				o.json[key] = short2long[short]
			# the long form is present (possibly just expanded above):
			# remember its contraction next to it
			if o.get(key) == short2long[short]:
				o.json[key+'short'] = short
	# a heuristic contraction for conference names
	if o.get('type') == 'inproceedings' \
	and 'booktitleshort' not in o.json.keys() \
	and 'booktitle' in o.up().json.keys() \
	and len(o.get('booktitle')) > len(o.up().get('booktitle')):
		o.json['booktitleshort'] = o.up().get('booktitle')
	# a heuristic expansion of conference names
	# if o.get('type') == 'proceedings' \
	# and 'booktitleshort' not in o.json.keys() \
	# and 'booktitle' in o.up().json.keys() \
	# and len(o.get('booktitle')) > len(o.up().get('booktitle')):
	# 	o.json['booktitleshort'] = o.up().get('booktitle')
	# remove faulty series: journal wins
	if 'series' in o.json and 'journal' in o.json and o.get('series') == o.get('journal'):
		del o.json['series']
	# *short legacy while no longer version present
	# (the keys are collected into a list first, so deleting while looping is safe)
	for key in [k for k in o.json.keys() if k.endswith('short') and k[:-5] not in o.json.keys()]:
		del o.json[key]
	# Springer name change
	# NOTE(review): relies on o.get('publisher') returning a string even when
	# the entry has no publisher — confirm against o.get
	if o.get('publisher').find('Springer') > -1 and 'year' in o.json.keys():
		if int(o.get('year')) < 2002:
			o.json['publisher'] = 'Springer-Verlag'
			o.json['publishershort'] = 'Springer'
		else:
			o.json['publisher'] = 'Springer International Publishing'
			o.json['publishershort'] = 'Springer'
	for key in wheretolook:
		if key not in o.json:
			continue
		val = o.get(key)
		# ends with a dot
		if val.endswith('.'):
			o.json[key] = o.json[key][:-1]
			# the remaining checks are skipped for this key
			continue
		# suspiciousness
		if val.find('.') > -1:
			problem = True
			# a dot is acceptable when it belongs to one of these patterns
			for ok in ('. Volume', 'CEUR-WS.org', 'icml.cc', 'JMLR.org', 'Vol. ', '. Part', \
				' Inc. ', 'WG2.8'):
				if val.find(ok) > -1:
					problem = False
					break
			if problem:
				report(C.yellow('LOOK'), key + ' of ' + o.getKey() + ' is “' + o.get(key) + '”')
		# superfluousness
		if key+'short' in o.json.keys() and val == o.get(key+'short'):
			del o.json[key+'short']
	nlines = sorted(json2lines(o.getJSON().split('\n')))
	# flines/plines were computed before any change above
	if flines != plines:
		return 1
	elif plines != nlines:
		f = open(fn, 'w', encoding='utf-8')
		f.write(o.getJSON())
		f.close()
		return 2
	else:
		return 0
Esempio n. 49
0
			paperAuths = paperAuths[:-1]
			paperAuths.extend(auths)
		paperLnk = li.get('id')
		hope = li.find_all('a')
		if hope and hope[0].get('href').endswith('.pdf'):
			paperPdf = urlstart + hope[0].get('href')
		else:
			paperPdf = ''
		paperEntry = {'type': 'inproceedings', 'series': 'CEUR Workshop Proceedings',\
			'publisher': 'CEUR-WS.org', 'year': volYear, 'booktitle': volTitles[-1],\
			'editor': volEds, 'volume': volNr.split('-')[-1], 'title': paperTitle,\
			'author': paperAuths, 'pages': paperPages, 'venue': volVenue}
		if paperPdf:
			paperEntry['openpdf'] = paperPdf
		if paperLnk:
			paperEntry['url'] = urlstart + '#' + paperLnk
		paperFilename = outputdir.split('/')[-1] + '-' + paperAuths[0].split(' ')[-1]
		for a in paperAuths[1:]:
			paperFilename += a.split(' ')[-1][0]
		if paperFilename in done:
			paperFilename += 'a'
			while paperFilename in done:
				paperFilename = paperFilename[:-1] + chr(ord(paperFilename[-1])+1)
		# print(jsonify(paperEntry), '-->', outputdir+'/'+paperFilename+'.json')
		f = open(outputdir+'/'+paperFilename+'.json', 'w')
		f.write(jsonify(paperEntry))
		f.close()
		cx += 1
		done.append(paperFilename)
	print(C.red(volVenue), '-', C.yellow(volTitles[-1]), '-', C.blue(cx), 'papers.')
Esempio n. 50
0
		return 2
	else:
		return 0

def checkreport(fn, o):
	"""Run ``checkon(fn, o)``, print its outcome for ``fn`` and return it.

	The result code selects the label: 0 is PASS, 1 is FAIL, 2 is FIXD.
	A PASS is shown only when the module-level ``verbose`` flag is set.
	"""
	labels = (C.blue('PASS'), C.red('FAIL'), C.yellow('FIXD'))
	outcome = checkon(fn, o)
	if outcome == 0 and not verbose:
		# quiet by default: passing files are not listed
		return outcome
	print('[ {} ] {}'.format(labels[outcome], fn))
	return outcome

if __name__ == "__main__":
	verbose = sys.argv[-1] == '-v'
	print('{}: {} venues, {} papers\n{}'.format(\
		C.purple('BibSLEIGH'),
		C.green(len(sleigh.venues)),
		C.green(sleigh.numOfPapers()),
		C.purple('='*42)))
	aka = parseJSON(ienputdir + '/aliases.json')
	CX = sum([len(aka[a]) for a in aka])
	# self-adaptation heuristic:
	#  if a manual rule does the same as the other heuristic, it’s dumb
	for a in sorted(aka.keys()):
		if len(aka[a]) == 1 and aka[a][0] in (nodiaLatin(a), simpleLatin(a)):
			print('[ {} ]'.format(C.blue('DUMB')), simpleLatin(a), 'aliasing was unnecessary manual work')
		elif len(aka[a]) == 2 and (aka[a] == [nodiaLatin(a), simpleLatin(a)] \
							    or aka[a] == [simpleLatin(a), nodiaLatin(a)]):
			print('[ {} ]'.format(C.blue('DUMB')), simpleLatin(a), 'aliasing was a lot of unnecessary manual work')
		elif nodiaLatin(a) in aka[a] or simpleLatin(a) in aka[a]:
			print('[ {} ]'.format(C.blue('DUMB')), simpleLatin(a), 'aliasing contains some unnecessary manual work')
Esempio n. 51
0
def report(one, two):
	"""Print ``two`` preceded by ``one`` shown as a bracketed tag."""
	line = '[ {} ] {}'.format(one, two)
	print(line)

def checkreport(fn, o):
	"""Check ``o`` with ``checkon`` and report the result for ``fn``.

	Returns the code produced by ``checkon``; 0 is labelled PASS, 1 FAIL and
	2 FIXD.  Unless ``verbose`` is set, only non-zero results are reported.
	"""
	labels = (C.blue('PASS'), C.red('FAIL'), C.yellow('FIXD'))
	code = checkon(fn, o)
	if code == 0 and not verbose:
		# quiet by default: passing files are not listed
		return code
	report(labels[code], fn)
	return code

if __name__ == "__main__":
	# verbose mode is requested by passing '-v' as the first argument
	if len(sys.argv) > 1:
		verbose = sys.argv[1] == '-v'
	print('{}: {} venues, {} papers\n{}'.format(\
		C.purple('BibSLEIGH'),
		C.red(len(sleigh.venues)),
		C.red(sleigh.numOfPapers()),
		C.purple('='*42)))
	# tally of results, keyed by the code checkreport returns
	cx = {0: 0, 1: 0, 2: 0}
	# check every conference of every venue, and every paper of each conference
	for v in sleigh.venues:
		for c in v.getConfs():
			cx[checkreport(c.filename, c)] += 1
			for p in c.papers:
				cx[checkreport(p.filename, p)] += 1
	# summary: code 0 counts as ok, 2 as fixed, 1 as failed
	print('{} files checked, {} ok, {} fixed, {} failed'.format(\
		C.bold(cx[0] + cx[1] + cx[2]),
		C.blue(cx[0]),
		C.yellow(cx[2]),
		C.red(cx[1])))
Esempio n. 52
0
		cx[1] += 1
		return dblpLatin(s)+':'
	ws = s.split(' ')
	i = -1
	if ws[i] in ('Jr', 'Jr.'):
		i -= 1
	sur = dblpLatin(' '.join(ws[i:]))
	rest = dblpLatin(' '.join(ws[:i])).replace(' ', '_')
	for c in ".'-":
		rest = rest.replace(c, '=')
	return sur+':'+rest

if __name__ == "__main__":
	verbose = sys.argv[-1] == '-v'
	if not os.path.exists('_renameto.json'):
		print('Run', C.blue('refine-aliases.py'), 'to build the aliasing/renaming relation and cache it.')
		sys.exit(1)
	# aka = parseJSON(ienputdir + '/aliases.json')
	dis = parseJSON(ienputdir + '/disambig.json')
	renameto = parseJSON('_renameto.json')
	# Data from the conferenceMetrics repo
	csv = []
	f = open('../conferenceMetrics/data/SE-conf-roles.csv', 'r')
	for line in f.readlines():
		# Conference;Year;First Name;Last Name;Sex;Role
		csv.append(line.strip().split(';'))
	f.close()
	f = open('scrap-committees/scraped-by-grammarware.csv', 'r')
	for line in f.readlines():
		csv.append(line.strip().split(';'))
	f.close()
Esempio n. 53
0
			pcx += len(plst)
			ptxt = '<dl class="toc">'+'\n'.join([p.getItem() for p in plst])+'</dl>'
		elif isinstance(evals, list) and isinstance(evals[0], str):
			plst = sorted(matchfromsleigh(sleigh, evals), key=sortbypages)
			pcx += len(plst)
			ptxt = '<dl class="toc">'+'\n'.join([p.getItem() for p in plst])+'</dl>'
		elif isinstance(evals, list) and isinstance(evals[0], dict):
			ptxt = processSortedRel(evals)
		else:
			print(C.red('ERROR:'), 'unrecornised bundle structure', evals)
		acc.append('<dl><dt>{}{}</dt><dd>{}</dl>'.format(img, ename, ptxt))
	return '\n'.join(acc)

if __name__ == "__main__":
	print('{}: {} venues, {} papers\n{}'.format(\
		C.purple('BibSLEIGH'),
		C.red(len(sleigh.venues)),
		C.red(sleigh.numOfPapers()),
		C.purple('='*42)))
	bundles = {}
	for b in glob.glob(ienputdir + '/bundles/*.json'):
		purename = b.split('/')[-1][:-5]
		bun = json.load(open(b, 'r'))
		prevcx = pcx
		uberlist = '<h2>{1} papers</h2>{0}'.format(processSortedRel(bun['contents']), pcx-prevcx)
		f = open(outputdir + '/bundle/' + purename + '.html', 'w')
		f.write(bunHTML.format(\
			title=purename+' bundle',
			bundle=bun['name'],
			ebundle=escape(purename),
			dl=uberlist.replace('href="', 'href="../').replace('../mailto', 'mailto')))
Esempio n. 54
0
    statuses = (C.blue('PASS'), C.red('FAIL'), C.yellow('FIXD'))
    if isinstance(o, int):
        r = o
    else:
        r = checkon(fn, o)
    # non-verbose mode by default
    if verbose or r != 0:
        print('[ {} ] {}'.format(statuses[r], fn))
    return r


if __name__ == "__main__":
    # verbose mode is requested by passing '-v' as the last argument
    verbose = sys.argv[-1] == '-v'
    # only the number of people files is used here, for the banner
    peoplez = glob.glob(ienputdir + '/people/*.json')
    print('{}: {} venues, {} papers by {} people\n{}'.format(\
     C.purple('BibSLEIGH'),
     C.red(len(sleigh.venues)),
     C.red(sleigh.numOfPapers()),
     C.red(len(peoplez)),
     C.purple('='*42)))
    # tally of results, keyed by the code checkreport returns
    cx = {0: 0, 1: 0, 2: 0}
    # check every paper of every conference of every venue
    for v in sleigh.venues:
        for c in v.getConfs():
            for p in c.papers:
                cx[checkreport(p.filename, p)] += 1
    # summary: code 0 counts as ok, 2 as fixed, 1 as failed
    print('{} files checked, {} ok, {} fixed, {} failed'.format(\
     C.bold(cx[0] + cx[1] + cx[2]),
     C.blue(cx[0]),
     C.yellow(cx[2]),
     C.red(cx[1])))
    # 'warnings' is a collection filled elsewhere in the file
    print(C.red('{} files to check manually!'.format(len(warnings))))
Esempio n. 55
0
        return 0


def checkreport(fn, o):
    """Check ``o`` with ``checkon`` and report the result for ``fn``.

    Returns the code produced by ``checkon``; 0 is labelled PASS, 1 FAIL and
    2 FIXD.  Unless ``verbose`` is set, only non-zero results are printed.
    """
    labels = (C.blue('PASS'), C.red('FAIL'), C.yellow('FIXD'))
    code = checkon(fn, o)
    silent_pass = code == 0 and not verbose
    if not silent_pass:
        print('[ {} ] {}'.format(labels[code], fn))
    return code


if __name__ == "__main__":
    verbose = sys.argv[-1] == '-v'
    print('{}: {} venues, {} papers\n{}'.format(\
     C.purple('BibSLEIGH'),
     C.red(len(sleigh.venues)),
     C.red(sleigh.numOfPapers()),
     C.purple('='*42)))
    cx = {0: 0, 1: 0, 2: 0}
    for v in sleigh.venues:
        # tags per venue
        for c in v.getConfs():
            cx[checkreport(c.filename, c)] += 1
        for b in v.brands:
            cx[checkreport(b.filename, b)] += 1
        cx[checkreport(v.filename, v)] += 1
    print('{} files checked, {} ok, {} fixed, {} failed'.format(\
     C.bold(cx[0] + cx[1] + cx[2]),
     C.blue(cx[0]),
     C.yellow(cx[2]),
Esempio n. 56
0
def linkto(n):
	"""Return an HTML link for ``n`` when ``name2file`` has an entry for it.

	The link points to ``name2file[n]`` and shows ``shorten(n)``; a name
	without an entry is returned unchanged.
	"""
	if n not in name2file:
		return n
	target = name2file[n]
	return '<a href="{}">{}</a>'.format(target, shorten(n))

def pad(n):
	"""Return ``str(n)`` left-padded with zeros to at least four characters."""
	text = str(n)
	# a non-positive repeat count yields an empty prefix
	return '0' * (4 - len(text)) + text

if __name__ == "__main__":
	print('{}: {} venues, {} papers\n{}'.format(\
		C.purple('BibSLEIGH'),
		C.red(len(sleigh.venues)),
		C.red(sleigh.numOfPapers()),
		C.purple('='*42)))
	ps = []
	# flatten the sleigh
	bykey = {}
	for v in sleigh.venues:
		bykey[v.getKey()] = v
		for c in v.getConfs():
			bykey[c.getKey()] = c
			for p in c.papers:
				bykey[p.getKey()] = p
	print(C.purple('BibSLEIGH flattened to {} entries'.format(len(bykey))))
	# tagged = []
	# for k in ts.keys():
Esempio n. 57
0
        print('[ {} ] {}'.format(statuses[r], fn))
    return r


def two(n):
    """Format *n* as text, prefixing a single '0' when *n* is below 10."""
    prefix = '0' if n < 10 else ''
    return prefix + '{}'.format(n)


if __name__ == "__main__":
    # Script entry point: run checkreport on every paper and brand, then
    # build the ordered list of approved stems.
    # Verbose output is enabled only when the last CLI argument is '-v'.
    verbose = sys.argv[-1] == '-v'
    # one JSON file per person under <ienputdir>/people
    peoplez = glob.glob(ienputdir + '/people/*.json')
    print('{}: {} venues, {} papers by {} people\n{}'.format(\
     C.purple('BibSLEIGH'),
     C.red(len(sleigh.venues)),
     C.red(sleigh.numOfPapers()),
     C.red(len(peoplez)),
     C.purple('='*42)))
    # counters keyed by the status code returned from checkreport
    cx = {0: 0, 1: 0, 2: 0}
    # stem ALL the papers!
    # checkreport takes three arguments here: papers go in the second slot,
    # brands in the third, and the unused slot is None
    for v in sleigh.venues:
        for c in v.getConfs():
            for p in c.papers:
                cx[checkreport(p.filename, p, None)] += 1
        for b in v.getBrands():
            cx[checkreport(b.filename, None, b)] += 1
    # write all stems
    # keep only stems accepted by ifApproved; the sort key is the padded
    # length followed by the stem, i.e. shorter stems first, then by text
    listOfStems = sorted(filter(ifApproved, ALLSTEMS),
                         key=lambda w: two(len(w)) + w)
Esempio n. 58
0
def report(fn1, fn2, r):
    """Print a coloured ``fn1 → fn2`` status line for code *r* and return *r*.

    *r* indexes the PASS / FAIL / RENAME labels. The line is printed when the
    module-level ``verbose`` flag is set or when *r* is non-zero.
    """
    labels = (C.blue(' PASS '), C.red(' FAIL '), C.yellow('RENAME'))
    # stay silent only for a clean result in non-verbose mode
    silent = not verbose and r == 0
    if not silent:
        print('[ {} ] {} → {}'.format(labels[r], fn1, fn2))
    return r
Esempio n. 59
0

def sdistance(x1, x2):
    """Return distance(x1, x2) as text with every '.' replaced by ','."""
    plain = str(distance(x1, x2))
    return plain.replace('.', ',')


def distance(x1, x2):
    """Return the Euclidean distance between *x1* and *x2*.

    Coordinates are read by index over the length of *x1*, so *x2* must be
    at least as long; any extra trailing items of *x2* are ignored.
    """
    squared = ((x1[pos] - x2[pos]) ** 2 for pos in range(len(x1)))
    return sqrt(sum(squared))


# NB: some clustering/visualisation code based on http://brandonrose.org/clustering
if __name__ == "__main__":
    # Script entry point: print the banner and collect the vocabularies
    # stored on the brands.
    # Verbose output is enabled only when the last CLI argument is '-v'.
    verbose = sys.argv[-1] == '-v'
    # one JSON file per person under <ienputdir>/people
    peoplez = glob.glob(ienputdir + '/people/*.json')
    print('{}: {} venues, {} papers by {} people\n{}'.format(\
     C.purple('BibSLEIGH'),
     C.red(len(sleigh.venues)),
     C.red(sleigh.numOfPapers()),
     C.red(len(peoplez)),
     C.purple('='*42)))
    cx = {0: 0, 1: 0, 2: 0}
    # we need to know all the words we have
    UberDict = set()
    UberCols = set()
    # brand key -> vocabulary, only for brands whose JSON has a 'vocabulary'
    # entry with more than 10 items
    vocs = {b.getKey():b.json['vocabulary'] \
     for v in sleigh.venues \
     for b in v.getBrands() \
     if 'vocabulary' in b.json \
     if len(b.json['vocabulary']) > 10}
    # union of the keys of all collected vocabularies
    for vkey in vocs:
        UberDict.update(vocs[vkey].keys())
Esempio n. 60
0
def checkon(fn, o):
	"""Normalise the metadata in ``o.json`` in place and rewrite the file on change.

	String values get typographic clean-up (em dashes in titles, hyphenated
	page ranges, collapsed double spaces, numeric strings turned into ints,
	curly quotes and apostrophes); list values get trivial lists inlined and
	author/editor names cleaned. Values that still contain a backtick or a
	straight quote are reported with a LOOK line and the entry's filename is
	appended to the module-level ``lookat`` list.

	Parameters:
		fn: path of the entry's file; '.json' is appended when the path does
			not exist or is a directory.
		o: entry object providing ``json`` (a dict), ``getJSON()``,
			``getKey()`` and ``filename``.

	Returns:
		2 if the serialised entry changed and was written to ``fn``,
		0 if nothing changed.
	"""
	if not os.path.exists(fn) or os.path.isdir(fn):
		fn = fn + '.json'
	# snapshot of the serialised entry before any normalisation
	plines = sorted([strictstrip(s) for s in o.getJSON().split('\n')[1:-1]])
	# an entry without a type is reported once; previously the diagnostic was
	# immediately followed by a KeyError on o.json['type']
	if 'type' not in o.json:
		print('TERRIBLE',o.getKey())
	for k in o.json.keys():
		otype = o.json.get('type')
		if (otype == 'proceedings' and k == 'title') or\
		   (otype == 'inproceedings' and k == 'booktitle'):
			# fix numbers
			for nr in nrs.keys():
				if o.json[k].find(' '+nr+' ') > -1:
					o.json[k] = o.json[k].replace(' '+nr+' ', ' '+nrs[nr]+' ')
		if isinstance(o.json[k], str):
			# add emdashes for fancier titles
			if k in ('title', 'booktitle'):
				o.json[k] = o.json[k].replace(' - ', ' — ').replace(' -- ', ' — ')
				# Nice heuristic to run from time to time, but reports too much
				# on stuff like “eXtreme” and “jPET”
				# if o.json[k][0].islower():
				# 	print('[ {} ] {}: {} {}'.format(C.red('LOOK'), o.getKey(), 'title is', o.get('title')))
			# normalised pages
			if k == 'pages':
				o.json[k] = o.json[k].replace('–', '-').replace('--', '-').replace('−', '-')
			# double spaces
			if o.json[k].find('  ') > -1:
				o.json[k] = o.json[k].replace('  ', ' ').strip()
			# find numeric values, turn them into proper integers
			if o.json[k].isdigit():
				o.json[k] = int(o.json[k])
				# the value is no longer a string: skip the remaining string checks
				continue
			# NB: the branches below form one elif chain, so at most one of the
			# quote/apostrophe rewrites is applied per value per run
			# remove confix curlies
			elif o.json[k].startswith('{') and o.json[k].endswith('}'):
				o.json[k] = o.json[k][1:-1]
			# single quotes to double quotes
			elif o.json[k].find(" '") > -1 and o.json[k].find("' ") > -1:
				o.json[k] = o.json[k].replace(" '", ' "').replace("' ", '" ')
			elif o.json[k].find(" '") > -1 and o.json[k].endswith("'"):
				o.json[k] = o.json[k].replace(" '", ' "').replace("'", '"')
			elif o.json[k].find("' ") > -1 and o.json[k].startswith("'"):
				o.json[k] = o.json[k].replace("' ", '" ').replace("'", '"')
			# fancify bland quotes
			elif o.json[k].find(' "') > -1 and o.json[k].find('" ') > -1:
				o.json[k] = o.json[k].replace(' "', ' “').replace('" ', '” ')
			elif o.json[k].find(' "') > -1 and o.json[k].endswith('"'):
				o.json[k] = o.json[k].replace(' "', ' “').replace('"', '”')
			elif o.json[k].find('" ') > -1 and o.json[k].startswith('"'):
				o.json[k] = o.json[k].replace('" ', '” ').replace('"', '“')
			# fancify LaTeX quotes
			elif o.json[k].find(' ``') > -1 and o.json[k].find("'' ") > -1:
				o.json[k] = o.json[k].replace("'' ", '” ').replace(' ``', ' “')
			elif o.json[k].find(' ``') > -1 and o.json[k].endswith("''"):
				o.json[k] = o.json[k].replace("''", '”').replace(' ``', ' “')
			elif o.json[k].find("'' ") > -1 and o.json[k].startswith('``'):
				o.json[k] = o.json[k].replace("'' ", '” ').replace('``', '“')
			elif o.json[k].startswith('``') and o.json[k].endswith("''"):
				o.json[k] = '“' + o.json[k][2:-2] + '”'
			# plural possessive
			elif o.json[k].find("'s") > -1:
				o.json[k] = o.json[k].replace("'s", '’s')
			elif o.json[k].find("s' ") > -1:
				o.json[k] = o.json[k].replace("s'", 's’')
			# contractions
			elif o.json[k].find("n't") > -1:
				o.json[k] = o.json[k].replace("n't", 'n’t')
			# the case of "Jr" vs "Jr."
			if k in ('author', 'editor') and o.json[k].endswith('Jr'):
				o.json[k] += '.'
			# TODO: report remaining suspicious activity
			for c in '`"\'': # ’ is ok
				if c in o.json[k] and k not in ('author', 'editor'):
					print('[ {} ] {}: {} is “{}”'.format(C.red('LOOK'), o.getKey(), k, o.json[k]))
					lookat.append(o.filename)
		elif isinstance(o.json[k], list):
			# inline trivial lists
			if len(o.json[k]) == 1:
				o.json[k] = o.json[k][0]
			# inline hidden trivial lists
			# (only while the value is still a list: a scalar inlined just above
			# must not be indexed, or a string such as "22" would shrink to "2")
			if isinstance(o.json[k], list) and len(o.json[k]) == 2 \
			and o.json[k][0] == o.json[k][1] \
			and k not in ('stemmed', 'tag', 'tagged'):
				o.json[k] = o.json[k][0]
			# unless it’s 'tagged'
			if k == 'tagged' and not isinstance(o.json[k][0], list):
				o.json[k] = [o.json[k]]
			# remove DBLP disambiguation: we might later regret it
			# but the information can be always re-retrieved
			if k in ('author', 'editor'):
				# a list inlined above is now a single name; wrap it so that the
				# loop walks over names and not over the characters of one name
				single = not isinstance(o.json[k], list)
				nas = []
				for a in ([o.json[k]] if single else o.json[k]):
					# double spaces
					if a.find('  ') > -1:
						a = a.replace('  ', ' ').strip()
					ws = a.split(' ')
					# a trailing all-digit word is the DBLP disambiguation number
					if ws[-1].isdigit():
						ws = ws[:-1]
					nas.append(' '.join(ws))
				# the case of "Jr" vs "Jr."
				nas = [a+'.' if a.endswith(' Jr') else a for a in nas]
				o.json[k] = nas[0] if single else nas
	# compare the serialisation after normalisation with the snapshot
	nlines = sorted([strictstrip(s) for s in o.getJSON().split('\n')[1:-1]])
	if plines != nlines:
		# the context manager closes the file even if the write fails
		with open(fn, 'w') as f:
			f.write(o.getJSON())
		return 2
	return 0