def checkon(fn, o):
    """Harvest a proceedings title into the global (k2r, v2o) replacement pair,
    then push it into descendant entries that carry the k2r key.

    Returns 0 when nothing was written, 2 when the entry file was rewritten.
    """
    global k2r, v2o
    # entries live one-per-file; a directory stands for a volume
    if not os.path.exists(fn) or os.path.isdir(fn):
        fn += '.json'
    if not o.filename.startswith(d2r):
        return 0
    # if we find a proceedings volume, take its title to all descendant booktitles
    if o.json['type'] == 'proceedings':
        k2r = 'booktitle'
        v2o = o.json['title']
        return 0
    # just in case
    if o.json['type'] not in ('inproceedings', 'article'):
        return 0
    before = sorted(strictstrip(line) for line in o.getJSON().split('\n')[1:-1])
    if k2r not in o.json.keys():
        return 0
    o.json[k2r] = v2o
    after = sorted(strictstrip(line) for line in o.getJSON().split('\n')[1:-1])
    if before == after:
        return 0
    with open(fn, 'w', encoding='utf-8') as out:
        out.write(o.getJSON())
    return 2
def checkon(fn, o):
    """Overwrite the 'roles' list of a proceedings/book entry from the scraped
    `roles` table and rewrite the file if that changed anything.

    Returns 0 when the entry is skipped or unchanged, 2 when rewritten.
    """
    if not os.path.exists(fn) or os.path.isdir(fn):
        fn = fn + '.json'
    if o.get('type') not in ('proceedings', 'book'):
        # we don't go per paper
        return 0
    if o.getKey() not in roles.keys():
        # you know nothing, scraped CSV
        return 0
    plines = sorted([strictstrip(s) for s in o.getJSON().split('\n')[1:-1]])
    # if 'roles' not in o.json.keys():
    if True:
        # no prior knowledge of roles => we know everything
        o.json['roles'] = sorted(roles[o.getKey()], key=lambda x: x[1])
    else:
        # prior knowledge of roles => treat per case
        o.json['roles'] = sorted(
            o.json['roles'] + [r for r in roles[o.getKey()] if r not in o.json['roles']],
            key=lambda x: x[1])
    # BUG FIX: nlines used to be sorted with key=lambda x: x[1] (by the second
    # character of each line) while plines is sorted naturally, so the two
    # lists almost never compared equal and the file was rewritten on every
    # run. Sort both sides identically.
    nlines = sorted([strictstrip(s) for s in o.getJSON().split('\n')[1:-1]])
    # The next case should not happen, but could if we have trivial lists
    # if flines != plines:
    #     return 1
    if plines != nlines:
        # explicit UTF-8 for consistency with the sibling writers in this file
        f = open(fn, 'w', encoding='utf-8')
        f.write(o.getJSON())
        f.close()
        return 2
    else:
        return 0
def checkon(fn, o):
    """Replace value v2i of key k2r with v2o in entries located under d2r.

    Returns 0 when skipped or unchanged, 1 when the key is absent although a
    concrete old value v2i was expected, 2 when the file was rewritten.
    """
    if not os.path.exists(fn) or os.path.isdir(fn):
        fn = fn + '.json'
    # only proper descendants of d2r are in scope (when a scope is given)
    if d2r and (not o.filename.startswith(d2r) or o.filename == d2r):
        return 0
    plines = sorted([strictstrip(s) for s in o.getJSON().split('\n')[1:-1]])
    if k2r in o.json.keys():
        if o.json[k2r] == v2i:
            o.json[k2r] = v2o
        else:
            return 0
    else:
        if not v2i:
            # no old value expected: just set the new one
            o.json[k2r] = v2o
        else:
            return 1
    nlines = sorted([strictstrip(s) for s in o.getJSON().split('\n')[1:-1]])
    if plines != nlines:
        # BUG FIX: write as UTF-8 explicitly (the identical sibling variant
        # does); the locale default encoding can break on em-dashes etc.
        f = open(fn, 'w', encoding='utf-8')
        f.write(o.getJSON())
        f.close()
        return 2
    else:
        return 0
def checkon(fn, o):
    """Swap the value v2i of key k2r for v2o in entries that live under d2r.

    Returns 0 (skipped/unchanged), 1 (key missing but an old value was
    expected) or 2 (entry file rewritten).
    """
    if not os.path.exists(fn) or os.path.isdir(fn):
        fn += '.json'
    # restrict the sweep to proper descendants of d2r, when a scope is set
    if d2r and (o.filename == d2r or not o.filename.startswith(d2r)):
        return 0
    before = sorted(strictstrip(line) for line in o.getJSON().split('\n')[1:-1])
    if k2r in o.json.keys():
        if o.json[k2r] != v2i:
            return 0
        o.json[k2r] = v2o
    elif v2i:
        # a concrete old value was expected but the key is absent
        return 1
    else:
        o.json[k2r] = v2o
    after = sorted(strictstrip(line) for line in o.getJSON().split('\n')[1:-1])
    if before == after:
        return 0
    with open(fn, 'w', encoding='utf-8') as out:
        out.write(o.getJSON())
    return 2
def checkon(fn, o):
    """Rewrite one entry file in canonical form: tab-indented, sorted lines,
    comma after every line except the last.

    Returns 1 when disk and model disagree on content, 2 when only the
    formatting was fixed (file rewritten), 0 when nothing needed doing.
    """
    if not os.path.exists(fn) or os.path.isdir(fn):
        fn += '.json'
    with open(fn, 'r', encoding='utf-8') as f:
        flines = f.readlines()[1:-1]
    sflines = [strictstrip(line) for line in flines]
    sjlines = sorted(strictstrip(line) for line in o.getJSON().split('\n')[1:-1])
    jlines = ['\t{},\n'.format(line) for line in sjlines]
    # remove the last comma
    jlines[-1] = jlines[-1][:-2] + '\n'
    if sflines != sjlines:
        # the content differs, not merely the layout — report, do not touch
        return 1
    if jlines == flines:
        return 0
    with open(fn, 'w', encoding='utf-8') as f:
        f.write('{\n')
        f.write(''.join(jlines))
        f.write('}')
    return 2
def checkon(fn, o):
    """Verify one proceedings file against its model; create a minimal stub
    when missing and delete files with obviously bogus years.

    Returns 0 on match, 1 on mismatch or removal, 2 when a stub was created.
    """
    if not os.path.exists(fn) or os.path.isdir(fn):
        fn += '.json'
    if not os.path.exists(fn):
        # if it still does not exist, let us create a minimal one
        stub = '{{\n\t"title": "{name}",\n\t"type": "proceedings",\n\t"year": {year}\n}}'.format(
            name=lastSlash(fn)[:-5].replace('-', ' '),
            year=findYear(lastSlash(fn)))
        with open(fn, 'w', encoding='utf-8') as f:
            f.write(stub)
        print('[ {} ] {}'.format(C.yellow('MADE'), fn))
        return 2
    with open(fn, 'r', encoding='utf-8') as f:
        lines = f.readlines()[1:-1]
    for line in lines:
        # a year beyond 3000 can only be scraping garbage — drop the file
        if '"year"' in line and findYear(line) > 3000:
            os.remove(fn)
            print('[ {} ] {}'.format(C.red('KILL'), fn))
            return 1
    flines = sorted(strictstrip(line) for line in lines)
    plines = sorted(strictstrip(line) for line in o.getJSON().split('\n')[1:-1])
    if flines == plines:
        return 0
    only_on_disk = [line for line in flines if line not in plines]
    only_in_model = [line for line in plines if line not in flines]
    print('∆:', only_on_disk, '\nvs', only_in_model)
    return 1
def checkon(fn, o):
    """Rewrite one entry file in canonical form (tab-indented, sorted lines,
    no trailing comma on the last line).

    Returns 1 when contents disagree, 2 when only formatting was fixed,
    0 when the file is already canonical.
    """
    if not os.path.exists(fn) or os.path.isdir(fn):
        fn = fn + '.json'
    # BUG FIX: read/write explicitly as UTF-8 (the identical sibling variant
    # does); the locale default encoding can break on fancy quotes/dashes
    f = open(fn, 'r', encoding='utf-8')
    flines = f.readlines()[1:-1]
    f.close()
    sflines = [strictstrip(s) for s in flines]
    sjlines = sorted([strictstrip(s) for s in o.getJSON().split('\n')[1:-1]])
    jlines = ['\t{},\n'.format(s) for s in sjlines]
    jlines[-1] = jlines[-1][:-2] + '\n'  # remove the last comma
    if sflines != sjlines:
        # content differs, not merely the layout — report, do not touch
        return 1
    elif jlines != flines:
        # f1 = [s for s in jlines if s not in flines]
        # f2 = [s for s in flines if s not in jlines]
        # print('∆:', f1, '\nvs', f2)
        f = open(fn, 'w', encoding='utf-8')
        f.write('{\n')
        for line in jlines:
            f.write(line)
        f.write('}')
        f.close()
        return 2
    else:
        return 0
def checkon(fn, o):
    """Verify one proceedings file; create a minimal stub when missing and
    delete files carrying obviously bogus years.

    Returns 0 on match, 1 on mismatch or removal, 2 when a stub was created.
    """
    if not os.path.exists(fn) or os.path.isdir(fn):
        fn = fn + '.json'
    if not os.path.exists(fn):
        # if it still does not exist, let us create a minimal one
        # BUG FIX: open with explicit UTF-8 (as the sibling variant does);
        # titles may contain non-ASCII characters
        f = open(fn, 'w', encoding='utf-8')
        f.write('{{\n\t"title": "{name}",\n\t"type": "proceedings",\n\t"year": {year}\n}}'.format(
            name=fn.split('/')[-1][:-5].replace('-', ' '),
            year=findYear(fn.split('/')[-1])))
        f.close()
        print('[ {} ] {}'.format(C.yellow('MADE'), fn))
        return 2
    f = open(fn, 'r', encoding='utf-8')
    lines = f.readlines()[1:-1]
    f.close()
    for line in lines:
        # a year above 3000 can only be scraping garbage — drop the file
        if line.find('"year"') > -1 and findYear(line) > 3000:
            os.remove(fn)
            print('[ {} ] {}'.format(C.red('KILL'), fn))
            return 1
    flines = sorted([strictstrip(s) for s in lines])
    plines = sorted([strictstrip(s) for s in o.getJSON().split('\n')[1:-1]])
    if flines != plines:
        f1 = [line for line in flines if line not in plines]
        f2 = [line for line in plines if line not in flines]
        print('∆:', f1, '\nvs', f2)
    if flines == plines:
        return 0
    else:
        return 1
def checkon(fn, o):
    """Refresh the 'roles' list of a proceedings/book entry from the scraped
    `roles` table and rewrite the file if that changed anything.

    Returns 0 when the entry is skipped or unchanged, 2 when rewritten.
    """
    if not os.path.exists(fn) or os.path.isdir(fn):
        fn = fn + '.json'
    if o.get('type') not in ('proceedings', 'book'):
        # we don't go per paper
        return 0
    if o.getKey() not in roles.keys():
        # you know nothing, scraped CSV
        return 0
    plines = sorted([strictstrip(s) for s in o.getJSON().split('\n')[1:-1]])
    # if 'roles' not in o.json.keys():
    if True:
        # no prior knowledge of roles => we know everything
        o.json['roles'] = sorted(roles[o.getKey()], key=lambda x: x[1])
    else:
        # prior knowledge of roles => treat per case
        o.json['roles'] = sorted(
            o.json['roles'] + [r for r in roles[o.getKey()] if r not in o.json['roles']],
            key=lambda x: x[1])
    # BUG FIX: this list was previously sorted with key=lambda x: x[1]
    # (second character of each line) while plines uses the natural order,
    # so the comparison below almost always failed and the file was
    # rewritten on every run; sort both sides identically instead.
    nlines = sorted([strictstrip(s) for s in o.getJSON().split('\n')[1:-1]])
    # The next case should not happen, but could if we have trivial lists
    # if flines != plines:
    #     return 1
    if plines != nlines:
        f = open(fn, 'w', encoding='utf-8')
        f.write(o.getJSON())
        f.close()
        return 2
    else:
        return 0
def json2lines(j):
    """Flatten pretty-printed JSON lines into stripped key-value strings.

    Braces and blank lines are discarded; a continuation line (one that does
    not open with a double quote) is glued onto the preceding entry.
    """
    res = []
    for raw in j:
        stripped = strictstrip(raw)
        if stripped in ('', '{', '}'):
            # structural noise, skip it
            continue
        if stripped.startswith('"'):
            res.append(stripped)
        else:
            # a wrapped value: merge it into the entry it belongs to
            res[-1] += stripped
    return res
def checkon(fn, o):
    """Apply the global `renameto` mapping to the author/editor names of one
    entry and rewrite its file when anything changed.

    Returns 0 when unchanged, 2 when the entry file was rewritten.
    """
    if not os.path.exists(fn) or os.path.isdir(fn):
        fn += '.json'
    before = sorted(strictstrip(line) for line in o.getJSON().split('\n')[1:-1])
    for field in ('author', 'editor'):
        if field not in o.json.keys():
            continue
        names = o.json[field]
        if isinstance(names, str):
            # a single person stored as a bare string
            if names in renameto.keys():
                o.json[field] = renameto[names]
        else:
            # a list of names: rename each element in place
            for i, name in enumerate(names):
                if name in renameto.keys():
                    names[i] = renameto[name]
    after = sorted(strictstrip(line) for line in o.getJSON().split('\n')[1:-1])
    if before == after:
        return 0
    with open(fn, 'w', encoding='utf-8') as out:
        out.write(o.getJSON())
    return 2
def checkon(fn, o):
    """Apply the global `renameto` mapping to author/editor names of one entry
    and rewrite its file when anything changed.

    Returns 0 when unchanged, 2 when the entry file was rewritten.
    """
    if not os.path.exists(fn) or os.path.isdir(fn):
        fn = fn + '.json'
    plines = sorted([strictstrip(s) for s in o.getJSON().split('\n')[1:-1]])
    for ae in ('author', 'editor'):
        if ae in o.json.keys():
            if isinstance(o.json[ae], str):
                # a single name stored as a plain string
                if o.json[ae] in renameto.keys():
                    o.json[ae] = renameto[o.json[ae]]
            else:
                # a list of names: rename each element in place
                for i, x in enumerate(o.json[ae]):
                    if x in renameto.keys():
                        o.json[ae][i] = renameto[x]
    nlines = sorted([strictstrip(s) for s in o.getJSON().split('\n')[1:-1]])
    if plines != nlines:
        # BUG FIX: write explicitly as UTF-8 (the identical sibling variant
        # does); names routinely contain non-ASCII characters
        ff = open(fn, 'w', encoding='utf-8')
        ff.write(o.getJSON())
        ff.close()
        return 2
    else:
        return 0
def checkon(fn, o):
    """Normalise one bibliographic entry in place and rewrite its file if changed.

    Fixes numeric words in (book)titles, dashes, quote styles, possessives,
    contractions, page ranges, trivial lists and DBLP disambiguation suffixes.
    Returns 2 when the file was rewritten, 0 otherwise.
    """
    if not os.path.exists(fn) or os.path.isdir(fn):
        fn = fn + '.json'
    plines = sorted([strictstrip(s) for s in o.getJSON().split('\n')[1:-1]])
    for k in o.json.keys():
        if 'type' not in o.json.keys():
            print('TERRIBLE', o.getKey())
        if (o.json['type'] == 'proceedings' and k == 'title') or\
           (o.json['type'] == 'inproceedings' and k == 'booktitle'):
            # fix numbers
            for nr in nrs.keys():
                if o.json[k].find(' ' + nr + ' ') > -1:
                    o.json[k] = o.json[k].replace(' ' + nr + ' ', ' ' + nrs[nr] + ' ')
        if isinstance(o.json[k], str):
            # add emdashes for fancier titles
            if k in ('title', 'booktitle'):
                o.json[k] = o.json[k].replace(' - ', ' — ').replace(' -- ', ' — ')
            # Nice heuristic to run from time to time, but reports too much
            # on stuff like “eXtreme” and “jPET”
            # if o.json[k][0].islower():
            #     print('[ {} ] {}: {} {}'.format(C.red('LOOK'), o.getKey(), 'title is', o.get('title')))
            # normalised pages
            if k == 'pages':
                o.json[k] = o.json[k].replace('–', '-').replace('--', '-').replace('−', '-')
            # double spaces
            # BUG FIX: the test and the replacement used a single space (a
            # no-op) although the comment promises collapsing double spaces
            if o.json[k].find('  ') > -1:
                o.json[k] = o.json[k].replace('  ', ' ').strip()
            # find numeric values, turn them into proper integers
            if o.json[k].isdigit():
                o.json[k] = int(o.json[k])
                continue
            # remove confix curlies
            elif o.json[k].startswith('{') and o.json[k].endswith('}'):
                o.json[k] = o.json[k][1:-1]
            # single quotes to double quotes
            elif o.json[k].find(" '") > -1 and o.json[k].find("' ") > -1:
                o.json[k] = o.json[k].replace(" '", ' "').replace("' ", '" ')
            elif o.json[k].find(" '") > -1 and o.json[k].endswith("'"):
                o.json[k] = o.json[k].replace(" '", ' "').replace("'", '"')
            elif o.json[k].find("' ") > -1 and o.json[k].startswith("'"):
                o.json[k] = o.json[k].replace("' ", '" ').replace("'", '"')
            # fancify bland quotes
            elif o.json[k].find(' "') > -1 and o.json[k].find('" ') > -1:
                o.json[k] = o.json[k].replace(' "', ' “').replace('" ', '” ')
            elif o.json[k].find(' "') > -1 and o.json[k].endswith('"'):
                o.json[k] = o.json[k].replace(' "', ' “').replace('"', '”')
            elif o.json[k].find('" ') > -1 and o.json[k].startswith('"'):
                o.json[k] = o.json[k].replace('" ', '” ').replace('"', '“')
            # fancify LaTeX quotes
            elif o.json[k].find(' ``') > -1 and o.json[k].find("'' ") > -1:
                o.json[k] = o.json[k].replace("'' ", '” ').replace(' ``', ' “')
            elif o.json[k].find(' ``') > -1 and o.json[k].endswith("''"):
                o.json[k] = o.json[k].replace("''", '”').replace(' ``', ' “')
            elif o.json[k].find("'' ") > -1 and o.json[k].startswith('``'):
                o.json[k] = o.json[k].replace("'' ", '” ').replace('``', '“')
            elif o.json[k].startswith('``') and o.json[k].endswith("''"):
                o.json[k] = '“' + o.json[k][2:-2] + '”'
            # plural possessive
            elif o.json[k].find("'s") > -1:
                o.json[k] = o.json[k].replace("'s", '’s')
            elif o.json[k].find("s' ") > -1:
                o.json[k] = o.json[k].replace("s'", 's’')
            # contractions
            elif o.json[k].find("n't") > -1:
                o.json[k] = o.json[k].replace("n't", 'n’t')
            # the case of "Jr" vs "Jr."
            if k in ('author', 'editor') and o.json[k].endswith('Jr'):
                o.json[k] += '.'
            # TODO: report remaining suspicious activity
            for c in '`"\'':
                # ’ is ok
                if c in o.json[k] and k not in ('author', 'editor'):
                    print('[ {} ] {}: {} is “{}”'.format(
                        C.red('LOOK'), o.getKey(), k, o.json[k]))
                    lookat.append(o.filename)
        elif isinstance(o.json[k], list):
            # inline trivial lists
            if len(o.json[k]) == 1:
                o.json[k] = o.json[k][0]
            # inline hidden trivial lists
            if len(o.json[k]) == 2 and o.json[k][0] == o.json[k][1] \
               and k not in ('stemmed', 'tag', 'tagged'):
                o.json[k] = o.json[k][0]
            # unless it’s 'tagged'
            if k == 'tagged' and not isinstance(o.json[k][0], list):
                o.json[k] = [o.json[k]]
            # remove DBLP disambiguation: we might later regret it
            # but the information can be always re-retrieved
            if k in ('author', 'editor'):
                nas = []
                for a in o.json[k]:
                    # double spaces (same BUG FIX as above: collapse two
                    # spaces into one, not a no-op single-space replace)
                    if a.find('  ') > -1:
                        a = a.replace('  ', ' ').strip()
                    ws = a.split(' ')
                    if ws[-1].isdigit():
                        ws = ws[:-1]
                    nas.append(' '.join(ws))
                o.json[k] = nas
                # the case of "Jr" vs "Jr."
                o.json[k] = [a + '.' if a.endswith(' Jr') else a for a in o.json[k]]
    nlines = sorted([strictstrip(s) for s in o.getJSON().split('\n')[1:-1]])
    if plines != nlines:
        f = open(fn, 'w', encoding='utf-8')
        f.write(o.getJSON())
        f.close()
        return 2
    else:
        return 0
def checkon(fn, o):
    # Tag one entry: match its title against every tag definition, apply
    # relief rules, merge the result into o.tags and rewrite the file if
    # anything changed. Returns 0 (unchanged), 1 (disk/model mismatch) or
    # 2 (file rewritten).
    if os.path.isdir(fn):
        fn = fn + '.json'
    f = open(fn, 'r', encoding='utf-8')
    lines = f.readlines()[1:-1]
    f.close()
    flines = [strictstrip(s) for s in lines]
    plines = sorted([strictstrip(s) for s in o.getJSON().split('\n')[1:-1]])
    # NOTE(review): flines is unsorted while plines is sorted — this relies
    # on the on-disk file already being stored in sorted order; confirm
    # against the canonicalising pass elsewhere in this toolchain.
    if flines != plines:
        return 1
    ts = []
    # precise case-sensitive match
    mcs = o.get('title')
    # precise match for substrings
    mes = baretext(mcs)
    # precise match for words
    mew = mes.split(' ')
    # imprecise match for substrings
    mis = superbaretext(mes)
    # imprecise match for words
    miw = mis.split(' ')
    # now match!
    for t in tags:
        # print('Check',t,'vs',mes)
        if 'name' not in t.keys():
            print(C.red('ERROR:'), 'no name for tag from file', t['FILE'])
            continue
        if all([not k.startswith('match') for k in t.keys()]):
            print(C.red('ERROR:'), 'no match rules for tag', t['name'])
            continue
        for k in t.keys():
            if k == 'matchentry':
                # matchentry lists entry keys verbatim
                if o.getKey() in t[k]:
                    ts += [t['name']]
            elif k.startswith('match'):
                # dispatch on the match mode named by the key
                ts += [
                    t['name'] for s in listify(t[k])
                    if matchModes[k](s, mcs, mes, mew, mis, miw)
                ]
                # ts += [t['name'] for s in listify(t[k]) if fmm(t, k, s, mcs, mes, mew, mis, miw)]
    # second pass: check reliefs
    for t in tags:
        if 'relieves' in t.keys():
            for r in listify(t['relieves']):
                if t['name'] in ts and r in ts:
                    # the stronger tag displaces the relieved one; count it
                    ts.remove(r)
                    if t['name'] not in relieved.keys():
                        relieved[t['name']] = 0
                    relieved[t['name']] += 1
    if ts:
        if not o.tags:
            o.tags = []
        for t in ts:
            if t not in o.tags:
                o.tags.append(t)
        # uncomment the following one line to overwrite all tags
        o.tags = uniq(ts)
        # let’s keep tags clean and sorted
        o.tags = sorted(o.tags)
    nlines = sorted([strictstrip(s) for s in o.getJSON().split('\n')[1:-1]])
    if plines != nlines:
        f = open(fn, 'w', encoding='utf-8')
        f.write(o.getJSON())
        f.close()
        return 2
    else:
        return 0
def checkon(fn, o):
    """Normalise one bibliographic entry in place and rewrite its file if changed.

    Fixes numeric words in (book)titles, dashes, quote styles, possessives,
    contractions, page ranges, trivial lists and DBLP disambiguation suffixes.
    Returns 2 when the file was rewritten, 0 otherwise.
    """
    if not os.path.exists(fn) or os.path.isdir(fn):
        fn = fn + '.json'
    plines = sorted([strictstrip(s) for s in o.getJSON().split('\n')[1:-1]])
    for k in o.json.keys():
        if 'type' not in o.json.keys():
            print('TERRIBLE', o.getKey())
        if (o.json['type'] == 'proceedings' and k == 'title') or\
           (o.json['type'] == 'inproceedings' and k == 'booktitle'):
            # fix numbers
            for nr in nrs.keys():
                if o.json[k].find(' ' + nr + ' ') > -1:
                    o.json[k] = o.json[k].replace(' ' + nr + ' ', ' ' + nrs[nr] + ' ')
        if isinstance(o.json[k], str):
            # add emdashes for fancier titles
            if k in ('title', 'booktitle'):
                o.json[k] = o.json[k].replace(' - ', ' — ').replace(' -- ', ' — ')
            # Nice heuristic to run from time to time, but reports too much
            # on stuff like “eXtreme” and “jPET”
            # if o.json[k][0].islower():
            #     print('[ {} ] {}: {} {}'.format(C.red('LOOK'), o.getKey(), 'title is', o.get('title')))
            # normalised pages
            if k == 'pages':
                o.json[k] = o.json[k].replace('–', '-').replace('--', '-').replace('−', '-')
            # double spaces
            # BUG FIX: the test and the replacement used a single space (a
            # no-op) although the comment promises collapsing double spaces
            if o.json[k].find('  ') > -1:
                o.json[k] = o.json[k].replace('  ', ' ').strip()
            # find numeric values, turn them into proper integers
            if o.json[k].isdigit():
                o.json[k] = int(o.json[k])
                continue
            # remove confix curlies
            elif o.json[k].startswith('{') and o.json[k].endswith('}'):
                o.json[k] = o.json[k][1:-1]
            # single quotes to double quotes
            elif o.json[k].find(" '") > -1 and o.json[k].find("' ") > -1:
                o.json[k] = o.json[k].replace(" '", ' "').replace("' ", '" ')
            elif o.json[k].find(" '") > -1 and o.json[k].endswith("'"):
                o.json[k] = o.json[k].replace(" '", ' "').replace("'", '"')
            elif o.json[k].find("' ") > -1 and o.json[k].startswith("'"):
                o.json[k] = o.json[k].replace("' ", '" ').replace("'", '"')
            # fancify bland quotes
            elif o.json[k].find(' "') > -1 and o.json[k].find('" ') > -1:
                o.json[k] = o.json[k].replace(' "', ' “').replace('" ', '” ')
            elif o.json[k].find(' "') > -1 and o.json[k].endswith('"'):
                o.json[k] = o.json[k].replace(' "', ' “').replace('"', '”')
            elif o.json[k].find('" ') > -1 and o.json[k].startswith('"'):
                o.json[k] = o.json[k].replace('" ', '” ').replace('"', '“')
            # fancify LaTeX quotes
            elif o.json[k].find(' ``') > -1 and o.json[k].find("'' ") > -1:
                o.json[k] = o.json[k].replace("'' ", '” ').replace(' ``', ' “')
            elif o.json[k].find(' ``') > -1 and o.json[k].endswith("''"):
                o.json[k] = o.json[k].replace("''", '”').replace(' ``', ' “')
            elif o.json[k].find("'' ") > -1 and o.json[k].startswith('``'):
                o.json[k] = o.json[k].replace("'' ", '” ').replace('``', '“')
            elif o.json[k].startswith('``') and o.json[k].endswith("''"):
                o.json[k] = '“' + o.json[k][2:-2] + '”'
            # plural possessive
            elif o.json[k].find("'s") > -1:
                o.json[k] = o.json[k].replace("'s", '’s')
            elif o.json[k].find("s' ") > -1:
                o.json[k] = o.json[k].replace("s'", 's’')
            # contractions
            elif o.json[k].find("n't") > -1:
                o.json[k] = o.json[k].replace("n't", 'n’t')
            # the case of "Jr" vs "Jr."
            if k in ('author', 'editor') and o.json[k].endswith('Jr'):
                o.json[k] += '.'
            # TODO: report remaining suspicious activity
            for c in '`"\'':
                # ’ is ok
                if c in o.json[k] and k not in ('author', 'editor'):
                    print('[ {} ] {}: {} is “{}”'.format(
                        C.red('LOOK'), o.getKey(), k, o.json[k]))
                    lookat.append(o.filename)
        elif isinstance(o.json[k], list):
            # inline trivial lists
            if len(o.json[k]) == 1:
                o.json[k] = o.json[k][0]
            # inline hidden trivial lists
            if len(o.json[k]) == 2 and o.json[k][0] == o.json[k][1] \
               and k not in ('stemmed', 'tag', 'tagged'):
                o.json[k] = o.json[k][0]
            # unless it’s 'tagged'
            if k == 'tagged' and not isinstance(o.json[k][0], list):
                o.json[k] = [o.json[k]]
            # remove DBLP disambiguation: we might later regret it
            # but the information can be always re-retrieved
            if k in ('author', 'editor'):
                nas = []
                for a in o.json[k]:
                    # double spaces (same BUG FIX as above: collapse two
                    # spaces into one, not a no-op single-space replace)
                    if a.find('  ') > -1:
                        a = a.replace('  ', ' ').strip()
                    ws = a.split(' ')
                    if ws[-1].isdigit():
                        ws = ws[:-1]
                    nas.append(' '.join(ws))
                o.json[k] = nas
                # the case of "Jr" vs "Jr."
                o.json[k] = [a + '.' if a.endswith(' Jr') else a for a in o.json[k]]
    nlines = sorted([strictstrip(s) for s in o.getJSON().split('\n')[1:-1]])
    if plines != nlines:
        # BUG FIX: write explicitly as UTF-8 (the sibling variant does); this
        # function emits em-dashes and curly quotes that the locale default
        # encoding may not be able to represent
        f = open(fn, 'w', encoding='utf-8')
        f.write(o.getJSON())
        f.close()
        return 2
    else:
        return 0
def checkon(fn, o):
    """Tag one entry: match its title against every tag definition, apply
    relief rules, merge the result into o.tags and rewrite the file when
    anything changed.

    Returns 0 (unchanged), 1 (disk/model mismatch) or 2 (file rewritten).
    """
    if os.path.isdir(fn):
        fn = fn + '.json'
    # BUG FIX: read/write explicitly as UTF-8 (the identical sibling variant
    # does); entries contain non-ASCII punctuation
    f = open(fn, 'r', encoding='utf-8')
    lines = f.readlines()[1:-1]
    f.close()
    flines = [strictstrip(s) for s in lines]
    plines = sorted([strictstrip(s) for s in o.getJSON().split('\n')[1:-1]])
    if flines != plines:
        return 1
    ts = []
    # precise case-sensitive match
    mcs = o.get('title')
    # precise match for substrings
    mes = baretext(mcs)
    # precise match for words
    mew = mes.split(' ')
    # imprecise match for substrings
    mis = superbaretext(mes)
    # imprecise match for words
    miw = mis.split(' ')
    # now match!
    for t in tags:
        # print('Check',t,'vs',mes)
        if 'name' not in t.keys():
            print(C.red('ERROR:'), 'no name for tag from file', t['FILE'])
            continue
        if all([not k.startswith('match') for k in t.keys()]):
            print(C.red('ERROR:'), 'no match rules for tag', t['name'])
            continue
        for k in t.keys():
            if k == 'matchentry':
                # matchentry lists entry keys verbatim
                if o.getKey() in t[k]:
                    ts += [t['name']]
            elif k.startswith('match'):
                # dispatch on the match mode named by the key
                ts += [t['name'] for s in listify(t[k])
                       if matchModes[k](s, mcs, mes, mew, mis, miw)]
                # ts += [t['name'] for s in listify(t[k]) if fmm(t, k, s, mcs, mes, mew, mis, miw)]
    # second pass: check reliefs
    for t in tags:
        if 'relieves' in t.keys():
            for r in listify(t['relieves']):
                if t['name'] in ts and r in ts:
                    ts.remove(r)
                    if t['name'] not in relieved.keys():
                        relieved[t['name']] = 0
                    relieved[t['name']] += 1
    if ts:
        if not o.tags:
            o.tags = []
        for t in ts:
            if t not in o.tags:
                o.tags.append(t)
        # uncomment the following one line to overwrite all tags
        o.tags = uniq(ts)
        # let’s keep tags clean and sorted
        o.tags = sorted(o.tags)
    nlines = sorted([strictstrip(s) for s in o.getJSON().split('\n')[1:-1]])
    if plines != nlines:
        f = open(fn, 'w', encoding='utf-8')
        f.write(o.getJSON())
        f.close()
        return 2
    else:
        return 0