# NOTE(review): the layout below is reconstructed from a whitespace-collapsed
# excerpt. This first statement presumably belongs to a loop that starts
# above the excerpt -- confirm its indentation against the full file.
allpaths.add(line)

# Bookkeeping filled in by the loop below:
#   missing  - ids whose path is not in allpaths under either spelling
#   found    - paths that are present in allpaths
#   mapping  - original id -> its 'uc1.$b' spelling, when only that one matched
#   path2id  - matched path -> the original id it was looked up for
missing = set()
found = set()
mapping = dict()
path2id = dict()

# Things we already have: the docid column of ids2pathlist.tsv.
already = pd.read_csv('ids2pathlist.tsv', sep='\t')
already = set(already.docid)

for anid in ids:
    # Ids already listed in ids2pathlist.tsv are not looked up again.
    if anid in already:
        continue

    # Relative path (empty root) of the volume's .json.bz2 file.
    path, postfix = utils.pairtreepath(anid, '')
    totalpath = path + postfix + '/' + utils.clean_pairtree(anid) + '.json.bz2'

    if totalpath not in allpaths:
        # Second attempt: same id with 'uc1.b' rewritten as 'uc1.$b'.
        newid = anid.replace('uc1.b', 'uc1.$b')
        path, postfix = utils.pairtreepath(newid, '')
        totalpath = path + postfix + '/' + utils.clean_pairtree(newid) + '.json.bz2'

        if totalpath in allpaths:
            # Remember the respelling; path2id still records the original id.
            mapping[anid] = newid
            found.add(totalpath)
            path2id[totalpath] = anid
        else:
            missing.add(anid)
    else:
        found.add(totalpath)
        path2id[totalpath] = anid
def get_pairtree(pairtreeroot, htid):
    """Build the path of the ``.json.bz2`` file for the volume *htid*.

    ``utils.pairtreepath(htid, pairtreeroot)`` supplies two strings: a
    directory prefix and a postfix. The result is
    ``<prefix><postfix>/<postfix>.json.bz2`` -- the postfix names both the
    final directory and the file inside it.
    """
    directory, stem = utils.pairtreepath(htid, pairtreeroot)
    return directory + stem + '/' + stem + '.json.bz2'
outmeta = outmeta.assign(author = outmeta.author.map(lower_and_trim)) outmeta.to_csv('mallet80metadata4experiment.tsv', sep = '\t', index_label = 'docid') # make paths to these volumes import SonicScrewdriver as utils import os missing = set() idmapper = dict() for anid in allselected: path, postfix = utils.pairtreepath(anid, '/Volumes/TARDIS/work/ef/fic/') totalpath = path + postfix + '/' + utils.clean_pairtree(anid) + '.json.bz2' if not os.path.isfile(totalpath): if '$' in anid: newid = anid.replace('uc1.b', 'uc1.$b') else: newid = anid.replace('uc1.$b', 'uc1.b') path, postfix = utils.pairtreepath(newid, '/Volumes/TARDIS/work/ef/fic/') totalpath = path + postfix + '/' + utils.clean_pairtree(newid) + '.json.bz2' if os.path.isfile(totalpath): idmapper[anid] = totalpath else: missing.add(anid) else:
# NOTE(review): the layout below is reconstructed from a whitespace-collapsed
# excerpt; confirm the nesting (in particular which `if` the final `else`
# belongs to) against the full file.

# Tab-separated metadata; only its docid column is used here.
metasource = pd.read_csv(args[1], sep='\t')

missing = 0  # running count of volumes whose file could not be located
docstoprocess = metasource.docid

for idx, docid in enumerate(docstoprocess):
    # Progress output at idx 1, 101, 201, ...
    if idx % 100 == 1:
        print(idx)

    # Reuse a previously recorded respelling of this id, if there is one.
    if docid in translations:
        docid = translations[docid]

    path, postfix = utils.pairtreepath(docid, '')
    inpath = rootpath + path + postfix + '/' + utils.clean_pairtree(docid) + '.json.bz2'

    if os.path.isfile(inpath):
        pass
    elif 'uc1.b' in docid:
        # Second attempt: same id with 'uc1.b' rewritten as 'uc1.$b'.
        newdoc = docid.replace('uc1.b', 'uc1.$b')
        path, postfix = utils.pairtreepath(newdoc, '')
        inpath = rootpath + path + postfix + '/' + utils.clean_pairtree(newdoc) + '.json.bz2'
        if os.path.isfile(inpath):
            # Record the respelling so later lookups can go straight to it.
            translations[docid] = newdoc
        else:
            # NOTE(review): as reconstructed, an id that is not found and does
            # not contain 'uc1.b' is not counted by this excerpt -- verify.
            missing += 1
            print(missing, inpath, 'not found.')
# NOTE(review): the layout below is reconstructed from a whitespace-collapsed
# excerpt; confirm the nesting against the full file.

outrows = []      # (id, path) pairs for every docid that could be resolved
missing = 0       # number of docids that could not be resolved
themissing = []   # the unresolved docids themselves

for d in meta.docid:
    # Alternative spellings of the id to try as keys of pathdict.
    cleand = utils.clean_pairtree(d)
    dollarless = d.replace('$', '')

    # Prefer a pathdict hit: raw id first, then the cleaned id, then the id
    # with every '$' removed. The spelling that matched is what gets recorded.
    if d in pathdict:
        outrows.append((d, pathdict[d]))
    elif cleand in pathdict:
        outrows.append((cleand, pathdict[cleand]))
    elif dollarless in pathdict:
        outrows.append((dollarless, pathdict[dollarless]))
    else:
        # Not in pathdict: construct the path and probe the filesystem.
        # thepathtotest is the absolute location that gets checked; thepath is
        # the same path without the root, which is what gets recorded.
        # The file name here uses the raw id d, not cleand.
        possiblepath, postfix = utils.pairtreepath(d, '')
        thepathtotest = '/Volumes/TARDIS/work/ef/fic/' + possiblepath + postfix + '/' + d + '.json.bz2'
        thepath = possiblepath + postfix + '/' + d + '.json.bz2'

        if os.path.isfile(thepathtotest):
            outrows.append((d, thepath))
            print('worked')
        else:
            # Second attempt: rewrite 'uc1.b' as 'uc1.$b' wherever it occurs
            # in the two path strings. The recorded id stays d.
            thepathtotest = thepathtotest.replace('uc1.b', 'uc1.$b')
            thepath = thepath.replace('uc1.b', 'uc1.$b')
            if os.path.isfile(thepathtotest):
                outrows.append((d, thepath))
                print('worked')
            else:
                print('failed')
                missing += 1
                themissing.append(d)