allpaths.add(line)    # (fragment) each line of the known-path list is added to the allpaths set

missing = set()
found = set()
mapping = dict()
path2id = dict()

# docids we already have paths for:

already = pd.read_csv('ids2pathlist.tsv', sep='\t')
already = set(already.docid)

for anid in ids:
    if anid in already:
        continue
    path, postfix = utils.pairtreepath(anid, '')
    totalpath = path + postfix + '/' + utils.clean_pairtree(anid) + '.json.bz2'
    if totalpath not in allpaths:
        newid = anid.replace('uc1.b', 'uc1.$b')
        path, postfix = utils.pairtreepath(newid, '')
        totalpath = path + postfix + '/' + utils.clean_pairtree(
            newid) + '.json.bz2'
        if totalpath in allpaths:
            mapping[anid] = newid
            found.add(totalpath)
            path2id[totalpath] = anid
        else:
            missing.add(anid)
    else:
        found.add(totalpath)
        path2id[totalpath] = anid
Example #2
def get_pairtree(pairtreeroot, htid):
    '''Given a pairtree root directory and a HathiTrust volume id (htid),
    return the full path to the volume's bzipped JSON file.'''

    path, postfix = utils.pairtreepath(htid, pairtreeroot)
    wholepath = path + postfix + '/' + postfix + '.json.bz2'

    return wholepath
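
# A minimal usage sketch, not part of the original example: the pairtree root below
# is borrowed from a later example on this page, and the volume id is a hypothetical
# placeholder. Assumes SonicScrewdriver is importable as utils, as elsewhere on this page.
import SonicScrewdriver as utils

root = '/Volumes/TARDIS/work/ef/fic/'   # example pairtree root (from a later example)
htid = 'uc1.b1234567'                   # hypothetical HathiTrust volume id

wholepath = get_pairtree(root, htid)
print(wholepath)                        # pairtree path ending in '.json.bz2'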
Example #3
outmeta = outmeta.assign(author=outmeta.author.map(lower_and_trim))

outmeta.to_csv('mallet80metadata4experiment.tsv', sep='\t', index_label='docid')

# make paths to these volumes

import SonicScrewdriver as utils
import os

missing = set()
idmapper = dict()

for anid in allselected:

    path, postfix = utils.pairtreepath(anid, '/Volumes/TARDIS/work/ef/fic/')
    totalpath = path + postfix + '/' + utils.clean_pairtree(anid) + '.json.bz2'

    if not os.path.isfile(totalpath):
        if '$' in anid:
            # id already has the '$'; try the version without it
            newid = anid.replace('uc1.$b', 'uc1.b')
        else:
            # id lacks the '$'; try the uc1.$b version
            newid = anid.replace('uc1.b', 'uc1.$b')

        path, postfix = utils.pairtreepath(newid, '/Volumes/TARDIS/work/ef/fic/')
        totalpath = path + postfix + '/' + utils.clean_pairtree(newid) + '.json.bz2'
        if os.path.isfile(totalpath):
            idmapper[anid] = totalpath
        else:
            missing.add(anid)
    else:
        idmapper[anid] = totalpath   # file found at the expected path
Example #4
    metasource = pd.read_csv(args[1], sep='\t')

    missing = 0

    docstoprocess = metasource.docid

    for idx, docid in enumerate(docstoprocess):

        if idx % 100 == 1:
            print(idx)

        if docid in translations:
            docid = translations[docid]

        path, postfix = utils.pairtreepath(docid, '')
        inpath = rootpath + path + postfix + '/' + utils.clean_pairtree(
            docid) + '.json.bz2'

        if os.path.isfile(inpath):
            pass    # file exists at the expected path; no translation needed
        elif 'uc1.b' in docid:
            newdoc = docid.replace('uc1.b', 'uc1.$b')
            path, postfix = utils.pairtreepath(newdoc, '')
            inpath = rootpath + path + postfix + '/' + utils.clean_pairtree(
                newdoc) + '.json.bz2'
            if os.path.isfile(inpath):
                translations[docid] = newdoc
            else:
                missing += 1
                print(missing, inpath, 'not found.')
Example #5
outrows = []
missing = 0
themissing = []

for d in meta.docid:
    cleand = utils.clean_pairtree(d)
    dollarless = d.replace('$', '')

    if d in pathdict:
        outrows.append((d, pathdict[d]))
    elif cleand in pathdict:
        outrows.append((cleand, pathdict[cleand]))
    elif dollarless in pathdict:
        outrows.append((dollarless, pathdict[dollarless]))
    else:
        possiblepath, postfix = utils.pairtreepath(d, '')
        thepathtotest = '/Volumes/TARDIS/work/ef/fic/' + possiblepath + postfix + '/' + d + '.json.bz2'
        thepath = possiblepath + postfix + '/' + d + '.json.bz2'
        if os.path.isfile(thepathtotest):
            outrows.append((d, thepath))
            print('worked')
        else:
            thepathtotest = thepathtotest.replace('uc1.b', 'uc1.$b')
            thepath = thepath.replace('uc1.b', 'uc1.$b')
            if os.path.isfile(thepathtotest):
                outrows.append((d, thepath))
                print('worked')
            else:
                print('failed')
                missing += 1
                themissing.append(d)
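
# A hedged follow-up sketch, not in the original: persist the (docid, path) pairs
# collected in outrows above. The output filename 'docid2path.tsv' is a hypothetical
# placeholder; the two-column layout mirrors the ids2pathlist.tsv file read in the
# first example on this page.
import csv

with open('docid2path.tsv', mode='w', encoding='utf-8', newline='') as f:
    writer = csv.writer(f, delimiter='\t')
    writer.writerow(['docid', 'path'])
    writer.writerows(outrows)

print(missing, 'volumes not found:', themissing)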