Example #1
def makemetadata(code,datadir,outfile1,outfile2,depends_on = (resource_root + 'ProcessedManifest_2_HandAdditions.tsv',resource_root + 'Keywords.txt')):

    Z = {}   # metadata fields, pickled to outfile1 below

    keyword_file = depends_on[1]
    Y = tb.tabarray(SVfile = keyword_file)[['Code','Keywords']]
    y = Y[Y['Code'] == code]
    Z['keywords'] = [x.strip() for x in str(y['Keywords'][0]).split(',')]
    
    
    dirl = np.array(listdir(datadir))
    
    pr = lambda x : x.split('!')[-1][:-4]   # strip any '!'-delimited prefix and the 4-character extension
    p = re.compile(r'\([^)]*\)')            # parenthesized annotations, to be removed
    
    tps = [l for l in dirl if l.endswith('.txt.txt')]
    if tps:
        textpath = datadir + tps[0]
        [SD,things] = ParseTexts(textpath,code)
        FNs = [p.sub('',things[pr(y).lower()]).replace(' ,',',').replace(',,',',') if pr(y).lower() in things else '' for y in dirl]
        FNs = [z.split('=')[1] if '=' in z and ' =' not in z else z for z in FNs]
    else:
        SD = ''
        FNs = len(dirl)*['']
        
    Z['description'] = SD
    
    cfs = [l for l in dirl if l.endswith('.contacts.txt')]
    if cfs:
        contactfile = datadir + cfs[0]
        ctext = open(contactfile,'rU').read().strip()
        if '<html>' in ctext.lower():
            clines = ctext.split('\n')
            fb = [i for i in range(len(clines)) if clines[i].strip() == ''][0]   # first blank line ends the HTML header block
            ctext = '\n'.join(clines[fb+1:])
        ctext = ctext.strip(' *\n').replace('\n\n','\n')    
    else:
        ctext = ''
        
    Z['contactInfo'] = ctext
    f = open(outfile1,'w')
    pickle.dump(Z,f)
    f.close()

    Y = tb.tabarray(SVfile = depends_on[0])
    Y.sort(order = ['File'])

    
    dirlp = np.array([pr(y) for y in dirl])
    [A,B] = tb.fast.equalspairs(dirlp,Y['File'])
    if (B>A).any():
        print 'adding hand-made content to', dirlp[B>A]
        for k in (B>A).nonzero()[0]:   # indices whose path matches a manifest entry
            FNs[k] = Y['FileName'][A[k]]
    
    D = tb.tabarray(columns=[dirl,FNs], names = ['Path','FileName'])
    
    D.saveSV(outfile2,metadata = True)  
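A note on Example #1's filename cleanup: pr strips any '!'-delimited prefix and the four-character extension, and the compiled pattern deletes parenthesized annotations. A minimal self-contained sketch (the sample path is fabricated for illustration):

    import re

    pr = lambda x: x.split('!')[-1][:-4]    # keep the text after the last '!', drop the 4-char extension
    p = re.compile(r'\([^)]*\)')            # parenthesized annotations, for removal

    sample = 'bls!cu.area(see notes).txt'   # hypothetical directory entry
    key = pr(sample).lower()                # -> 'cu.area(see notes)'
    print(p.sub('', key))                   # -> 'cu.area'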
Example #2
def OGRInspectorInstantiator(depends_on=root, creates=protocol_root + "OGRInspectors.py"):
    L = [l for l in listdir(depends_on) if IsDir(depends_on + l)]

    outdir = "../Data/ShapeFileOGRInspections/"

    D = [("initialize", MakeDir, (outdir,))]
    D += [("Inspect_" + l, ogrinspectdir, (depends_on + l, l, outdir + l + ".py")) for l in L]

    actualize(creates, D)
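actualize, MakeDir, and ogrinspectdir are helpers from the surrounding pipeline framework and are not shown here. Going by the (name, function, args) tuples built above, a minimal stand-in runner might look like the following; run_steps and make_dir are hypothetical names, not the framework's API:

    import os

    def make_dir(path):                 # hypothetical stand-in for MakeDir
        if not os.path.exists(path):
            os.makedirs(path)

    def run_steps(steps):               # hypothetical stand-in for actualize's execution loop
        for name, func, args in steps:
            print('running step: ' + name)
            func(*args)

    run_steps([('initialize', make_dir, ('../Data/ShapeFileOGRInspections/',))])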
Example #3
def processtextfile(datadir,outfile):
    dirl = listdir(datadir)
    tps = [l for l in dirl if l.endswith('.txt.txt')]
    if tps:
        textpath = datadir + tps[0]
        strongcopy(textpath,outfile)

    else:
        F = open(outfile,'w')
        F.write('No documentation file found.')
        F.close()
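strongcopy is defined elsewhere in this codebase; assuming it simply copies a file verbatim, shutil.copyfile would be a plausible stand-in:

    import shutil

    def strongcopy(src, dst):           # plausible stand-in; the real helper is defined elsewhere
        shutil.copyfile(src, dst)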
Example #4
def identifybasepath(base,datadir):
    L = listdir(datadir)
    L1 = [x.split('.')[-2] for x in L]
    L2 = [x.split('.')[-2].replace('_','') for x in L]
    if base in L1:
        return datadir + L[L1.index(base)]
    elif base in L2:
        return datadir + L[L2.index(base)]
    elif base.replace('_','') in L1:
        return datadir + L[L1.index(base.replace('_',''))]
    elif base.replace('_','') in L2:
        return datadir + L[L2.index(base.replace('_',''))]
    return None   # no match under any underscore variant
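The matching order in Example #4, illustrated with fabricated names: an exact match on the second-to-last dot-separated token is tried first, then underscore-stripped variants on either side:

    L = ['cu.ar_ea.txt', 'cu.item.txt']                    # fabricated directory listing
    L1 = [x.split('.')[-2] for x in L]                     # ['ar_ea', 'item']
    L2 = [x.split('.')[-2].replace('_','') for x in L]     # ['area', 'item']
    print(L1.index('ar_ea'))                               # exact match    -> 0
    print(L2.index('area'))                                # stripped match -> 0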
Example #5
def bls_downloader(download_dir,code):

    MakeDirs(download_dir)
    download_dir += ('/' if download_dir[-1] != '/' else '')

    MakeDir(download_dir + 'RawDownloads/')
    
    get = "ftp://ftp.bls.gov/pub/time.series/" + code + '/'
    
    WgetMultiple(get,download_dir + 'RawDownloads/index.html')
    Soup = BeautifulSoup(open(download_dir + 'RawDownloads/index.html'))
    A = Soup.findAll('a')
    Records = [(Contents(a),str(dict(a.attrs)['href'])) for a in A]
    Records = [r for r in Records if 'Current' not in r[0].split('.')]
    RecordsR = [r for r in Records if 'AllData' in r[0]]
    if RecordsR:
        Records = RecordsR + [r for r in Records if '.data.' not in r[0]]
    T = tb.tabarray(records = Records,names = ['File','URL'])
    for (f,u) in T:
        wget(u,download_dir + 'RawDownloads/' + f + '.txt')

    makemetadata(code,download_dir + 'RawDownloads/',download_dir + 'metadata.pickle',download_dir + 'filenames.tsv')
    
    MakeDir(download_dir + '__FILES__')
    
    processtextfile(download_dir + 'RawDownloads/',download_dir + '__FILES__/documentation.txt')
    
    MakeDir(download_dir + '__PARSE__')
    for l in listdir(download_dir + 'RawDownloads/'):
        if '.data.' in l:
            Rename(download_dir + 'RawDownloads/' + l, download_dir + '__PARSE__/' + l)

    SPs = [download_dir + 'RawDownloads/' + l for l in listdir(download_dir + 'RawDownloads/') if l.endswith('.series.txt')]
    assert len(SPs) == 1, 'Wrong number of series paths.'
    serpath = SPs[0]    
    parse_series(download_dir + 'RawDownloads/',download_dir + 'series.txt')
    delete(serpath)
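WgetMultiple and Contents are pipeline helpers; the anchor-scraping step alone can be sketched with plain BeautifulSoup 4 (the HTML snippet is fabricated, and the real code reads the downloaded index.html instead):

    from bs4 import BeautifulSoup

    html = '<a href="cu.data.1.AllItems">cu.data.1.AllItems</a><a href="cu.series">cu.series</a>'
    soup = BeautifulSoup(html, 'html.parser')
    records = [(a.get_text(), a['href']) for a in soup.find_all('a')]
    records = [r for r in records if 'Current' not in r[0].split('.')]
    print(records)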
Example #6
def generate_metadata(objname,forced=False,use=100):

    metapath = opmetadatapath(objname) if IsDotPath(objname) else metadatapath(objname)
    if IsDotPath(objname):
        path = '../' + '/'.join(objname.split('.')[:-1]) + '.py'
        objectname = objname.split('.')[-1]
    else:
        path = objname
        objectname = ''
        
    if forced or not PathExists(metapath) or os.path.getmtime(metapath) <= FindMtime(path,objectname=objectname,Simple=False):
        if IsDir(objname):
            if objname[-1] != '/': objname += '/'
            if not is_hsv_dir(objname):
                D = {}
                L = [objname + ll for ll in listdir(objname) if not ll.startswith('.')]
                for l in L:
                    D.update(generate_metadata(l,forced=forced))
                LL = set(L).intersection(D.keys())
                D[objname] = IntegrateDirMetaData([D[l] for l in LL])
                return D
    
        else:
            if IsPythonFile(objname) or IsDotPath(objname):
                d = StoredDocstring(objname)
                if d:
                    return {objname:{'description':d,'signature': 'python'}}        
        
            elif objname.endswith(('.csv','.tsv')):
                if IsFile(objname):
                    try:
                        x = tabularmetadata(objname,use=use)
                    except:
                        x = DEFAULT_GenerateAutomaticMetaData(objname)
                        print 'Failed to generate tabular metadata for', objname
                        print_exc()
                    else:
                        x['signature'] = 'tabular'
                    return {objname : x}
    
                        
        return {}
    else:
        try:
            return {objname:pickle.load(open(metapath+'/AutomaticMetaData.pickle','r'))}
        except:
            return generate_metadata(objname,forced=True)
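The cache-invalidation rule in Example #6, distilled: rebuild when no stored metadata exists or when the source changed after the cache was written. A minimal sketch, assuming plain file mtimes stand in for FindMtime (cached_or_rebuild is a hypothetical name, not the library's API):

    import os
    import pickle

    def cached_or_rebuild(metapath, srcpath, rebuild):
        if not os.path.exists(metapath) or os.path.getmtime(metapath) <= os.path.getmtime(srcpath):
            return rebuild()                             # cache missing or stale: recompute
        with open(metapath, 'rb') as f:
            return pickle.load(f)                        # cache is fresh: reuse it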
Example #7
def parse_series(datadir,outpath,units=''):
    SPs = [datadir + l for l in listdir(datadir) if l.endswith('.series.txt')]
    assert len(SPs) == 1, 'Wrong number of series paths.'
    serpath = SPs[0]
    F = open(serpath,'rU')
    names = F.readline().rstrip('\n').split('\t')

    codefiles = {}   # lookup tables parsed from each code file
    codenames = {}   # key column within each lookup table
    bases = {}       # base token linking a series column to its code file
    for name in names:
        base = identifybase(name)
        basepath = identifybasepath(base,datadir)
        if basepath is not None:
            print name, basepath
            codefile = CleanOpen(basepath)
            if codefile is not None:
                codename = identifycode(base,codefile.dtype.names)
                if codename is not None:
                    codenames[name] = codename
                    codefiles[name] = codefile[[n for n in codefile.dtype.names if n.startswith(base)]]
                    bases[name] = base
                else:
                    print '\n\nWARNING: Problem with code for' , name , 'in file', basepath, '\n\n'
            else:
                print '\n\nWARNING: Can\'t seem to open', basepath
        else:
            print '\n\nWARNING: Problem finding basepath for', name, 'in', datadir
    
    blocksize = 750000   # rows per chunk when streaming the series file


    done = False

    while not done:
        lines = [F.readline().rstrip('\n').split('\t') for i in range(blocksize)]
        lines = [l for l in lines if l != ['']]
        if len(lines) > 0:
            X = tb.tabarray(records = lines,names = names)
            NewCols = []
            NewNames = []
            for name in names:
                if name in codenames:
                    codefile = codefiles[name]
                    base = bases[name]
                    codename = codenames[name]
                    Xn = np.array([xx.strip() for xx in X[name]])
                    Cn = np.array([xx.strip() for xx in codefile[codename]])
                    [S1,S2] = tb.fast.equalspairs(Xn,Cn)
        
                    NewCols += [codefile[n][S1] for n in codefile.dtype.names if n != codename]
                    NewNames += [n for n in codefile.dtype.names if n != codename]
            X = X.addcols(NewCols,  names = NewNames)
            X.coloring['NewNames'] = NewNames
            
            if units != '':
                if ' ' not in units:
                    # single-token unit: record it as a coloring tag
                    X.coloring['Units'] = [units]
                elif not units.startswith('if '):
                    # literal multi-word unit string: attach it as a constant column
                    X = X.addcols([[units]*len(X)], names=['Units'])
                else:
                    # conditional spec: pick the unit per record from the tdata_text field
                    X = X.addcols([[rec['earn_text'] if rec['tdata_text'] == 'Person counts (number in thousands)' else rec['pcts_text'] for rec in X]], names=['Units'])

            tb.io.appendSV(outpath,X,metadata=True)
        else:
            done = True
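The streaming pattern in Example #7, reading a large tab-separated file in fixed-size blocks rather than all at once, can be distilled into a self-contained generator (the file layout is assumed to match the .series.txt format above):

    def read_blocks(path, blocksize=750000):
        with open(path) as f:
            names = f.readline().rstrip('\n').split('\t')   # header row
            while True:
                lines = [f.readline().rstrip('\n').split('\t') for _ in range(blocksize)]
                lines = [l for l in lines if l != ['']]     # drop padding past end of file
                if not lines:
                    break
                yield names, lines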