def makemetadata(code, datadir, outfile1, outfile2,
                 depends_on=(resource_root + 'ProcessedManifest_2_HandAdditions.tsv',
                             resource_root + 'Keywords.txt')):
    """Build the metadata pickle (keywords, description, contact info) and the
    Path -> FileName table for a downloaded BLS dataset."""
    Z = {}

    # Keywords for this dataset code, taken from the Keywords.txt resource.
    keyword_file = depends_on[1]
    Y = tb.tabarray(SVfile=keyword_file)[['Code', 'Keywords']]
    y = Y[Y['Code'] == code]
    Z['keywords'] = [x.strip() for x in str(y['Keywords'][0]).split(',')]

    dirl = np.array(listdir(datadir))
    pr = lambda x: x.split('!')[-1][:-4]      # drop any '!'-prefix and the file extension
    p = re.compile(r'\([^\)]*\)')             # parenthesized annotations to strip out

    # Description and per-file names parsed from the '.txt.txt' documentation file, if present.
    tps = [l for l in dirl if l.endswith('.txt.txt')]
    if tps:
        textpath = datadir + tps[0]
        [SD, things] = ParseTexts(textpath, code)
        FNs = [p.sub('', things[pr(y).lower()]).replace(' ,', ',').replace(',,', ',')
               if pr(y).lower() in things.keys() else '' for y in dirl]
        FNs = [z.split('=')[1] if '=' in z and not ' =' in z else z for z in FNs]
    else:
        SD = ''
        FNs = len(dirl) * ['']
    Z['description'] = SD

    # Contact information from the '.contacts.txt' file, with any HTML header stripped.
    cfs = [l for l in dirl if l.endswith('.contacts.txt')]
    if cfs:
        contactfile = datadir + cfs[0]
        ctext = open(contactfile, 'rU').read().strip()
        if '<html>' in ctext.lower():
            clines = ctext.split('\n')
            fb = [i for i in range(len(clines)) if clines[i].strip() == ''][0]
            ctext = '\n'.join(clines[fb + 1:])
        ctext = ctext.strip(' *\n').replace('\n\n', '\n')
    else:
        ctext = ''
    Z['contactInfo'] = ctext

    f = open(outfile1, 'w')
    pickle.dump(Z, f)
    f.close()

    # Override parsed file names with hand-made entries from the processed manifest.
    Y = tb.tabarray(SVfile=depends_on[0])
    Y.sort(order=['File'])
    dirlp = np.array([pr(y) for y in dirl])
    [A, B] = tb.fast.equalspairs(dirlp, Y['File'])
    if (B > A).any():
        print 'adding hand-made content to', dirlp[B > A]
        for k in (B > A).nonzero()[0]:
            FNs[k] = Y['FileName'][A[k]]

    D = tb.tabarray(columns=[dirl, FNs], names=['Path', 'FileName'])
    D.saveSV(outfile2, metadata=True)
def OGRInspectorInstantiator(depends_on=root, creates=protocol_root + "OGRInspectors.py"):
    L = [l for l in listdir(depends_on) if IsDir(depends_on + l)]
    outdir = "../Data/ShapeFileOGRInspections/"
    D = [("initialize", MakeDir, (outdir,))]
    D += [("Inspect_" + l, ogrinspectdir, (depends_on + l, l, outdir + l + ".py")) for l in L]
    actualize(creates, D)
def processtextfile(datadir, outfile):
    dirl = listdir(datadir)
    tps = [l for l in dirl if l.endswith('.txt.txt')]
    if tps:
        textpath = datadir + tps[0]
        strongcopy(textpath, outfile)
    else:
        F = open(outfile, 'w')
        F.write('No documentation file found.')
        F.close()
def identifybasepath(base, datadir):
    L = listdir(datadir)
    L1 = [x.split('.')[-2] for x in L]
    L2 = [x.split('.')[-2].replace('_', '') for x in L]
    if base in L1:
        return datadir + L[L1.index(base)]
    elif base in L2:
        return datadir + L[L2.index(base)]
    elif base.replace('_', '') in L1:
        return datadir + L[L1.index(base.replace('_', ''))]
    elif base.replace('_', '') in L2:
        return datadir + L[L2.index(base.replace('_', ''))]
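# Matching example (the filename below is an illustrative assumption, not taken
# from a real BLS listing): for a code file 'cu.item.txt', x.split('.')[-2] is
# 'item', so identifybasepath('item', datadir) would return datadir + 'cu.item.txt'.
# Underscore-insensitive variants are also tried, so a base like 'sub_item' could
# still match a file whose segment reads 'subitem'. If nothing matches, the
# function falls through and returns None, which callers check for.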
def bls_downloader(download_dir, code):
    """Mirror the BLS time-series FTP directory for `code` into download_dir and
    stage the files for parsing."""
    MakeDirs(download_dir)
    download_dir += ('/' if download_dir[-1] != '/' else '')
    MakeDir(download_dir + 'RawDownloads/')

    # Scrape the FTP index page for file links.
    get = "ftp://ftp.bls.gov/pub/time.series/" + code + '/'
    WgetMultiple(get, download_dir + 'RawDownloads/index.html')
    Soup = BeautifulSoup(open(download_dir + 'RawDownloads/index.html'))
    A = Soup.findAll('a')
    Records = [(Contents(a), str(dict(a.attrs)['href'])) for a in A]
    Records = [r for r in Records if 'Current' not in r[0].split('.')]

    # If an 'AllData' file exists, prefer it and skip the individual '.data.' files.
    RecordsR = [r for r in Records if 'AllData' in r[0]]
    if RecordsR:
        Records = RecordsR + [r for r in Records if not '.data.' in r[0]]

    T = tb.tabarray(records=Records, names=['File', 'URL'])
    for (f, u) in T:
        wget(u, download_dir + 'RawDownloads/' + f + '.txt')

    makemetadata(code, download_dir + 'RawDownloads/',
                 download_dir + 'metadata.pickle', download_dir + 'filenames.tsv')

    MakeDir(download_dir + '__FILES__')
    processtextfile(download_dir + 'RawDownloads/',
                    download_dir + '__FILES__/documentation.txt')

    # Move the raw data files aside for parsing.
    MakeDir(download_dir + '__PARSE__')
    for l in listdir(download_dir + 'RawDownloads/'):
        if '.data.' in l:
            Rename(download_dir + 'RawDownloads/' + l, download_dir + '__PARSE__/' + l)

    SPs = [download_dir + 'RawDownloads/' + l
           for l in listdir(download_dir + 'RawDownloads/') if l.endswith('.series.txt')]
    assert len(SPs) == 1, 'Wrong number of series paths.'
    serpath = SPs[0]
    parse_series(download_dir + 'RawDownloads/', download_dir + 'series.txt')
    delete(serpath)
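# Hypothetical usage sketch (the series code 'cu' and the target directory are
# illustrative assumptions, not values used elsewhere in this module):
#
#   bls_downloader('../Data/BLS/cu/', 'cu')
#
# This would mirror ftp://ftp.bls.gov/pub/time.series/cu/ into
# ../Data/BLS/cu/RawDownloads/ and leave metadata.pickle, filenames.tsv,
# series.txt, __FILES__/ and __PARSE__/ alongside it.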
def generate_metadata(objname, forced=False, use=100):
    """Generate (or load cached) automatic metadata for a file, directory, or
    dotted python object path, recursing over directories."""
    metapath = opmetadatapath(objname) if IsDotPath(objname) else metadatapath(objname)
    if IsDotPath(objname):
        path = '../' + '/'.join(objname.split('.')[:-1]) + '.py'
        objectname = objname.split('.')[-1]
    else:
        path = objname
        objectname = ''

    # Regenerate if forced, if no cached metadata exists, or if the object is newer than the cache.
    if forced or not PathExists(metapath) or os.path.getmtime(metapath) <= FindMtime(path, objectname=objectname, Simple=False):
        if IsDir(objname):
            if objname[-1] != '/':
                objname += '/'
            if is_hsv_dir(objname):
                pass
            else:
                # Recurse into the directory and integrate the children's metadata.
                D = {}
                L = [objname + ll for ll in listdir(objname) if not ll.startswith('.')]
                for l in L:
                    D.update(generate_metadata(l, forced=forced))
                LL = set(L).intersection(D.keys())
                D[objname] = IntegrateDirMetaData([D[l] for l in LL])
                return D
        else:
            if IsPythonFile(objname) or IsDotPath(objname):
                d = StoredDocstring(objname)
                if d:
                    return {objname: {'description': d, 'signature': 'python'}}
            elif objname.endswith(('.csv', '.tsv')):
                if IsFile(objname):
                    try:
                        x = tabularmetadata(objname, use=use)
                    except:
                        x = DEFAULT_GenerateAutomaticMetaData(objname)
                        print 'Failed to get tabular metadata for', objname
                        print_exc()
                    else:
                        x['signature'] = 'tabular'
                    return {objname: x}
        return {}
    else:
        # Cached metadata is up to date; load it, regenerating from scratch on failure.
        try:
            return {objname: pickle.load(open(metapath + '/AutomaticMetaData.pickle', 'r'))}
        except:
            return generate_metadata(objname, forced=True)
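# Hypothetical usage (the directory name is an illustrative assumption):
#
#   meta = generate_metadata('../Data/BLS/', forced=True)
#
# For a directory this returns a dict mapping each non-hidden path beneath it,
# plus the directory itself, to its metadata record; for a single .csv/.tsv file
# or python object it returns a one-entry dict.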
def parse_series(datadir, outpath, units=''):
    """Join the BLS *.series.txt file in datadir against its code-mapping files
    and stream the decoded result to outpath in blocks."""
    SPs = [datadir + l for l in listdir(datadir) if l.endswith('.series.txt')]
    assert len(SPs) == 1, 'Wrong number of series paths.'
    serpath = SPs[0]
    F = open(serpath, 'rU')
    names = F.readline().rstrip('\n').split('\t')

    # For each column of the series file, try to locate and open the matching code file.
    codefiles = {}
    codenames = {}
    bases = {}
    for name in names:
        base = identifybase(name)
        basepath = identifybasepath(base, datadir)
        if basepath != None:
            print name, basepath
            codefile = CleanOpen(basepath)
            if codefile != None:
                codename = identifycode(base, codefile.dtype.names)
                if codename != None:
                    codenames[name] = codename
                    codefiles[name] = codefile[[n for n in codefile.dtype.names if n.startswith(base)]]
                    bases[name] = base
                else:
                    print '\n\nWARNING: Problem with code for', name, 'in file', basepath, '\n\n'
            else:
                print '\n\nWARNING: Can\'t seem to open', basepath
        else:
            print '\n\nWARNING: Problem with finding basepath for', name, 'in', datadir

    # Read the series file in blocks, decode coded columns, and append each block to the output.
    blocksize = 750000
    done = False
    while not done:
        lines = [F.readline().rstrip('\n').split('\t') for i in range(blocksize)]
        lines = [l for l in lines if l != ['']]
        if len(lines) > 0:
            X = tb.tabarray(records=lines, names=names)
            NewCols = []
            NewNames = []
            for name in names:
                if name in codenames.keys():
                    codefile = codefiles[name]
                    base = bases[name]
                    codename = codenames[name]
                    Xn = np.array([xx.strip() for xx in X[name]])
                    Cn = np.array([xx.strip() for xx in codefile[codename]])
                    [S1, S2] = tb.fast.equalspairs(Xn, Cn)
                    NewCols += [codefile[n][S1] for n in codefile.dtype.names if n != codename]
                    NewNames += [n for n in codefile.dtype.names if n != codename]
            X = X.addcols(NewCols, names=NewNames)
            X.coloring['NewNames'] = NewNames

            # `units` is either an existing column name (no spaces, grouped under a
            # 'Units' coloring), a literal unit string (added as a constant column),
            # or an 'if ...' rule handled by the hard-coded branch below.
            if units != '':
                if ' ' not in units:
                    if units:
                        X.coloring['Units'] = [units]
                elif not units.startswith('if '):
                    X = X.addcols([[units] * len(X)], names=['Units'])
                else:
                    X = X.addcols([[rec['earn_text'] if rec['tdata_text'] == 'Person counts (number in thousands)'
                                    else rec['pcts_text'] for rec in X]],
                                  names=['Units'])
            tb.io.appendSV(outpath, X, metadata=True)
        else:
            done = True
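# Hypothetical standalone usage (paths are illustrative assumptions):
#
#   parse_series('../Data/BLS/cu/RawDownloads/', '../Data/BLS/cu/series.txt')
#
# bls_downloader above calls this after moving the *.data.* files into
# __PARSE__/, so RawDownloads/ is expected to contain exactly one *.series.txt
# file plus the code-mapping files it references.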