def tabularmetadataforms(pathlist, depends_on='../System/MetaData/'):
    attlist = ['description', 'author', 'title', 'keywords']
    recs1 = []
    recs2 = []
    for x in pathlist:
        print x
        mdp = metadatapath(x) + '/ProcessedMetaData.pickle'
        if PathExists(mdp):
            M = pickle.load(open(mdp))
            D = {}
            for att in attlist:
                if att in M.keys():
                    D[att] = M[att]
                else:
                    D[att] = ''
            recs1.append((x,) + tuple([D[att].replace('\n', ' ') for att in attlist]))
            colnames = M['colnames']
            if 'coldescrs' in M.keys():
                coldescrs = [M['coldescrs'][m] if m in M['coldescrs'].keys() else ''
                             for m in colnames]
            else:
                coldescrs = [''] * len(colnames)
            recs2 += zip([x] * len(colnames), colnames, coldescrs)
    X = tb.tabarray(records=recs1, names=['Path'] + attlist)
    Y = tb.tabarray(records=recs2, names=['Path', 'ColName', 'ColDescr'])
    return [X, Y]
def ParseFiles(depends_on=(root_wf + '1To10000.html',
                           root_wf + '10001To20000.html',
                           root_wf + '20001To30000.html',
                           root_wf + '30001To40000.html'),
               creates=root_wf + 'WordFrequencies.csv'):
    Words = []
    Freqs = []
    Rank = []
    for (j, x) in enumerate(depends_on):
        Soup = BeautifulSoup(open(x, 'r'))
        P = Soup.findAll('p')
        count = 0
        for (i, p) in enumerate(P):
            print 'processing', x, ', group', i
            A = p.findAll('a')
            if len(A) > 10:
                C = Contents(p).replace(' = ', ' ').split(' ')
                newwords = C[::2]
                newfreqs = C[1::2]
                Words += newwords
                Freqs += newfreqs
                Rank += range(1 + j * 10000 + count,
                              1 + j * 10000 + count + len(newwords))
                count += len(newwords)
    tb.tabarray(columns=[Words, Freqs, Rank],
                names=['Word', 'Frequency', 'Rank']).saveSV(creates, delimiter=',')
def _normalize_array(x):
    flour = tb.tabarray(columns=[x['flour_all'] + x['flour_almond'] +
                                 x['flour_bread'] + x['flour_cake'] + x['flour_other'] +
                                 x['flour_rice'] + x['flour_wheat'] + x['flour_rye']],
                        names=['flour'])
    liquid = tb.tabarray(columns=[x['milk'] + x['water'] +
                                  x['soymilk'] + x['buttermilk'] + x['juice_apple'] +
                                  x['juice_can'] + x['juice_lemon'] + x['juice_lime'] +
                                  x['juice_orange'] + x['juice_other'] + x['juice_pineapple']],
                         names=['liquid'])
    fat = tb.tabarray(columns=[x['butter'] + x['cream_cheese'] +
                               x['cream_half'] + x['cream_ice'] + x['cream_other'] +
                               x['cream_tartar'] + x['cream_whipped'] + x['margarine'] +
                               x['oil_canola'] + x['oil_olive'] + x['oil_other'] +
                               x['oil_vegetable'] + x['lard'] + x['shortening']],
                      names=['fat'])
    sugar = tb.tabarray(columns=[x['sugar_brown'] + x['sugar_powder'] +
                                 x['sugar_white']], names=['sugar'])
    a = x[['egg']].colstack(fat).colstack(flour).colstack(liquid)\
                  .colstack(sugar).extract()
    # normalize each row so the egg/fat/flour/liquid/sugar fractions sum to 1
    a = a / np.repeat(a.sum(axis=1), a.shape[1]).reshape(a.shape[0], a.shape[1])
    return a
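# Illustrative check for _normalize_array (an assumption, not part of the original
# pipeline): `recipes` is assumed to be a tabarray carrying the ingredient columns
# referenced above; each normalized row should sum to 1.
def _check_normalize_array(recipes):
    a = _normalize_array(recipes)
    assert np.allclose(a.sum(axis=1), 1.0)
    return a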
def make_background_db(creates="../background_certificate.txt",
                       depends_on=("../3d_hdr_backgrounds.csv",
                                   "../2d_grayscale_backgrounds.csv")):
    conn = pm.Connection()
    db = conn["dicarlocox_3dmodels"]
    db.drop_collection("3d_spherical_backgrounds")
    coll = db["3d_spherical_backgrounds"]
    recs = [
        {"name": "backlot", "path": "backlot.tdl"},
        {"name": "apartment", "path": "apartment.tdl"},
        {"name": "empty room", "path": "empty_room.tdl"},
        {"name": "office", "path": "office.tdl"},
    ]
    for rec in recs:
        coll.insert(rec)
    X = tb.tabarray(SVfile=depends_on[0])
    recs = [{"name": x["Path"][:-4],
             "path": x["Path"],
             "description": x["Description"],
             "type": "3d hdr"} for x in X]
    for rec in recs:
        coll.insert(rec)
    X = tb.tabarray(SVfile=depends_on[1])
    recs = [{"name": x["Path"][:-4],
             "path": x["Path"],
             "type": "2d grayscale"} for x in X]
    for rec in recs:
        coll.insert(rec)
    rec = {"name": "blank gray image", "path": "gray.td", "type": "blank"}
    coll.insert(rec)
def TestPivot2():
    X = tb.tabarray(records=[('x', 1, 3, 6), ('y', 0, 3, 1), ('x', 0, 3, 5)],
                    names=['a', 'b', 'c', 'd'])
    Y = X.pivot('b', 'a')
    Z = tb.tabarray(records=[(0, 3, 3, 5, 1), (1, 3, 0, 6, 0)],
                    names=['b', 'x_c', 'y_c', 'x_d', 'y_d'])
    assert (Y == Z).all()
def specific_config_gen(IC, args):
    IC.base_dir = args['base_dir']
    IC.annotate_dir = args['annotate_dir']
    IC.groundtruth_dir = args['groundtruth_dir']
    IC.correspondence = tb.tabarray(SVfile=args['frame_correspondence'])
    IC.size = args['size']
    IC.prefix = prefix = args.get('image_extension', '.jpg')
    IC.current_frame_path = None
    csvs = [x for x in os.listdir(IC.annotate_dir) if x.endswith('.csv')]
    csvs.sort()
    Xs = [tb.tabarray(SVfile=os.path.join(IC.annotate_dir, csv)) for csv in csvs]
    cns = [csv.split('.')[0] for csv in csvs]
    cns = [[cn] * len(X) for (cn, X) in zip(cns, Xs)]
    Xs = [X.addcols(cn, names=['clip_num']) for (cn, X) in zip(cns, Xs)]
    csvs = [x for x in os.listdir(IC.groundtruth_dir) if x.endswith('.csv')]
    csvs.sort()
    Gs = []
    fields = ['clip_num', 'Frame'] + xfields + yfields
    for ind, csv in enumerate(csvs):
        try:
            g = tb.tabarray(SVfile=os.path.join(IC.groundtruth_dir, csv))
        except:
            x = Xs[ind].addcols([-1] * len(Xs[ind]), names=['Correctness'])
        else:
            g = g.addcols([csv.split('.')[0]] * len(g), names=['clip_num'])
            g = g[fields + ['Confidence']]
            g.renamecol('Confidence', 'Correctness')
            x = Xs[ind].join(g, keycols=fields)
        Gs.append(x)
    X = tb.tab_rowstack(Gs)
    X.sort(order=['clip_num', 'Frame'])
    Y = IC.correspondence
    F = tb.fast.recarrayisin(Y[['clip_num', 'Frame']], X[['clip_num', 'Frame']])
    Y = Y[F]
    X = X.join(Y, keycols=['clip_num', 'Frame'])
    params = []
    for t in X:
        print(t)
        cn = t['clip_num']
        fr = t['Frame']
        box = get_darpa_box(t)
        bb = box.pop('box')
        xc, yc = bb.center
        center = correct_center((xc, yc), IC.size, (1920, 1080))
        bb_new = bbox.BoundingBox(center=center, width=IC.size[0], height=IC.size[1])
        p = SON([('size', IC.size),
                 ('bounding_box', SON([('xfields', list(bb_new.xs)),
                                       ('yfields', list(bb_new.ys))])),
                 ('original_bounding_box', SON([('xfields', list(bb.xs)),
                                                ('yfields', list(bb.ys))])),
                 ('clip_num', cn),
                 ('Frame', int(t['Original'])),
                 ('base_dir', IC.base_dir),
                 ('correctness', int(t['Correctness']))])
        p.update(box)
        p['GuessObjectType'] = p['ObjectType']
        p['ObjectType'] = p['ObjectType'] if t['Correctness'] == 1 else ''
        params.append(SON([('image', p)]))
    return params
def apply_binary(din='../data/compas', froot='compas', neg_names=None, prefix=''):
    ftrain = os.path.join(din, '%s%s_train.out' % (prefix, froot))
    ftrain_label = os.path.join(din, '%s%s_train.label' % (prefix, froot))
    ftest = os.path.join(din, '%s_test.csv' % froot)
    fout = os.path.join(din, '%s%s_test.out' % (prefix, froot))
    flabel = os.path.join(din, '%s%s_test.label' % (prefix, froot))
    x = tb.tabarray(SVfile=ftest)
    if neg_names is None:
        neg_names = [n.replace(':', ':not-') for n in x.dtype.names]
    y = tb.tabarray(array=(1 - x.extract()), names=neg_names)
    names = list(x.dtype.names) + list(y.dtype.names)
    positive_label_name = x.dtype.names[-1]
    negative_label_name = y.dtype.names[-1]
    x = x.colstack(y)
    recs = [x[negative_label_name], x[positive_label_name]]
    x = x.extract()
    d = dict(zip(names, [x[:, i] for i in range(len(names))]))
    print 'reading rules from', ftrain
    rule_descr = [line.strip().split()[0]
                  for line in open(ftrain, 'rU').read().strip().split('\n')]
    print 'extracting these rules from', ftest
    out = []
    for descr in rule_descr:
        rule = [clause for clause in descr.strip('{}').split(',')]
        bv = np.cast[str](np.cast[int](np.array([(d[name] == 1)
                                                 for name in rule]).all(axis=0)))
        out.append('%s %s' % (descr, ' '.join(bv)))
    print 'writing', fout
    f = open(fout, 'w')
    f.write('\n'.join(out) + '\n')
    f.close()
    print 'writing', flabel
    labels = [line.split()[0]
              for line in open(ftrain_label, 'rU').read().strip().split('\n')]
    f = open(flabel, 'w')
    f.write('\n'.join(['{%s} %s' % (l, ' '.join(np.cast[str](np.cast[int](r))))
                       for (l, r) in zip(labels, recs)]) + '\n')
    f.close()
def test_bionumbers():
    X = tb.tabarray(SVfile='tests/bionumbers.txt')
    fname = TestDataDir + 'bionumbers.txt'
    X.saveSV(fname, quotechar="'")
    Y = tb.tabarray(SVfile=TestDataDir + 'bionumbers.txt', quotechar="'")
    names = ('ID', 'Property', 'Organism', 'Value', 'Units', 'Range',
             'NumericalValue', 'Version')
    assert_bio(X.dtype.names == names and len(X) == 4615 and eq(X, Y), fname)
def test_missingvals4(self):
    fname = TestDataDir + 'missingvals4.csv'
    F = open(fname, 'w')
    F.write('Name,Age,Gender\nDaniel,12,M\nElaine,,F\nFarish,46,')
    F.close()
    X = tb.tabarray(SVfile=fname)
    X2 = tb.tabarray(records=[('Daniel', 12, 'M'),
                              ('Elaine', np.nan, 'F'),
                              ('Farish', 46, '')],
                     names=['Name', 'Age', 'Gender'])
    self.assert_io(eq(X, X2), fname)
def makemetadata(code, datadir, outfile1, outfile2,
                 depends_on=(resource_root + 'ProcessedManifest_2_HandAdditions.tsv',
                             resource_root + 'Keywords.txt')):
    Z = {}
    keyword_file = depends_on[1]
    Y = tb.tabarray(SVfile=keyword_file)[['Code', 'Keywords']]
    y = Y[Y['Code'] == code]
    Z['keywords'] = [x.strip() for x in str(y['Keywords'][0]).split(',')]
    dirl = np.array(listdir(datadir))
    pr = lambda x: x.split('!')[-1][:-4]
    p = re.compile('\([^\)]*\)')
    tps = [l for l in dirl if l.endswith('.txt.txt')]
    if tps:
        textpath = datadir + tps[0]
        [SD, things] = ParseTexts(textpath, code)
        FNs = [p.sub('', things[pr(y).lower()]).replace(' ,', ',').replace(',,', ',')
               if pr(y).lower() in things.keys() else '' for y in dirl]
        FNs = [z.split('=')[1] if '=' in z and not ' =' in z else z for z in FNs]
    else:
        SD = ''
        FNs = len(dirl) * ['']
    Z['description'] = SD
    cfs = [l for l in dirl if l.endswith('.contacts.txt')]
    if cfs:
        contactfile = datadir + cfs[0]
        ctext = open(contactfile, 'rU').read().strip()
        if '<html>' in ctext.lower():
            clines = ctext.split('\n')
            fb = [i for i in range(len(clines)) if clines[i].strip() == ''][0]
            ctext = '\n'.join(clines[fb + 1:])
        ctext = ctext.strip(' *\n').replace('\n\n', '\n')
    else:
        ctext = ''
    Z['contactInfo'] = ctext
    f = open(outfile1, 'w')
    pickle.dump(Z, f)
    f.close()
    Y = tb.tabarray(SVfile=depends_on[0])
    Y.sort(order=['File'])
    dirlp = np.array([pr(y) for y in dirl])
    [A, B] = tb.fast.equalspairs(dirlp, Y['File'])
    if (B > A).any():
        print 'adding hand-made content to', dirlp[B > A]
        for k in (B > A).nonzero()[0]:
            FNs[k] = Y['FileName'][A[k]]
    D = tb.tabarray(columns=[dirl, FNs], names=['Path', 'FileName'])
    D.saveSV(outfile2, metadata=True)
def TestReplace2():
    V1 = ['North', 'South', 'East', 'West']
    V2 = ['Service', 'Manufacturing', 'Education', 'Healthcare']
    Recs = [(a, b, np.random.rand() * 100, np.random.randint(100000))
            for a in V1 for b in V2]
    X = tb.tabarray(records=Recs, names=['Region', 'Sector', 'Amount', 'Population'])
    X2 = tb.tabarray(records=Recs, names=['Region', 'Sector', 'Amount', 'Population'])
    X.replace('S', 'M')
    assert (X == X2).all()
def get_subclass_pages():
    X = tb.tabarray(SVfile='catlevels.tsv')
    recs = []
    p = re.compile('Sub\d')
    f = lambda x: p.match(dict(x.attrs).get('class', ''))
    for x in X[:]:
        subrecs = []
        cat = x['CLASS']
        fixed_cat = fix_cat(cat)
        title = x['CLASS TITLE']
        os.system('wget http://www.uspto.gov/web/patents/classification/uspc'
                  + fixed_cat + '/sched' + fixed_cat + '.htm -O ' + cat + '.html')
        Soup = BeautifulSoup.BeautifulSoup(open(cat + '.html'))
        Crac = Soup.find(True, 'CracHeader')
        For = Soup.find(True, 'ForHeader')
        Dig = Soup.find(True, 'DigHeader')
        if Crac:
            end = Crac
        elif For:
            end = For
        elif Dig:
            end = Dig
        else:
            end = None
        if end:
            T = end.findAllPrevious('tr', valign='top')[:]
        else:
            T = Soup.findAll('tr', valign='top')[:]
            T.reverse()
        for (i, t) in enumerate(T):
            try:
                subclass = str(Contents(t.find(f)).replace(' ', '').strip())
            except:
                pass
            else:
                try:
                    subtitle = Contents(t.find(True, "SubTtl")).strip()
                except:
                    pass
                else:
                    try:
                        indent = int(dict(t.find(True, "SubTtl").find("img").attrs)['src']
                                     .split('/')[-1].split('_')[0])
                    except AttributeError:
                        indent = 0
                    # print (cat, title, subclass, subtitle, indent)
                    subrecs.append((cat, title, subclass, subtitle, indent))
        subrecs.reverse()
        recs.extend(subrecs)
    Y = tb.tabarray(records=recs,
                    names=['Class', 'Title', 'Subclass', 'Subtitle', 'Indent'],
                    formats=['str', 'str', 'str', 'str', 'int'])
    Y.saveSV('classifications.tsv', metadata=True)
def TestPivot3():
    V1 = ['NorthAmerica', 'SouthAmerica', 'Europe', 'Asia', 'Australia',
          'Africa', 'Antarctica']
    V1.sort()
    V2 = ['House', 'Car', 'Boat', 'Savings', 'Food', 'Entertainment', 'Taxes']
    V2.sort()
    Recs = [(a, b, 100 * np.random.rand()) for a in V1 for b in V2]
    X = tb.tabarray(records=Recs, names=['Region', 'Source', 'Amount'])
    Y = X.pivot('Region', 'Source')
    Z = utils.uniqify(X['Source'])
    Z.sort()
    Cols = [[y['Amount'] for y in X if y['Source'] == b] for b in Z]
    W = tb.tabarray(columns=[V1] + Cols,
                    names=['Region'] + [b + '_Amount' for b in Z])
    assert (W == Y).all()
def TestPivot4():
    V1 = ['NorthAmerica', 'SouthAmerica', 'Europe', 'Asia', 'Australia',
          'Africa', 'Antarctica']
    V1.sort()
    V2 = ['House', 'Car', 'Boat', 'Savings', 'Food', 'Entertainment', 'Taxes']
    V2.sort()
    Recs = [(a, b, 100 * np.random.rand()) for a in V1 for b in V2]
    X = tb.tabarray(records=Recs[:-1], names=['Region', 'Source', 'Amount'])
    Y = X.pivot('Region', 'Source',
                NullVals=dict([(o, -999) for o in X.dtype.names]))
    X2 = tb.tabarray(records=Recs, names=['Region', 'Source', 'Amount'])
    # the reference pivot is built from the complete record set, then the cell
    # corresponding to the dropped record is set to the null value
    Y2 = X2.pivot('Region', 'Source')
    Y2[V2[-1] + '_Amount'][-1] = -999
    assert (Y == Y2).all()
def test_load_save_TSV_infer2(self):
    fname = TestDataDir + 'test2.tsv'
    self.X.saveSV(fname, printmetadict=False, metadata=['coloring', 'names'])
    X2 = tb.tabarray(SVfile=fname, metametadata={'coloring': 0, 'names': 1})
    self.assert_io(eq(self.X, X2), fname)
def get_results(mean, std, ext_hash, splitfilename, outfile):
    conn = pm.Connection(document_class=SON)
    db = conn['thor']
    fcol = db['features.files']
    split_fs = gridfs.GridFS(db, 'split_performance')
    fh = split_fs.get_version(splitfilename)
    r = cPickle.loads(fh.read())
    r = r['split_result']['cls_data']
    weights = r['coef']
    bias = r['intercept']
    L = fcol.find({'__hash__': ext_hash},
                  fields=['image.clip_num', 'image.Frame', 'feature',
                          'image.bounding_box'])
    recs = []
    names = ['clip_num', 'frame', 'x1', 'x2', 'x3', 'x4',
             'y1', 'y2', 'y3', 'y4'] + labels
    for l in L:
        cn = str(l['image']['clip_num'])
        fr = l['image']['Frame']
        print(l['_id'], cn, fr)
        bx = l['image']['bounding_box']['xfields']
        by = l['image']['bounding_box']['yfields']
        feat = l['feature']
        feat = (feat - mean) / std
        m = sp.dot(feat, weights) + bias
        rec = (cn, fr) + tuple(bx) + tuple(by) + tuple(m)
        recs.append(rec)
        if len(recs) == 10000:
            X = tb.tabarray(records=recs, names=names)
            tb.io.appendSV(outfile, X, metadata=True)
            recs = []
def setUp(self):
    self.D = tb.tabarray(
        array=[(2, 'a', 2, 'cc', 3.0), (2, 'b', 5, 'dcc', 2.0),
               (7, 'e', 2, 'cjc', 8.0), (2, 'e', 2, 'ccj', 3.0)],
        names=['a', 'c', 'b', 'd', 'e'],
        formats='i4,|S1,i4,|S3,f8',
        coloring={'moo': ['a', 'b'], 'boo': ['a', 'd', 'e']})
    self.Root = 'basic'
def setUp(self):
    names = ['name', 'ID', 'color', 'size', 'June', 'July']
    data = [('bork', 1212, 'blue', 'big', 45.32, 46.07),
            ('mork', 4660, 'green', 'small', 32.18, 32.75),
            ('stork', 2219, 'red', 'huge', 60.93, 61.82),
            ('lork', 4488, 'purple', 'tiny', 0.44, 0.38)]
    self.x = tb.tabarray(records=data, names=names)
def test_toload_redundant_tsv(self):
    toload = ['a', 'boo']
    fname = TestDataDir + self.Root + '6.tsv'
    self.D.saveSV(fname, metadata=['names', 'formats', 'types', 'coloring', 'dialect'])
    D = tb.tabarray(SVfile=fname, usecols=toload)
    assert set(D.dtype.names) == set(D.coloring['boo'])
    self.assert_io(eq(self.D[toload], D[toload]), fname)
def test_strictjoin3(self):
    X = self.X
    keycols = self.keycols
    others = self.others
    X1 = X[:(3 * len(X) / 4)][keycols + others[0]]
    X2 = X[(len(X) / 4):][keycols + others[1]]
    Y = spreadsheet.strictjoin([X1, X2], self.keycols)
    Y.sort(order=keycols)
    nvf = utils.DEFAULT_NULLVALUEFORMAT
    nvf1 = nvf(X[others[1][0]].dtype.descr[0][1])
    nvf2 = nvf(X[others[1][1]].dtype.descr[0][1])
    nvf3 = nvf(X[others[0][0]].dtype.descr[0][1])
    nvf4 = nvf(X[others[0][1]].dtype.descr[0][1])
    Recs = ([(a, b, c, d, nvf1, nvf2)
             for (a, b, c, d, e, f) in X[:(len(X) / 4)]] +
            [(a, b, c, d, e, f)
             for (a, b, c, d, e, f) in X[(len(X) / 4):(3 * len(X) / 4)]] +
            [(a, b, nvf3, nvf4, e, f)
             for (a, b, c, d, e, f) in X[(3 * len(X) / 4):]])
    Z = tb.tabarray(records=Recs, names=X.dtype.names)
    Z.sort(order=self.keycols)
    self.assert_((Y == Z).all())
def test_load_save_CSV_infer(self):
    fname = TestDataDir + 'test.csv'
    self.X.saveSV(fname)
    X2 = tb.tabarray(SVfile=fname)
    # normal scenario: names, no comments
    Z = self.X.copy()
    Z.coloring = {}
    self.assert_io(eq(Z, X2), fname)
def applysplitter(manifest, splitdir):
    MakeDir(splitdir)
    M = tb.tabarray(SVfile=manifest)
    vals = tb.uniqify(M['Prefix'])
    for v in vals:
        Mv = M[M['Prefix'] == v]
        Mv.saveSV(splitdir + 'Manifest_' + pathprocessor([v]) + '.tsv',
                  metadata=True)
def test_load_save_TSV_infer(self):
    fname = TestDataDir + 'test.tsv'
    self.X.saveSV(fname)
    X2 = tb.tabarray(SVfile=fname)
    Z = self.X.copy()
    Z.coloring = {}
    self.assert_io(eq(Z, X2), fname)
def get_meta(selected_basic_objs=SELECTED_BASIC_OBJS):
    """Mix the objectome 64 basic-level set and the chair subordinate level set"""
    assert len(np.unique(selected_basic_objs)) == 30
    meta_chairs = pk.load(open('meta_objt_chairs_subord_v3.pkl'))
    meta_basic = pk.load(open('meta_objt_full_64objs.pkl'))
    si = [i for i, e in enumerate(meta_basic)
          if e['obj'] in selected_basic_objs]
    assert len(si) == 30 * 1000
    cnames = list(meta_chairs.dtype.names)
    assert list(meta_basic.dtype.names) == cnames
    cnames.remove('internal_canonical')
    cnames.remove('texture')        # contains None
    cnames.remove('texture_mode')   # contains None
    meta = tb.tabarray(
        columns=[np.concatenate([meta_basic[e][si], meta_chairs[e]])
                 for e in cnames],
        names=cnames)
    assert len(meta) == 30 * 1000 * 2
    assert len(np.unique(meta['obj'])) == 60   # 30 non-chairs + 30 chairs
    return meta, meta_basic, meta_chairs
def check_stamps(metadatafile, imagedir, train_frames, outdir):
    os.mkdir(outdir)
    metadata = tb.tabarray(SVfile=metadatafile)
    # get labels for training objects
    train_labels_inds = []
    for cn, fr in train_frames:
        inds = ((metadata['Frame'] == fr) &
                (metadata['clip_num'] == cn) &
                (metadata['ObjectType'] != 'DCR')).nonzero()[0]
        # ind = inds[t['object_number']]
        train_labels_inds.extend(inds)
    train_labels = metadata[train_labels_inds]
    # get stamps for training objects
    train_points = []
    train_points_labels = []
    sizes = []
    num_train = 0
    for label in train_labels:
        lbl = label['clip_num'] + '_' + str(label['Frame']) + '.jpg'
        print(label)
        framefile = os.path.join(imagedir, lbl)
        im = get_image(framefile)
        box = bbox.BoundingBox(xs=[label[xf] for xf in xfields],
                               ys=[label[yf] for yf in yfields])
        stamp = bbox.stamp(im, box, stamp_shape=(200, 200))[0]
        if stamp is not None:
            img = Image.fromarray(stamp)
            img.save(os.path.join(outdir, str(num_train) + '.jpg'))
            num_train += 1
def parser2():
    files_to_parse = [x for x in os.listdir('.') if x.endswith('_file.html')]
    for file in files_to_parse:
        print('parsing', file)
        # next step of getting data for each water system
        H20Systems = BeautifulSoup.BeautifulSoup(open(file))
        table = H20Systems.findAll('table')[3].findAll('table')[7]
        TR = table.findAll('tr', bgcolor='#F5F5F5')
        Links = [str(dict(tr.findAll('td')[1].findAll('a')[0].attrs)['href'])
                 for tr in TR]
        Names = [utils.Contents(tr.findAll('td')[1]) for tr in TR]
        # collapse non-breaking spaces (assumed; the original replace target was garbled)
        Number = [utils.Contents(tr.findAll('td')[2]).replace(u'\xa0', ' ')
                  for tr in TR]
        County = [utils.Contents(tr.findAll('td')[3]) for tr in TR]
        outname = file.split('_')[0] + '_file.tsv'
        tb.tabarray(columns=[Links, Names, Number, County],
                    names=['Links', 'Names', 'Number', 'County']).saveSV(outname, metadata=True)
def get_data(self):
    trial_records = []
    obj_oi = np.unique(self.meta["obj"])
    img_oi = np.unique(self.meta["id"])
    for subj in self.data:
        for r, i, ss in zip(subj["Response"], subj["ImgData"], subj["StimShown"]):
            if len(i) > 1:
                s = i["Sample"]
                t = i["Test"]
                s_id = s["id"]
                s_obj = s["obj"]
                t_obj = [t_["obj"] for t_ in t]
                d_obj = [t_ for t_ in t_obj if t_ != s_obj][0]
                resp = t_obj[r]
            else:
                # backwards compatibility with previous mturkutils
                s_id = i[0]["id"]
                s_obj = i[0]["obj"]
                t_obj = [strip_objectomeFileName(fn) for fn in ss[1:]]
                d_obj = [t_ for t_ in t_obj if t_ != s_obj][0]
                resp = strip_objectomeFileName(r)
            if (s_id in img_oi) & (d_obj in obj_oi):
                rec_curr = (s_obj,) + (d_obj,) + (resp,) + (s_id,) + \
                           (subj["WorkerID"],) + (subj["AssignmentID"],)
                trial_records.append(rec_curr)
    self.trials = tb.tabarray(records=trial_records,
                              names=self.trial_kw_names,
                              formats=self.trial_kw_formats)
    return
def create_metadata():
    # Metadata
    name = 'riskfactors'
    source = {
        "agency": {"shortName": "DHHS",
                   "name": "Department of Health and Human Services"},
        "subagency": {"shortName": "CDC",
                      "name": "Centers for Disease Control and Prevention"},
        "topic": {"name": "Health and Nutrition"},
        # "subtopic": {"name": "Release Quantity Data"},
        "program": {"shortName": "CHSI",
                    "name": "Community Health Status Indicators"},
        "dataset": {"shortName": "ohdc",
                    "name": "Community Health Status Indicators (CHSI) to Combat Obesity, Heart Disease and Cancer"}
    }
    Y = tb.tabarray(SVfile='DATAELEMENTDESCRIPTION.csv')
    metadata = {
        'title': 'Community Health Status Indicators (CHSI) to Combat Obesity, Heart Disease and Cancer',
        'description': "Community Health Status Indicators (CHSI) to combat obesity, heart disease, and cancer are major components of the Community Health Data Initiative. This dataset provides key health indicators for local communities and encourages dialogue about actions that can be taken to improve community health (e.g., obesity, heart disease, cancer). The CHSI report and dataset was designed not only for public health professionals but also for members of the community who are interested in the health of their community. The CHSI report contains over 200 measures for each of the 3,141 United States counties. Although CHSI presents indicators like deaths due to heart disease and cancer, it is imperative to understand that behavioral factors such as obesity, tobacco use, diet, physical activity, alcohol and drug use, sexual behavior and others substantially contribute to these deaths.",
        'keywords': ['Obesity', 'CHSI', 'health', 'data', 'community', 'indicators',
                     'interventions', 'performance', 'measurable', 'life expectancy',
                     'mortality', 'disease', 'prevalence', 'risk', 'factors',
                     'behaviors', 'socioeconomic', 'environments', 'access', 'cost',
                     'quality', 'warehouse', 'heart', 'cancer'],
        'uniqueIndexes': ['Location'],
        'sliceCols': [['Location']],
        'columnGroups': {
            'spaceColumns': ['Location'],
            'labelColumns': ['Location']
        },
        'columnDescriptions': dict([(y['COLUMN_NAME'], y['DESCRIPTION']) for y in Y]),
        'source': source
    }
    return metadata
def FindPtime(target, Simple=False):
    '''
    Returns the last time, according to runtime metadata, that a target was
    successfully created, if it is created data.
    '''
    metapath = metadatapath(target) + '/CreationRecord.csv'
    if PathExists(metapath):
        try:
            Data = tb.tabarray(SVfile=metapath, delimiter=',',
                               lineterminator='\n')
            if len(Data) > 0:
                Data.sort(order=['TimeStamp'])
                if any(Data['ExitType'] == 'Success'):
                    MostRecentSuccess = Data[Data['ExitType'] == 'Success']['TimeStamp'][-1]
                    MoreRecentFailures = Data[(Data['ExitType'] == 'Failure') &
                                              (Data['TimeStamp'] > MostRecentSuccess)]
                    if len(MoreRecentFailures) > 0:
                        LeastRecentFailure = MoreRecentFailures['TimeStamp'][0]
                    else:
                        LeastRecentFailure = numpy.inf
                    return Data[(Data['TimeStamp'] >= MostRecentSuccess) &
                                (Data['TimeStamp'] < LeastRecentFailure)]['TimeStamp'][-1]
                else:
                    return numpy.nan
            else:
                return numpy.nan
        except:
            return numpy.nan
    else:
        return numpy.nan
def create_exp_plan(self):
    """Define each trial's parameters"""
    df = pandas.read_csv('../data/snodgrass.csv', sep='\t')
    df['imgno'] = range(1, self.trials_per_hit + 1)
    df = pandas.concat([df for i in range(self.nsubj)])
    df['subjid'] = np.repeat(range(self.nsubj), self.trials_per_hit)
    df['order'] = np.hstack([np.random.permutation(self.trials_per_hit)
                             for i in range(self.nsubj)])
    # df['kind'] = np.repeat(['color', 'gray', 'silhouette'],
    #                        self.trials_per_hit * self.nsubj // 3)
    df['kind'] = np.repeat(['color'], self.trials_per_hit * self.nsubj)
    df['isi1'] = 500
    df['stim_dur'] = 100
    df['isi2'] = 500
    df['subj_resp'] = None
    df['acc'] = np.nan
    df['rt'] = np.nan
    df = df.sort_values(by=['subjid', 'order'])
    rec = df.to_records(index=False)
    exp_plan = tb.tabarray(array=rec, dtype=rec.dtype)
    if self.save:
        self.save_exp_plan(exp_plan)
    return exp_plan
def fetch(self, wnids, seed=None, num_per_synset=100, firstonly=False,
          path=os.getcwd(), username='******',
          accesskey='bd662acb4866553500f17babd5992810e0b5a439'):
    """Stores a random #num images for synsets specified by wnids from the
    latest release to the path specified.

    Since files are stored as tar files online, the entire synset must be
    downloaded to access random images. If 'all' is passed as the num
    argument, all images are stored. If the argument firstonly is set to
    true, then download times can be reduced by only extracting the first
    few images.

    This method overwrites previous fetches: files and metadata are deleted.
    """
    if not os.path.exists(path + '/'):
        os.makedirs(path + '/')
    wnids = list(wnids)
    random.seed(seed)
    kept_names = []
    kept_wnid_list = []
    if hasattr(self, '_meta'):
        files_to_remove = np.unique(self.meta['filename'])
        for file_to_remove in files_to_remove:
            try:
                print path + '/' + file_to_remove
                os.remove(path + '/' + file_to_remove)
            except OSError:
                print "metadata is stale, clear cache directory"
    for i, wnid in enumerate(wnids):
        synset_names = []
        # username and accesskey come from the function arguments
        url = 'http://www.image-net.org/download/synset?' + \
              'wnid=' + str(wnid) + \
              '&username=' + username + \
              '&accesskey=' + accesskey + \
              '&release=latest'
        print i
        url_file = urlopen(url)
        tar_file = tarfile.open(fileobj=url_file, mode='r|')
        if firstonly and not (num_per_synset == 'all'):
            keep_idx = xrange(num_per_synset)
            for i in keep_idx:
                tarinfo = tar_file.next()
                synset_names.append(tarinfo.name)
                tar_file.extract(tarinfo, path)
        else:
            for tarinfo in tar_file:
                synset_names.append(tarinfo.name)
                tar_file.extract(tarinfo, path)
        if num_per_synset == 'all':
            keep_idx = range(len(synset_names))
        else:
            keep_idx = sample(range(len(synset_names)), num_per_synset)
        files_to_remove = frozenset(synset_names) - \
                          frozenset([synset_names[idx] for idx in keep_idx])
        for file_to_remove in files_to_remove:
            os.remove(path + '/' + file_to_remove)
        kept_names.extend([synset_names[idx] for idx in keep_idx])
        kept_wnid_list.extend([wnid] * len(keep_idx))
    meta = tb.tabarray(records=zip(kept_names, kept_wnid_list),
                       names=['filename', 'wnid'])
    self._meta = meta
    self.path = path
    tb.io.savebinary('imagenet_meta.npz', self._meta)
def get_alpha_images():
    dataset_obj = objectome64s100alpha(internal_canonical=True)
    dataset_bg = objectome64s100bg(internal_canonical=True)
    img_res, img_size = 1024, 256
    obj_imgs = dataset_obj.get_images({'dtype': 'uint8',
                                       'size': (img_res, img_res, 4),
                                       'normalize': False,
                                       'mode': 'RGBA'}, get_models=True)
    bg_imgs = dataset_bg.get_images({'dtype': 'uint8',
                                     'size': (img_res, img_res),
                                     'normalize': False,
                                     'mode': 'L'}, get_models=True)
    IMGS = []
    alphaval = []
    for i in xrange(obj_imgs.shape[0]):
        background = Image.fromarray(bg_imgs[i])
        foreground = obj_imgs[i]
        tmp_alpha = np.random.uniform(0.25, 1)
        foreground[:, :, -1] = foreground[:, :, -1] * tmp_alpha
        alphaval.append(tmp_alpha)
        foreground = Image.fromarray(foreground)
        background.paste(foreground, (0, 0), foreground)
        IMGS.append(np.asarray(background))
    meta = dataset_obj.meta
    names = meta.dtype.names + ('obj_alpha',)
    formats = zip(*meta.dtype.descr)[1] + ('float',)
    META = tb.tabarray(records=[tuple(meta[i]) + (alphaval[i],)
                                for i in range(len(meta))],
                       names=names, formats=formats)
    return IMGS, META
def get_monkeyturk_data(dataset="objectome24"):
    if dataset == "objectome24":
        meta_path = "/mindhive/dicarlolab/u/rishir/stimuli/objectome24s100/metadata.pkl"
        data_path = "/mindhive/dicarlolab/u/rishir/monkeyturk/allData.mat"
        meta = pk.load(open(meta_path, "r"))
    datmat = io.loadmat(data_path)
    uobjs = obj.models_combined24
    col_data_seg = {}
    trial_records = []
    subjs = ["Manto", "Zico", "Picasso", "Nano", "Magneto"]
    for sub in subjs:
        x = datmat["allData"][sub][0, 0]
        for xi in range(x.shape[0]):
            s_obj = uobjs[x[xi, 0]]
            d_obj = uobjs[x[xi, 2]]
            resp = uobjs[x[xi, 3]]
            s_id = meta[x[xi, 4] - 1]["id"]
            workid = sub
            assnid = "MonkeyTurk"
            rec_curr = (s_obj,) + (d_obj,) + (resp,) + (s_id,) + (workid,) + (assnid,)
            trial_records.append(rec_curr)
    col_data_seg["all"] = tb.tabarray(records=trial_records,
                                      names=KW_NAMES, formats=KW_FORMATS)
    for sub in subjs:
        t = col_data_seg["all"]["WorkerID"] == sub
        col_data_seg[sub] = col_data_seg["all"][t]
    return col_data_seg
def parse_lowest_level():
    files_to_parse = utils.ListUnion([[os.path.join(x, y) for y in os.listdir(x)]
                                      for x in os.listdir('.') if x.endswith('DETAILS')])
    kvpairs = []
    for file in files_to_parse:
        print(file)
        Soup = BeautifulSoup.BeautifulSoup(open(file))
        bolds = Soup.findAll('b')
        bolds = [b for b in bolds if utils.Contents(b).endswith(':')]
        newkvpairs = [(utils.Contents(b).strip(': '),
                       utils.Contents(b.findNext()).strip()) for b in bolds][:-1]
        if len(bolds) > 0:
            newkvpairs.append((utils.Contents(bolds[-1]).strip(': '),
                               ''.join([utils.Contents(x) if utils.Contents(x) != ''
                                        else '\n'
                                        for x in bolds[-1].findNext().contents])))
        kvpairs.append(newkvpairs)
    tb.tabarray(kvpairs=kvpairs).saveSV('final_results.tsv', metadata=True)
def download_images_by_synset(synsets, seed=None, num_per_synset='all',
                              firstonly=False, path=None,
                              imagenet_username='******',
                              accesskey='bd662acb4866553500f17babd5992810e0b5a439'):
    """Stores a random #num images for synsets specified by synsets from the
    latest release to the path specified.

    Since files are stored as tar files online, the entire synset must be
    downloaded to access random images. If 'all' is passed as the num
    argument, all images are stored. If the argument firstonly is set to
    true, then download times can be reduced by only extracting the first
    few images.

    Returns a meta tabarray object containing wnid and filename for each
    downloaded image.
    """
    if path is None:
        path = os.getcwd()
    if not os.path.exists(path):
        os.makedirs(path)
    synsets = list(synsets)
    random.seed(seed)
    kept_names = []
    kept_synset_list = []
    for i, synset in enumerate(synsets):
        synset_names = []
        # imagenet_username and accesskey come from the function arguments
        url = 'http://www.image-net.org/download/synset?' + \
              'wnid=' + str(synset) + \
              '&username=' + imagenet_username + \
              '&accesskey=' + accesskey + \
              '&release=latest'
        print i
        print url
        url_file = urlopen(url)
        tar_file = tarfile.open(fileobj=url_file, mode='r|')
        if firstonly and not (num_per_synset == 'all'):
            keep_idx = xrange(num_per_synset)
            for tarinfo in tar_file:
                synset_names.append(tarinfo.name)
                tar_file.extract(tarinfo, path)
        else:
            for tarinfo in tar_file:
                synset_names.append(tarinfo.name)
                tar_file.extract(tarinfo, path)
        if num_per_synset == 'all':
            keep_idx = range(len(synset_names))
        else:
            keep_idx = sample(range(len(synset_names)), num_per_synset)
        files_to_remove = frozenset(synset_names) - frozenset(
            [synset_names[idx] for idx in keep_idx])
        for file_to_remove in files_to_remove:
            os.remove(path + '/' + file_to_remove)
        kept_names.extend([synset_names[idx] for idx in keep_idx])
        kept_synset_list.extend([synset] * len(keep_idx))
    meta = tb.tabarray(records=zip(kept_names, kept_synset_list),
                       names=['filename', 'synset'])
    return meta
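# Usage sketch for download_images_by_synset (illustrative only; the wnid, path, and
# credentials below are hypothetical placeholders, not values from the original code):
def _example_synset_download():
    meta = download_images_by_synset(['n02084071'], num_per_synset=10,
                                     path='./imagenet_sample',
                                     imagenet_username='my_user',
                                     accesskey='my_access_key')
    # meta is a tabarray with one row per downloaded image
    return meta[['filename', 'synset']]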
def get(self):
    meta = tb.tabarray(SVfile=os.path.join(RESULTS_ROOT, 'meta_with_margin_test.tsv'))
    N = 3
    days = 4
    text = ''
    NP = 200
    NN = 200
    mdp = meta[:NP]
    mdp = mdp[np.random.RandomState(None).permutation(NP)]['filename']
    mdn = meta[-NN:]
    mdn = mdn[np.random.RandomState(None).permutation(NN)]['filename']
    for d in range(days):
        text += '<div class="entry" id="day_header_%d"><div class="entryheader"><h2 class="entrytitle">Day %d</h2>' % (d, d + 1)
        text += '<div class="date">posted by <a href="/vistats_blog">Vistats</a> on 2013.06.19</div></div>'
        ctext = "So, here are my Chanel pics of the day :)"
        text += '<p class="chanel_header" id="chanel_header_day_%d">%s</p><br/>' % (d, ctext)
        chaneltext = '<div class="img_div" id="chanel_img_div_day_%d">' % d + \
                     ''.join(['<div class="show_img" id="chanel_img_day_%d_img_%d"><img src="%s/%s"></div>' % (d, _i, IMG_ROOT, x.split('/')[-1])
                              for _i, x in enumerate(mdp[d * N:(d + 1) * N])]) + '</div>'
        text += chaneltext
        notchaneltext = '<div class="img_div" id="not_chanel_img_div_day_%d">' % d + \
                        ''.join(['<div class="show_img" id="not_chanel_img_day_%d_img_%d"><img src="%s/%s"></div>' % (d, _i, IMG_ROOT, x.split('/')[-1])
                                 for _i, x in enumerate(mdn[d * N:(d + 1) * N])]) + '</div>'
        nctext = "<br/>Hey, and of course I also have a life <b>outside</b> of Chanel :)"
        text += '<p class="not_chanel_header" id="not_chanel_header_day_%d">%s</p><br/>' % (d, nctext) + notchaneltext
        text += '</div>'
    html = HTML_TEMPLATE % text
    self.write(html)
    self.finish()
def columnwider(narrow_tab):
    """Widen the columns of a table.

    Takes an ordinary table and returns a "padded" copy. Because of
    limitations in the tabular class, columns cannot grow dynamically, so a
    row of deliberately over-wide fields has to be appended ahead of time.
    """
    # take the first row to determine the column types
    first_row = narrow_tab[:1]
    empty_tuple = ()
    for i in first_row.dtype.names:
        if (type(first_row[i][0]).__name__) == 'string_':
            empty_tuple += (column_strut,)
        else:
            empty_tuple += ('',)
    # build a fake row of maximally wide values
    wide_row = tb.tabarray(records=(empty_tuple,),
                           names=list(first_row.dtype.names))
    # stack it onto the input table so the string column widths grow,
    # then drop the now-unnecessary wide row
    wide_tab = narrow_tab.rowstack([wide_row])
    wide_tab = wide_tab[:-1]
    return wide_tab
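# Usage sketch for columnwider (illustrative; assumes column_strut is the module-level
# padding string that the function relies on):
def _example_columnwider():
    narrow = tb.tabarray(records=[('a', 1), ('bb', 2)], names=['label', 'value'])
    wide = columnwider(narrow)
    # string columns in `wide` now have dtype widths of at least len(column_strut),
    # so later rowstack calls with longer strings will not be truncated
    return wide.dtype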
def HiAssocWords(page, depends_on=root_wf + 'WordFrequencies.csv'):
    CFreq = PageWordFreqs(page)
    if CFreq is not None:
        WFreq = tb.tabarray(SVfile=depends_on, verbosity=0)
        WF = WFreq[tb.fast.isin(WFreq['Word'], CFreq['Word'])]
        CC = CFreq.join(WF, keycols='Word', Names=['InPage', 'Overall'])
        N = float(CFreq['Frequency'].sum())
        DD = (1 / N) * CC['Frequency_InPage'] - 10 ** (-9) * CC['Frequency_Overall']
        s = DD.argsort()[::-1]
        DD = DD[s]
        CC = CC[s]
        CC = CC.addcols(DD, names='FrequencyDelta')
        return CC[['Word', 'Frequency_InPage', 'FrequencyDelta']]
    else:
        return None
def getcategorydata(code,
                    depends_on=os.path.join(DATA_ROOT, 'BLS_Hierarchy', 'Manifest_1.tsv')):
    manifest = depends_on
    X = tb.tabarray(SVfile=manifest)
    Codes = np.array([x.split('/')[-2] for x in X['URL']])
    x = X[Codes == code][0]
    topic = str(x['Level1'])
    subtopic = str(x['Level2'])
    xx = str(x['Level3'])
    if len(xx.split(':')) > 1 and '-' in xx.split(':')[1]:
        Dataset = xx.split(':')[0].strip()
        y = ':'.join(xx.split(':')[1:]).strip('() ')
        ProgramName = y.split('-')[0].strip()
        ProgramAbbr = y.split('-')[1]
    elif xx.strip().endswith(')'):
        Dataset = xx[:xx.find('(')].strip()
        ProgramAbbr = xx[xx.find('('):].strip(' ()')
        if not ProgramAbbr.isupper():
            ProgramAbbr = ''
        ProgramName = ''
    else:
        Dataset = xx
        ProgramName = ''
        ProgramAbbr = ''
    if code == 'jt':
        ProgramName = 'JOLTS'
    return {'Topic': topic, 'Subtopic': subtopic, 'Dataset': Dataset,
            'ProgramName': ProgramName, 'ProgramAbbr': ProgramAbbr,
            'DatasetCode': code}
def LastTimeChanged(path):
    '''
    Returns the last time, according to runtime metadata, that a file (at
    "path") was actually modified (e.g. not simply overwritten, but actually
    modified).
    '''
    actualmodtime = os.path.getmtime(path)
    if actualmodtime == FindPtime(path):
        # metapath was undefined in the original; assumed to mirror the
        # CreationRecord.csv location used by FindPtime
        metapath = metadatapath(path) + '/CreationRecord.csv'
        try:
            Data = tb.tabarray(SVfile=metapath, delimiter=',',
                               lineterminator='\n')
            if len(Data) > 0:
                Data.sort(order=['TimeStamp'])
                Diffs = Data['Diff'].nonzero()[0]
                if len(Diffs) > 0:
                    return Data['TimeStamp'][Diffs[-1]]
                else:
                    return actualmodtime
            else:
                return actualmodtime
        except:
            return actualmodtime
    else:
        return actualmodtime
def get_meta(selected_basic_objs=SELECTED_BASIC_OBJS, meta_cars=META_CARS,
             meta_tanks=META_TANKS, meta_basic=META_BASIC):
    """Mix the objectome 64 basic-level set and the car/tank subordinate level set"""
    assert len(np.unique(selected_basic_objs)) == 22
    si = [i for i, e in enumerate(meta_basic)
          if e['obj'] in selected_basic_objs]
    assert len(si) == 22 * 1000
    meta = meta_basic[si]
    for meta_subord in [meta_cars, meta_tanks]:
        cnames = list(meta_subord.dtype.names)
        assert list(meta_basic.dtype.names) == cnames
        cnames.remove('internal_canonical')
        cnames.remove('texture')        # contains None
        cnames.remove('texture_mode')   # contains None
        meta = tb.tabarray(
            columns=[np.concatenate([meta[e], meta_subord[e]]) for e in cnames],
            names=cnames)
    assert len(meta) == (22 + 30 + 30) * 1000   # 22 basic + 30 cars + 30 tanks
    assert len(np.unique(meta['obj'])) == 22 + 30 + 30
    return meta, meta_basic, meta_cars, meta_tanks
def test_load_save_TSV_skiprows(self):
    fname = TestDataDir + 'test3.tsv'
    self.X.saveSV(fname, printmetadict=False, metadata=['coloring', 'names'])
    X2 = tb.tabarray(SVfile=fname, skiprows=1)
    Z = self.X.copy()
    Z.coloring = {}
    self.assert_io(eq(Z, X2), fname)
def test_load_save_TSV_nocomments(self):
    fname = TestDataDir + 'test4.tsv'
    self.X.saveSV(fname, printmetadict=False, metadata=['coloring', 'names'],
                  comments='')
    X2 = tb.tabarray(SVfile=fname, headerlines=2)
    Z = self.X.copy()
    Z.coloring = {}
    self.assert_io(eq(Z, X2), fname)
def test_usecols(self):
    fname = TestDataDir + 'usecols.tsv'
    self.x.saveSV(fname)
    x = tb.tabarray(SVfile=fname, usecols=[0, -1])
    names = [self.x.dtype.names[i] for i in [0, -1]]
    print x, x.dtype.names
    print self.x[names], names
    self.assert_io(eq(x, self.x[names]), fname)
def test_nohash(self):
    fname = TestDataDir + 'nohash.tsv'
    self.x.saveSV(fname, comments='')
    f = open(fname, 'r').read()
    g = open(fname, 'w')
    g.write('this is my file\n' + f)
    g.close()
    x = tb.tabarray(SVfile=fname, headerlines=2)
    self.assert_io(eq(x, self.x), fname)
def test_linefixer(self):
    fname = TestDataDir + 'linefixer.txt'
    X1 = self.X.copy()
    X1.coloring = {}
    X1.saveSV(fname, delimiter='@')
    X2 = tb.tabarray(SVfile=fname,
                     linefixer=(lambda x: x.replace('@', '\t')))
    self.assert_io(eq(X1, X2), fname)
def setUp(self):
    V1 = ['North', 'South', 'East', 'West']
    V2 = ['Service', 'Manufacturing', 'Education', 'Healthcare']
    Recs = [(a, b, np.random.rand() * 100, np.random.randint(100))
            for a in V1 for b in V2]
    self.X = tb.tabarray(records=Recs,
                         names=['Region', 'Sector', 'Amount', 'Population'],
                         coloring={'zoo': ['Region', 'Sector'],
                                   'york': ['Population', 'Sector', 'Region']})
def setUp(self):
    V1 = ['North', 'South', 'East', 'West']
    V2 = ['Service', 'Manufacturing', 'Education', 'Healthcare']
    Recs = [(a, b, np.random.rand() * 100, np.random.randint(100000),
             np.random.rand(), 'Yes' if np.random.rand() < .5 else 'No')
            for a in V1 for b in V2]
    self.X = tb.tabarray(records=Recs,
                         names=['Region', 'Sector', 'Amount', 'Population',
                                'Importance', 'Modernized'])
    self.keycols = ['Region', 'Sector']
    self.others = [['Amount', 'Population'], ['Importance', 'Modernized']]
def parse_imagenet_meta_data(self, results):
    """ Parses the meta data from tfrecords into a tabarray """
    meta_keys = ["labels"]
    meta = {}
    for k in meta_keys:
        if k not in results:
            raise KeyError('Attribute %s not loaded' % k)
        meta[k] = np.concatenate(results[k], axis=0)
    return tb.tabarray(columns=[list(meta[k]) for k in meta_keys],
                       names=meta_keys)
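# Usage sketch for parse_imagenet_meta_data (illustrative assumption): `results` maps
# each meta key to a list of per-batch numpy arrays, as collected from the tfrecords
# reader, and the method concatenates them into a single-column tabarray.
# results = {'labels': [np.array([0, 1]), np.array([2, 3])]}
# meta = self.parse_imagenet_meta_data(results)   # tabarray with one 'labels' column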