def savePopDB(dbsInfo):
    "Read the daily popularity JSONs and save per-dataset access tallies to disk"
    # read in the popularity jsons
    myF = fopen(os.path.join(baseDir, 'popDaily.txt'))
    myJInput = ''
    for line in myF:
        myJInput += line
    myJ = json.loads(myJInput)
    dates = myJ.keys()
    # make a dictionary of datasets on disk during the various time intervals:
    # divide the number of accesses for each dataset by the number of files in
    # the dataset, summing up accesses on all days during the interval.
    # TODO: Protect against data in the popularity jsons that is after the end date of the plot
    datasets = {}
    datasetDetails = {}
    startKeys = dStarts.keys()
    for dateStart in startKeys:
        datasets[dateStart] = {}
    for d in dates:
        dVal = getDatePop(d)
        records = myJ[d]['DATA']
        for record in records:
            dataset = record["COLLNAME"]
            if dataset == testDS:
                print("Dataset read %s %s" % (d, record["NACC"]))
            if dataset not in dbsInfo:
                continue
            if 'num_files' in dbsInfo[dataset]:
                nFiles = dbsInfo[dataset]['num_files']
            else:
                nFiles = dbsInfo[dataset]['nfiles']
            if dataset not in datasetDetails:
                datasetDetails[dataset] = {}
            datasetDetails[dataset][dVal] = datasetDetails[dataset].get(
                dVal, 0) + float(record["NACC"]) / float(nFiles)
            for dateStart in startKeys:
                if dVal >= dStarts[dateStart] and dVal <= dEnd:
                    datasets[dateStart][dataset] = datasets[dateStart].get(
                        dataset, 0) + float(record["NACC"]) / float(nFiles)
    # some printout
    for dateStart in startKeys:
        print(len(datasets[dateStart].keys()))
    print("popularity for %s" % testDS)
    for dateStart in startKeys:
        print("%s : %s" % (dateStart, datasets[dateStart].get(testDS, 0)))
    fp = fopen(os.path.join(outputDir, 'popDBDetails.data.gz'), 'wb')
    cPickle.dump(datasetDetails, fp)
    fp.close()

def readDBSInfo():
    dbsInfo = {}
    print("### use dbsInput %s" % dbsInput)
    with fopen(dbsInput) as istream:
        headers = None
        while True:
            line = istream.readline().replace('\n', '')
            if not line:
                break
            row = line.split(',')
            if not headers:
                headers = row
                continue
            rdict = dict(zip(headers, row))
            dataset = rdict.pop('dataset').replace('"', '')
            for key in rdict.keys():
                rdict[key] = float(rdict[key])
            if 'date' in headers:
                rdict['creation_date'] = datetime.datetime.fromtimestamp(
                    rdict['date'])
            elif 'creation_date' in headers:
                rdict['creation_date'] = datetime.datetime.fromtimestamp(
                    rdict['creation_date'])
            dbsInfo[dataset] = rdict
    return dbsInfo

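# Illustrative note (not from the source): readDBSInfo expects a header row
# followed by comma-separated rows with a quoted dataset name and numeric
# columns, e.g.
#   dataset,nfiles,nevents,size
#   "/A/B/GEN-SIM-RAW",3199,205484,309328826257
# and returns {'/A/B/GEN-SIM-RAW': {'nfiles': 3199.0, 'nevents': 205484.0,
# 'size': 309328826257.0}}, with 'creation_date' converted to a datetime
# whenever a date column is present.
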
def solve(f=None):
    s = f if f else fopen(16).readline().strip()
    b = bin(int(s, 16))[2:]
    b = b.zfill(len(s) * 4)
    p = Parser(b)
    v = p.parse()
    return v

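# Why the zfill above matters - a minimal standalone check (not from the
# source): bin() drops leading zeros, so a transmission starting with '0'
# would otherwise lose bits.
s = '0F'
b = bin(int(s, 16))[2:]   # '1111' - leading zeros are gone
b = b.zfill(len(s) * 4)   # '00001111' - 4 bits per hex digit restored
assert b == '00001111'
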
def run():
    f = fopen(18)
    l = f.readline()
    t = Tree(branches=loads(l))
    for l in f.readlines():
        s = Tree(branches=loads(l))
        t = t.combine_trees(s)
    print(t.magnitude())

def savePhedexMeans():
    "Save effectiveMeansDict to disk"
    effectiveMeansDict = recreate()
    key = intervalStartStrings['12'] + '_' + intervalEndString
    keysES = effectiveMeansDict.keys()
    for keyES in keysES:
        fname = os.path.join(
            outputDir, 'effectiveMeans' + keyES + '_' + key + '.data.gz')
        print("writing %s: %s" % (keyES, fname))
        fp = fopen(fname, 'wb')
        cPickle.dump(effectiveMeansDict[keyES], fp)
        fp.close()

def run():
    f = fopen(18)
    ll = f.readlines()
    m = 0
    for i in ll:
        for j in ll:
            if i != j:
                c = Tree(branches=loads(i))
                x = Tree(branches=loads(j))
                c = c.combine_trees(x)
                mm = c.magnitude()
                if mm > m:
                    m = mm
    print(m)

def read():
    f = fopen(13)
    # first block: dot coordinates, one "x,y" pair per line, ended by a blank line
    c = set()
    l = f.readline()
    while l != '\n':
        c.add(tuple(map(int, l.split(','))))
        l = f.readline()
    # second block: fold instructions like "fold along x=5"
    i = []
    l = f.readline()
    while l:
        a, v = l.split(' ')[2].split('=')
        i.append(tuple([a, int(v)]))
        l = f.readline()
    return (c, i)

def align_scanners():
    scanners = [Scanner(l) for l in fopen(19).read().split('\n\n')]
    # l holds the indices of scanners already aligned to scanner 0's frame
    l = set([0])
    scanners[0].position = (0, 0, 0)
    while len(l) < len(scanners):
        for i, s in enumerate(scanners):
            for j, c in enumerate(scanners):
                # only try to align an unaligned scanner against an aligned one
                if s == c or i not in l or j in l:
                    continue
                intersection = s.intersect(c)
                if intersection:
                    s.align(c, intersection)
                    l.add(j)
    return scanners

def collect_tweets(task, tags):
    '''
    Collect tweets for tag, indefinitely and store in csv files
    '''
    appKeys = kays.appKeys
    with fopen(task, newline='\n', encoding='utf-8') as f:
        keyIdx = 0
        tagIdx = 0
        # writer for csv
        writer = csv.writer(f)
        # save task to log
        writelog(task, tags)
        # collect tweets indefinitely by using all keys
        while True:
            print(time.ctime(), 'Collecting tweets...')
            # get the key
            key = appKeys[keyIdx]
            # create auth and api
            auth = tweepy.OAuthHandler(key['consumerAPIKey'],
                                       key['consumerAPISecretKey'])
            auth.set_access_token(key['accessToken'], key['accessTokenSecret'])
            api = tweepy.API(auth)
            # filter out retweets
            query = tags[tagIdx] + ' -filter:retweets'
            count = 0
            # collect tweets and save
            try:
                for tweet in tweepy.Cursor(api.search, q=query).items():
                    user = tweet.user
                    # escape text
                    row = map(esc, [tweet.text, tweet.id, user.name,
                                    user.screen_name, user.location,
                                    user.description, user.followers_count,
                                    user.friends_count, user.listed_count,
                                    user.statuses_count, user.favourites_count,
                                    user.verified, user.default_profile_image,
                                    user.default_profile, user.protected,
                                    user.created_at])
                    writer.writerow(row)
                    count = count + 1
            except Exception as e:
                # Wait for 10 mins and then start using next key
                print(time.ctime(), 'Got {} tweets'.format(count))
                # if keyIdx+1 == len(appKeys):
                tagIdx = (tagIdx + 1) % len(tags)
                keyIdx = (keyIdx + 1) % len(appKeys)
                time.sleep(10 * 60)

def run(p, op, cmp):
    l = [*fopen(24)]
    s = []
    for i in range(14):
        # per-digit constants from the repeating 18-instruction ALU block
        a = int(l[18 * i + 5].split()[-1])
        b = int(l[18 * i + 15].split()[-1])
        if a > 0:
            # "push" digit: remember its index and offset for a later pop
            s += [(i, b)]
            continue
        # "pop" digit: pair it with the matching pushed digit
        j, b = s.pop()
        p = op(p, abs((a + b) * 10**(13 - [i, j][cmp(a, -b)])))
    print(p)

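# The "[i, j][cmp(a, -b)]" above indexes a two-element list with a boolean to
# pick which of the two paired digit positions absorbs the correction; a
# standalone illustration (not from the source):
i, j = 3, 7
assert [i, j][False] == i   # False -> index 0 -> first element
assert [i, j][True] == j    # True  -> index 1 -> second element
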
def setup(fold):
    extra = [['D', 'D'], ['C', 'B'], ['B', 'A'], ['A', 'C']]
    f = list(
        zip(*map(lambda x: x.strip().replace('#', ''),
                 fopen(23).readlines()[2:4])))
    h = [None] * 11
    for i in range(len(h)):
        if i in rooms and f:
            e = []
            if fold:
                e = extra[0]
                extra = extra[1:]
            h[i] = [h[i]] + [f[0][0]] + e + [f[0][1]]
            f = f[1:]
    return h

def readPhedexMeans():
    "Initialize effectiveMeansDict from data on disk"
    # read in the stored information from disk
    key = intervalStartStrings['12'] + '_' + intervalEndString
    keysES = ["All", "AnaOps", "AllOps", "MinusOne"]
    effectiveMeansDict = {}
    for keyES in keysES:
        fname = os.path.join(
            outputDir, 'effectiveMeans' + keyES + '_' + key + '.data.gz')
        print("reading %s: %s" % (keyES, fname))
        fp = fopen(fname, 'rb')
        effectiveMeansDict[keyES] = cPickle.load(fp)
        fp.close()
    return effectiveMeansDict

def solve(m=True):
    s = Counter()
    for l in fopen(22).readlines():
        i, c = l.split(' ')
        c = parse(c, m)
        if not c:
            continue
        i = 1 if i == "on" else -1
        u = Counter()
        # inclusion-exclusion: cancel overlaps with every cuboid seen so far
        for e, ie in s.items():
            t = mxmn(c, e)
            if neg(t):
                u[t] -= ie
        # only "on" steps contribute volume directly
        if i > 0:
            u[c] += i
        s.update(u)
    print(sum(prod(map(vol, cube)) * v for cube, v in s.items()))

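# A 1-D toy version of the signed-counter bookkeeping above (illustrative,
# not from the source), using inclusive integer segments instead of cuboids:
from collections import Counter

def overlap(a, b):
    lo, hi = max(a[0], b[0]), min(a[1], b[1])
    return (lo, hi) if lo <= hi else None

segs = Counter()
for on, seg in [(1, (0, 9)), (1, (5, 14)), (-1, (3, 6))]:  # on, on, off
    upd = Counter()
    for e, ie in segs.items():
        t = overlap(seg, e)
        if t:
            upd[t] -= ie       # cancel double counting with earlier segments
    if on > 0:
        upd[seg] += 1          # only "on" steps add directly
    segs.update(upd)

print(sum((hi - lo + 1) * v for (lo, hi), v in segs.items()))  # prints 11
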
def solve(first=True):
    f = fopen(4)
    draw, _, *numbers = f.readlines()
    cards = [[]]
    card_index = 0
    for r in numbers:
        if r == '\n':
            cards.append([])
            card_index += 1
            continue
        cards[card_index].append(r.strip().split())
    board = Board(cards)
    for d in draw.split(','):
        winner = board.mark_cards(d)
        if winner and (first or board.is_last_winner()):
            print(winner.calc(int(d)))
            break

def plotPopularity(arr, keys, popSource, dataLoc, figNum, iformat='png'):
    xVals = numpy.arange(len(arr))
    yByCount = numpy.zeros(len(arr))
    yBySizes = {}
    knownTimes = {}
    knownSamples = {}
    for key in keys:
        yBySizes[key] = numpy.zeros(len(arr))
        sp = key.split()
        knownTimes[sp[0]] = 1
        knownSamples[sp[1]] = 1
    for i, a in enumerate(arr):
        yByCount[i] = a[0]
        for j, key in enumerate(keys):
            yBySizes[key][i] = a[j + 1]
    width = 0.25
    figNum = figNum + 1
    pylab.figure(figNum)
    pylab.bar(xVals, yByCount, width, color='r')
    pylab.xlabel('Number of accesses', fontsize=15)
    pylab.ylabel('Number of collections', fontsize=15)
    fname = os.path.join(outputDir, "plots/popNum_" + popSource + " " + dataLoc)
    if not os.path.isdir(os.path.join(outputDir, "plots")):
        os.makedirs(os.path.join(outputDir, "plots"))
    pylab.savefig(fname + '.' + iformat, format=iformat)
    pylab.ylim(0, numpy.amax(yByCount) * 1.1)
    ax = pylab.gca()
    ax.set_xticks(xVals + width / 2.)
    ax.set_xticklabels(xVals)
    figNum = figNum + 1
    cols = ['r', 'g', 'b']
    for sample in knownSamples:
        figNum = figNum + 1
        pylab.figure(figNum)
        nBars = 0
        tMaxes = numpy.zeros(len(knownTimes.keys()))
        plottedKeys = []
        for key in keys:
            sp = key.split()
            if sp[1] != sample:
                continue
            nMonths = sp[0]
            plottedKeys.append(key)
            print(yBySizes[key])
            print("Sum %s %s %s %s %s" % (key, sample, popSource, dataLoc,
                                          numpy.sum(yBySizes[key])))
            pylab.bar(xVals + nBars * width, yBySizes[key], width,
                      color=cols[nBars],
                      label=nMonths + " months, sum=" +
                      "{0:.1f}".format(numpy.sum(yBySizes[key])))
            tMaxes[nBars] = numpy.amax(yBySizes[key])
            nBars = nBars + 1
        pylab.xlabel('Number of accesses', fontsize=15)
        pylab.ylabel('Weighted total size', fontsize=15)
        pylab.ylim(0, numpy.amax(tMaxes) * 1.1)
        pylab.xlim(0, xVals[-1] + 1)
        ax = pylab.gca()
        ax.set_xticks(xVals + width * 1.5)
        xLabels = ["0 Old"]
        for i in range(len(xVals) - 2):
            xLabels.append(str(i))
        xLabels.append(str(len(xVals) - 2) + "+")
        ax.set_xticklabels(xLabels)
        # ax.set_xticklabels(xVals)
        pylab.legend(loc='best')
        pylab.title("Samples considered: " + sample + ", popData=" +
                    popSource + " data GID=" + dataLoc)
        fname = os.path.join(
            outputDir,
            "plots/popSize_" + sample + "_" + popSource + "_" + dataLoc)
        if not os.path.isdir(os.path.join(outputDir, "plots")):
            os.makedirs(os.path.join(outputDir, "plots"))
        pylab.savefig(fname + '.' + iformat, format=iformat)
        fname = os.path.join(
            outputDir,
            "data/popSize_" + sample + "_" + popSource + "_" + dataLoc + '.csv.gz')
        if not os.path.isdir(os.path.join(outputDir, "data")):
            os.makedirs(os.path.join(outputDir, "data"))
        fH = fopen(fname, 'w')
        # fH = fopen(os.path.join(outputDir, "popSize_"+sample+"_"+popSource+"_"+dataLoc+'.csv.gz'), 'w')
        fH.write('NAccesses')
        for key in plottedKeys:
            sp = key.split()
            fH.write(',' + sp[0] + ' months (PB)')
        fH.write('\n')
        for k in range(len(xVals)):
            fH.write(str(xLabels[k]))
            for key in plottedKeys:
                fH.write(',' + str(yBySizes[key][k]))
            fH.write('\n')
        fH.close()
    return figNum

def saveClassAds(dbsInfo):
    "Generate class ads information and save it on disk"
    datasets = {}
    datasetDetails = {}
    startKeys = dStarts.keys()
    for dateStart in startKeys:
        datasets[dateStart] = {}
    print("### use classAdsInput %s" % classAdsInput)
    for root, dirs, files in os.walk(classAdsInput, topdown=False):
        for idx, name in enumerate(sorted(files)):
            nNull = 0
            nNonNull = 0
            nNullEvts = 0
            fName = os.path.join(root, name)
            headers = []
            skip = False
            with fopen(fName, 'rb') as istream:
                for line in istream:
                    sp = line.split(',')
                    if not headers:
                        headers = sp
                        continue
                    # check if it is dataset-YYYYMMDD.csv file
                    check = ('sum_evts' in headers) or \
                            ('num_events' in headers) or \
                            ('nevents' in headers)
                    if not check:
                        print("Skip %s" % fName)
                        skip = True
                        break
                    if len(sp) < 6:
                        continue
                    if sp[0] == "null":
                        nNull += 1
                        continue
                    if sp[6] == "null":
                        nNullEvts += 1
                        continue
                    if sp[0] == "dataset":
                        continue
                    nNonNull += 1
                    try:
                        ts = long(sp[5])
                    except:
                        continue
                    if ts > 25180904520:  # its in milliseconds!
                        ts = long(ts / 1000)
                    # there are also bogus timestamps - some are easy to recover
                    while ts > 25180904520:
                        ts = long(ts / 1000)
                    try:
                        dVal = datetime.datetime.fromtimestamp(ts).date()  # getDate(sp[8])
                    except ValueError:
                        print("skipping bad timestamp %s %s" % (ts, line))
                        continue
                    dataset = sp[0]
                    if "/DQMIO" in dataset:
                        continue  # there are no events...
                    if dataset not in dbsInfo:
                        continue
                    if 'num_events' in dbsInfo[dataset]:
                        nEvts = dbsInfo[dataset]["num_events"]
                    elif 'nevents' in dbsInfo[dataset]:
                        nEvts = dbsInfo[dataset]["nevents"]
                    elif 'sum_evts' in dbsInfo[dataset]:
                        nEvts = dbsInfo[dataset]["sum_evts"]
                    if float(nEvts) < 1:
                        nEvts = 1.
                    if dataset not in datasetDetails:
                        datasetDetails[dataset] = {}
                    datasetDetails[dataset][dVal] = datasetDetails[dataset].get(
                        dVal, 0) + float(sp[6]) * 1000 / float(nEvts)
                    for dateStart in startKeys:
                        if dVal >= dStarts[dateStart] and dVal <= dEnd:
                            datasets[dateStart][dataset] = datasets[dateStart].get(
                                dataset, 0) + float(sp[6]) * 1000 / float(nEvts)
            if not skip:
                try:
                    print("%3d %s %s %s" % (
                        idx, name,
                        nNull / float(nNonNull + nNull + nNullEvts + 1e-5),
                        nNullEvts / float(nNonNull + nNull + nNullEvts + 1e-5)))
                except:
                    pass
    # make a dictionary of datasets on disk during the various time intervals:
    # divide the number of accesses for each dataset by the number of files in
    # the dataset, summing up accesses on all days during the interval.
    # TODO: Protect against data in the popularity jsons that is after the end date of the plot
    fp = fopen(os.path.join(outputDir, 'classads.data.gz'), 'wb')
    cPickle.dump(datasets, fp)
    fp.close()
    fp = fopen(os.path.join(outputDir, 'classadsDetails.data.gz'), 'wb')
    cPickle.dump(datasetDetails, fp)
    fp.close()

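# Quick standalone check of the timestamp normalization above (not from the
# source): values beyond the cutoff are treated as milliseconds (or worse)
# and divided down until they look like seconds.
ts = 1501545600000            # a seconds timestamp mistakenly given in ms
while ts > 25180904520:
    ts = ts // 1000
assert ts == 1501545600
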
def readClassAds():
    "Read class ads data and return back datasets dict"
    fp = fopen(os.path.join(outputDir, 'classads.data.gz'), 'rb')
    datasets = cPickle.load(fp)
    fp.close()
    return datasets

def readPopDB():
    "Read PopDB data and return back datasets dict"
    fp = fopen(os.path.join(outputDir, 'popDBDetails.data.gz'), 'rb')
    datasets = cPickle.load(fp)
    fp.close()
    return datasets

default_pattern = [
    'abcefg', 'cf', 'acdeg', 'acdfg', 'bdcf', 'abdfg', 'abdefg', 'acf',
    'abcdefg', 'abcdfg'
]


def pattern_decoder(p):
    return Counter(''.join(p))


def translate(s, decoder):
    return tuple(sorted([decoder[x] for x in s]))


def calc(i, o):
    decoder = pattern_decoder(i)
    return int(''.join(map(str, [t[translate(x, decoder)] for x in o])))


def get_output(line):
    return calc(*[x.strip().split(' ') for x in line])


t = {}
default_count = pattern_decoder(default_pattern)
for i, x in enumerate(default_pattern):
    k = translate(x, default_count)
    t[k] = i

print(sum([get_output(e.split('|')) for e in fopen(8).readlines()]))

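# A minimal check of the invariant the decoder above relies on (illustrative,
# not from the source): each digit's sorted tuple of per-segment occurrence
# counts across all ten patterns is unique, so it survives any wire permutation.
default_count = pattern_decoder(default_pattern)
signatures = [translate(x, default_count) for x in default_pattern]
assert len(set(signatures)) == 10  # all ten digits stay distinguishable
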
def shoot(xv, yv, f):
    xs, xe = f['x']
    ys, ye = f['y']
    cx, cy = 0, 0
    my = 0
    while cx < xe and cy not in range(ys, ye):
        cx += xv
        cy += yv
        if cy > my:
            my = cy
        yv -= 1
        if cx in range(xs, xe) and cy in range(ys, ye):
            return my
        if cx > xe:
            return None
    return 0


def get_max_y(f):
    my = 0
    for x in range(1, 100):
        for y in range(1, 100):
            high = shoot(x, y, f)
            if high and high > my:
                my = high
    return my


f = dict([parse(c) for c in fopen(17).readline()[13:].split(', ')])
print(get_max_y(f))

def plots(phedexInfo, dbsInfo, classadsInfo, iformat):
    "Generate popularity plots from phedex/dbs/classads dicts"
    # file to dump dataset by dataset tallies
    fDump = fopen(os.path.join(outputDir, 'dumpIt.txt.gz'), 'w')
    keyInfos = {}
    for key in cbs_keys:
        # the plot has two attributes, the time period and the data sample
        ageKey = findAgeKey(key)
        spKey = spKeys[key]
        dsKey = (spKey[0])
        keyInfos[key] = [ageKey, dsKey]
    # loop over datasets known to phedex
    for dataset in phedexInfo['All']:
        # get attributes from DBS - some datasets are missing - these
        # tend to be test datasets
        dbsDataset = dbsInfo.get(dataset, None)  # it is likely invalid data if it's not here
        # here dataset is like /GluGluToHToZZTo4L_M-125_13TeV-powheg-pythia6/Phys14DR-PU20bx25_tsg_PHYS14_25_V1-v1/GEN-SIM-RAW
        # and dbsDataset is {'nfiles': 3199.0, 'nevents': 205484.0, 'size': 309328826257.0}
        if dbsDataset is not None:
            if 'creation_date' not in dbsDataset:
                continue
            ageDataset = dbsDataset['creation_date'].date()
            if 'dataset_size' in dbsDataset:
                sizeDataset = float(dbsDataset['dataset_size'])
            elif 'size' in dbsDataset:
                sizeDataset = float(dbsDataset['size'])
        else:
            ageDataset = None
            sizeDataset = None
        spDataset = dataset.split('/')
        # caching variables to avoid expensive lookups
        cacheES = {}
        for cat in dataCategories:
            cacheES[cat] = phedexInfo[cat].get(dataset, None)
        # loop over the set of plots to make
        for key, valinfo in keyInfos.iteritems():
            # the plot has two attributes, the time period and the data sample
            ageKey = valinfo[0]  # findAgeKey(key)
            dsKey = valinfo[1]  # (spKey[0])
            # stop the loop if the dataset is not part of the plot
            if not interestingDataset(key, spDataset):
                continue
            # look up the average size of the dataset for this time period
            for cat in dataCategories:  # range(3)
                m = 0
                # if "All" in cacheES:
                #     if dsKey in cacheES["All"]:
                #         m = cacheES["All"][dsKey]  # right
                # guard against datasets unknown to this category (value is None)
                if cacheES[cat] and dsKey in cacheES[cat]:
                    m = cacheES[cat][dsKey]
                nCopies = 1.
                counter = 0.
                # optionally compute the number of copies (on average) of the
                # dataset on disk by comparing its average size to the size in dbs
                # again only works if dataset is known to dbs
                if divideByNCopies and (sizeDataset is not None):
                    nCopies = m / sizeDataset
                # compute the average number of times each file in the dataset is
                # accessed using popularity data and copies on disk
                # divide but protect against the case where the dataset is not on disk
                if nCopies > 0.:
                    counter = (classadsInfo[dsKey].get(dataset, 0)) / nCopies
                else:
                    counter = (classadsInfo[dsKey].get(dataset, 0))  # this ought to be 0
                if counter > 0 and counter < 1:
                    # any access to the dataset counts - so round up
                    counter = 1
                else:
                    # otherwise round to the nearest integer value
                    counter = round(counter)
                # distinguish the 0 bin between old and new datasets based on the
                # age of the dataset
                if counter == 0:
                    if (ageDataset is not None) and (ageDataset < ageKey):
                        counter = -1  # 0 old
                # cut off the plot at the desired value
                if counter > maxPop:
                    counter = maxPop
                # store the results
                countsBySizeDict[cat][key][counter] = \
                    countsBySizeDict[cat][key].get(counter, 0.) + m
                # test printouts
                if dataset == testDS:
                    print("Pop counter for %s %s is %s" % (key, cat, counter))
                if cat == "AllOps":
                    fDump.write('%5d %7.5f %15s %s \n' % (counter, m, key, dataset))
    fDump.close()
    # tally up all the information
    sumsDict = computeSums()
    # plot everything
    import plotter
    figNum = 0
    for cat in dataCategories:  # range(3)
        figNum = plotter.plotPopularity(sumsDict[cat], cbs_keys,
                                        popularitySource,
                                        cat + '_' + popularitySource,
                                        figNum, iformat)

def readSizes():
    # replica level information
    phedexInfo = {}
    # dataset level information
    phedexDatasetInfo = {}
    # site,dataset,rdate,gid,min_date,max_date,ave_size,max_size,days
    colsPhedex = {
        "site": -1,
        "dataset": -1,
        "rdate": -1,
        "min_date": -1,
        "max_date": -1,
        "ave_size": -1,
        "max_size": -1,
        "days": -1,
        "gid": -1
    }
    colPhedexNames = colsPhedex.keys()
    nCount = 0
    print("### use phedexDataFile %s" % phedexDataFile)
    print("### testDS %s" % testDS)
    istream = fopen(phedexDataFile)
    for l in istream:
        nCount = nCount + 1
        # optionally test things on a subset of data
        if isTest and nCount > 10000:
            print("Incomplete data as you are just testing")
            break
        sp = l.strip().split(',')
        # use the first row to understand the set of columns
        # stop if the data is not in the expected format
        if nCount == 1:
            for col in colPhedexNames:
                for i in range(0, len(sp)):
                    if col == sp[i]:
                        colsPhedex[col] = i
                if colsPhedex[col] == -1:
                    print("missing column %s" % col)
                    print("File: %s" % phedexDataFile)
                    sys.exit(1)
            # print("### colsPhedex", colsPhedex)
        else:
            # create the dictionaries from the phedex csvs
            dataset = sp[colsPhedex["dataset"]]
            site = sp[colsPhedex["site"]]
            rdate = sp[colsPhedex["rdate"]]
            gid = sp[colsPhedex["gid"]]
            # skip anything that is relval
            if 'RelVal' in dataset:
                continue
            key = (dataset, site, rdate, gid)
            # should become try: ... except: ...
            if dataset not in phedexDatasetInfo:
                phedexDatasetInfo[dataset] = []
            # this can then be used to look up detailed information in the phedexInfo dictionary
            phedexDatasetInfo[dataset].append((site, rdate, gid))
            datum = {}
            for col in colPhedexNames:
                if col == "site":
                    continue
                if col == "dataset":
                    continue
                datum[col] = sp[colsPhedex[col]]
            # catch errors - there should never be a repeated key
            if key in phedexInfo:
                print("Duplicated key")
                print(key)
                print(sp)
                print(phedexInfo[key])
                sys.exit(1)
            # done, just store everything
            phedexInfo[key] = datum
            if testDS in key:
                print("### testDS", key, datum)
    istream.close()
    replicas = phedexInfo.keys()
    nRep = len(replicas)
    # now make dataset level arrays that contain day-by-day size on T1/T2 disk;
    # do that for analysis ops, comp ops, and gid=-1 (which is a nonsense value)
    esDictKeys = ["All", "AnaOps", "AllOps", "MinusOne"]
    effectiveSizesDict = {}
    effectiveSizesFunc = {}
    for key in esDictKeys:
        effectiveSizesDict[key] = {}
        method = "is" + key
        effectiveSizesFunc[key] = globals()[method]
    print("phedexDatasetInfo", len(phedexDatasetInfo.keys()),
          "size", object_size(phedexDatasetInfo))
    # loop over datasets and replicas
    for dataset, keyInfos in phedexDatasetInfo.iteritems():
        # again, skip relvals here - even if there should be none
        if "/RelVal" in dataset:
            continue
        # create the arrays
        cacheES = {k: numpy.zeros(nDays) for k in esDictKeys}
        for key, val in cacheES.iteritems():
            effectiveSizesDict[key][dataset] = val
        # get the list of replicas for this dataset
        # keyInfos = phedexDatasetInfo[dataset]
        # loop over them
        for keyInfo in keyInfos:
            site = keyInfo[0]
            # skip things that are not T1 or T2
            if not site.startswith("T1") and not site.startswith("T2"):
                continue
            if not use_only_tier2 and not site.startswith("T2"):
                continue
            # get the detailed phedex information for this replica
            phKey = (dataset,) + keyInfo
            phDatum = phedexInfo[phKey]
            d1 = getDate(phDatum["min_date"])
            d2 = getDate(phDatum["max_date"])
            # compute the range of days that this replica was on disk
            indEnd = (d2 - dStartOldest).days if d2 < dEnd else nDays - 1
            if indEnd < 0:
                continue  # sample was gone before the period we are looking at
            indStart = (d1 - dStartOldest).days if d1 > dStartOldest else 0
            # just some printouts for debugging if you want them
            if dataset == testDS:
                print(site, phKey, phDatum)
                print(d1, d2)
                print("start and end", indStart, indEnd)
                print(float(phDatum['ave_size']))
            # set the daily size to the average seen in the phedex dumps
            for key, val in effectiveSizesFunc.iteritems():
                if val(keyInfo):
                    cacheES[key][indStart:indEnd + 1] += float(phDatum['ave_size'])
    return effectiveSizesDict

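# The globals()["is" + key] dispatch above expects predicates named isAll,
# isAnaOps, isAllOps, and isMinusOne, each taking a (site, rdate, gid) tuple.
# Hypothetical sketches only (the real gid conventions live elsewhere in the
# source and may differ):
def isAll(keyInfo):
    return True


def isMinusOne(keyInfo):
    return keyInfo[2] == "-1"  # gid == -1, the "nonsense value" noted above
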
def solve(n):
    img = Image(*fopen(20).read().split('\n\n'))
    img.tr(n)
    img.count()