# NOTE: the import paths below are assumptions about the surrounding Rucio
# code base; adjust them to the local layout if they differ.
import os
import random
import time

from rucio.api.did import add_identifier, attach_identifier
from rucio.api.rse import add_file_replica, list_rses
from rucio.api.rule import add_replication_rule
from rucio.common.exception import InvalidReplicationRule, RucioException
from rucio.common.utils import generate_uuid as uuid
from rucio.core import monitor


# Variant 1: populate the DB with mc12_8TeV datasets spread over T1 sites,
# drawing file counts and sizes from a gaussian around the listed averages.
def populateDB():
    listrses = list_rses({'T1': '1'})
    print(len(listrses), listrses)
    # The later variants extract the RSE name from each entry; assuming
    # list_rses() returns dicts here as well.
    listrses = [rse['rse'] for rse in listrses]
    account = 'root'
    project = 'mc12_8TeV'
    dictDistrib = [{'datatype': 'HITS', 'prodstep': 'merge', 'nbfiles': 302, 'totfilesize': 225394185112, 'nbreplicas': 1},
                   {'datatype': 'HITS', 'prodstep': 'simul', 'nbfiles': 620, 'totfilesize': 97930909866, 'nbreplicas': 1},
                   {'datatype': 'EVNT', 'prodstep': 'evgen', 'nbfiles': 324, 'totfilesize': 7809298802, 'nbreplicas': 3},
                   {'datatype': 'AOD', 'prodstep': 'merge', 'nbfiles': 52, 'totfilesize': 106942334943, 'nbreplicas': 4},
                   {'datatype': 'AOD', 'prodstep': 'recon', 'nbfiles': 858, 'totfilesize': 182186965627, 'nbreplicas': 1}]
    for d in dictDistrib:
        for day in range(180):
            for i in range(30):
                scope = project
                prod_step = d['prodstep']
                datatype = d['datatype']
                nbfiles = int(d['nbfiles'])
                filesize = int(int(d['totfilesize']) / float(nbfiles))
                # Perturb the averages with a 10% gaussian spread.
                nbfiles = int(random.gauss(nbfiles, nbfiles / 10))
                filesize = int(random.gauss(filesize, filesize / 10))
                nbreplicas = int(d['nbreplicas'])
                dataset_meta = {'project': project, 'stream_name': 'dummy', 'prod_step': prod_step, 'datatype': datatype}
                # Pick nbreplicas distinct sites, giving up after 100 draws.
                source_rses = []
                if nbreplicas:
                    attempts = 0
                    while len(source_rses) != nbreplicas and attempts != 100:
                        rnd_site = random.choice(listrses)
                        attempts += 1
                        if rnd_site not in source_rses:
                            source_rses.append(rnd_site)
                try:
                    dsn = '%s.%s.%s.%i.%i' % (project, prod_step, datatype, day, i)
                    print('%i Creating %s with %i files of size %i located at %i sites' % (i, dsn, nbfiles, filesize, len(source_rses)))
                    add_identifier(scope=scope, name=dsn, type='dataset', issuer=account, statuses={'monotonic': True}, meta=dataset_meta)
                    files = ['file_%s' % uuid() for _ in range(nbfiles)]
                    listfiles = []
                    for file in files:
                        listfiles.append({'scope': scope, 'name': file, 'size': filesize})
                        for source_rse in source_rses:
                            add_file_replica(source_rse, scope, file, filesize, issuer=account)
                    attach_identifier(scope, name=dsn, dids=listfiles, issuer=account)
                    for source_rse in source_rses:
                        try:
                            add_replication_rule(dids=[{'scope': scope, 'name': dsn}], account=account, copies=1,
                                                 rse_expression=source_rse, grouping='DATASET', weight=None, lifetime=None,
                                                 locked=False, subscription_id=None, issuer='root')
                        except InvalidReplicationRule as e:
                            print(e)
                except RucioException as e:
                    print(e)
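
# Design note: the while-loop above draws sites until it has `nbreplicas`
# distinct ones, capped at 100 attempts. A simpler equivalent sketch,
# assuming len(listrses) >= nbreplicas, would be:
#
#     source_rses = random.sample(listrses, nbreplicas)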

# Variant 2: populate the DB with user datasets of fixed shape (53 files of
# 78 MB each), with the owning scope drawn from a probability distribution.
def populateDB(filename=None):
    listrses = list_rses(filters={'deterministic': 1})
    # A list comprehension instead of map(), so random.choice() below gets a
    # sequence rather than an iterator.
    listrses = [rse['rse'] for rse in listrses]
    account = 'root'
    pdf = generatePDF()
    # Generate 20000 datasets according to the dataset distribution
    for index in range(20000):
        scope_nb = getRandomScope(pdf)
        project = 'user.user%i' % scope_nb
        scope = 'user.user%i' % scope_nb
        account = 'user%i' % scope_nb
        print(scope)
        nbfiles = 53
        filesize = 78000000
        uid = uuid()
        dsn = '%s.%s' % (project, uid)
        rnd_site = random.choice(listrses)
        print('%i Creating %s with %i files of size %i located at %s' % (index, dsn, nbfiles, filesize, rnd_site))
        add_identifier(scope=scope, name=dsn, type='dataset', issuer=account, statuses={'monotonic': True})
        monitor.record(timeseries='dbfiller.addnewdataset', delta=1)
        files = ['file_%s' % uuid() for _ in range(nbfiles)]
        listfiles = []
        for file in files:
            listfiles.append({'scope': scope, 'name': file, 'size': filesize})
            add_file_replica(rnd_site, scope, file, filesize, issuer=account)
        monitor.record(timeseries='dbfiller.addreplicas', delta=nbfiles)
        attach_identifier(scope, name=dsn, dids=listfiles, issuer=account)
        monitor.record(timeseries='dbfiller.addnewfile', delta=nbfiles)
        try:
            add_replication_rule(dids=[{'scope': scope, 'name': dsn}], account=account, copies=1, rse_expression=rnd_site,
                                 grouping='DATASET', weight=None, lifetime=None, locked=False, subscription_id=None,
                                 issuer=account)
            monitor.record(timeseries='dbfiller.addreplicationrules', delta=1)
        except InvalidReplicationRule as e:
            print(e)
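
# `generatePDF` and `getRandomScope` are called above but not defined in this
# section. A minimal sketch of what they could look like, assuming the "PDF"
# is a cumulative distribution over user-scope numbers (the implementations,
# the uniform weights and the default of 100 scopes are assumptions, not the
# original helpers):

def generatePDF(nb_scopes=100):
    # Build a cumulative distribution over `nb_scopes` user scopes.
    weights = [1.0] * nb_scopes
    total = sum(weights)
    cdf, acc = [], 0.0
    for weight in weights:
        acc += weight / total
        cdf.append(acc)
    return cdf


def getRandomScope(pdf):
    # Invert the cumulative distribution with a uniform draw.
    rnd = random.random()
    for scope_nb, threshold in enumerate(pdf):
        if rnd <= threshold:
            return scope_nb
    return len(pdf) - 1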

# Variant 3: populate the DB with data12_8TeV-like datasets, driven by a
# distribution file mapping dataset counts to dataset attributes.
def populateDB(filename=None):
    listrses = list_rses(filters={'deterministic': 1})
    print(listrses)
    listrses = [rse['rse'] for rse in listrses]
    account = 'root'
    nbDatasets = 0
    dictDistrib = {}
    if not filename:
        if os.getenv('RUCIO_HOME'):
            filename = os.getenv('RUCIO_HOME') + '/etc/data12_8TeV_distribution.txt'
        else:
            filename = '/opt/rucio/etc/data12_8TeV_distribution.txt'
    # Get the dataset distribution: map cumulative count intervals to the
    # remaining fields of each row.
    with open(filename, 'r') as f:
        for line in f:
            if not line.startswith('NBDATASETS'):
                strsplit = line.rstrip('\n').split()
                dictDistrib[(nbDatasets, nbDatasets + int(strsplit[0]))] = strsplit[1:]
                nbDatasets += int(strsplit[0])
    # Generate 200000 datasets according to the dataset distribution
    for i in range(200000):
        rnd = random.random() * nbDatasets
        for lower, upper in dictDistrib:
            if lower < rnd < upper:
                project = dictDistrib[lower, upper][0]
                scope = project
                run_number = random.randint(0, 1000000)
                tag = random.randint(0, 10000)
                stream_name = dictDistrib[lower, upper][1]
                prod_step = dictDistrib[lower, upper][2]
                datatype = dictDistrib[lower, upper][3]
                provenance = dictDistrib[lower, upper][4]
                group = dictDistrib[lower, upper][5]
                if group == '/atlas/role=production':
                    # account = 'atlasprod'
                    account = 'panda'
                    if provenance == 'T0':
                        group = 'tier0'
                        account = 'tier0'
                    else:
                        group = 'panda'
                else:
                    # account = dictGroups[group]
                    account = 'panda'
                    scope = 'group.%s' % dictGroups[group]
                    group = dictGroups[group]
                nbfiles = int(dictDistrib[lower, upper][6])
                filesize = int(int(dictDistrib[lower, upper][7]) / float(nbfiles))
                nbreplicas = int(dictDistrib[lower, upper][8])
                if group == 'panda' or group == 'tier0':
                    dataset_meta = {'project': project, 'run_number': run_number, 'stream_name': stream_name,
                                    'prod_step': prod_step, 'datatype': datatype, 'provenance': provenance, 'group': group}
                else:
                    campaign = int(tag / 1000.)
                    dataset_meta = {'project': project, 'run_number': run_number, 'stream_name': stream_name,
                                    'prod_step': prod_step, 'datatype': datatype, 'provenance': provenance, 'group': group,
                                    'campaign': '%s_repro_%i' % (group, campaign)}
                # Pick nbreplicas distinct sites, giving up after 100 draws.
                source_rses = []
                if nbreplicas:
                    attempts = 0
                    while len(source_rses) != nbreplicas and attempts != 100:
                        rnd_site = random.choice(listrses)
                        attempts += 1
                        if rnd_site not in source_rses:
                            source_rses.append(rnd_site)
                run_number_string = str(run_number).rjust(7, '0')
                dsn = '%s.%s.%s.%s.%s.%s' % (project, run_number_string, stream_name, prod_step, datatype, tag)
                print('%i Creating %s:%s with %i files of size %i located at %i sites' % (i, scope, dsn, nbfiles, filesize, len(source_rses)))
                stime1 = time.time()
                add_identifier(scope=scope, name=dsn, type='dataset', issuer=account, statuses={'monotonic': True}, meta=dataset_meta)
                stime2 = time.time()
                print('Time to generate a dataset : %s' % str(stime2 - stime1))
                monitor.record(timeseries='dbfiller.addnewdataset', delta=1)
                files = ['file_%s' % uuid() for _ in range(nbfiles)]
                listfiles = []
                for file in files:
                    listfiles.append({'scope': scope, 'name': file, 'size': filesize})
                    for source_rse in source_rses:
                        add_file_replica(source_rse, scope, file, filesize, issuer=account)
                stime3 = time.time()
                print('Time to create replicas : %s' % str(stime3 - stime2))
                monitor.record(timeseries='dbfiller.addreplicas', delta=nbfiles * len(source_rses))
                attach_identifier(scope, name=dsn, dids=listfiles, issuer=account)
                stime4 = time.time()
                print('Time to attach files : %s' % str(stime4 - stime3))
                monitor.record(timeseries='dbfiller.addnewfile', delta=nbfiles)
                for source_rse in source_rses:
                    try:
                        add_replication_rule(dids=[{'scope': scope, 'name': dsn}], account=account, copies=1,
                                             rse_expression=source_rse, grouping='DATASET', weight=None, lifetime=None,
                                             locked=False, subscription_id=None, issuer='root')
                        monitor.record(timeseries='dbfiller.addreplicationrules', delta=1)
                    except InvalidReplicationRule as e:
                        print(e)
                stime5 = time.time()
                print('Time to add rules : %s' % str(stime5 - stime4))
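
# The parser above expects the distribution file to start with a header line
# beginning with 'NBDATASETS', followed by whitespace-separated rows of ten
# fields (this layout is inferred from the indexing above, not from a spec):
#
#   <nbdatasets> <project> <stream_name> <prod_step> <datatype> <provenance> <group> <nbfiles> <totfilesize> <nbreplicas>
#
# A purely illustrative row (the values are made up):
#
#   120 data12_8TeV physics_Egamma merge AOD T0 /atlas/role=production 500 1099511627776 2
#
# `dictGroups` is referenced above but not defined in this section; a minimal
# sketch, assuming it maps group names from the file to short account names
# (these keys and values are illustrative, not real mappings):
dictGroups = {'/atlas/phys-higgs/role=production': 'phys-higgs',
              '/atlas/phys-susy/role=production': 'phys-susy'}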