import os
import random
import time

# The Rucio helpers used below (list_rses, add_identifier, attach_identifier,
# add_file_replica, add_replication_rule, monitor, uuid, InvalidReplicationRule,
# dictGroups, generatePDF, getRandomScope) are assumed to be imported or
# defined elsewhere in this module.


def populateDB(filename=None):
    # Candidate sites: only the deterministic RSEs.
    listrses = [rse['rse'] for rse in list_rses(filters={'deterministic': 1})]
    account = 'root'
    pdf = generatePDF()
    # Generate 20000 user datasets according to the scope distribution.
    for index in range(20000):
        scope_nb = getRandomScope(pdf)
        project = 'user.user%i' % scope_nb
        scope = 'user.user%i' % scope_nb
        account = 'user%i' % scope_nb
        print(scope)
        nbfiles = 53
        filesize = 78000000
        uid = uuid()
        dsn = '%s.%s' % (project, uid)
        rnd_site = random.choice(listrses)
        print('%i Creating %s with %i files of size %i located at %s'
              % (index, dsn, nbfiles, filesize, rnd_site))
        # Register the dataset, then its files and their replicas at the site.
        add_identifier(scope=scope, name=dsn, type='dataset', issuer=account,
                       statuses={'monotonic': True})
        monitor.record(timeseries='dbfiller.addnewdataset', delta=1)
        files = ['file_%s' % uuid() for _ in range(nbfiles)]
        listfiles = []
        for fname in files:
            listfiles.append({'scope': scope, 'name': fname, 'size': filesize})
            add_file_replica(rnd_site, scope, fname, filesize, issuer=account)
        monitor.record(timeseries='dbfiller.addreplicas', delta=nbfiles)
        attach_identifier(scope, name=dsn, dids=listfiles, issuer=account)
        monitor.record(timeseries='dbfiller.addnewfile', delta=nbfiles)
        # Pin the dataset at the chosen site with a replication rule.
        try:
            add_replication_rule(dids=[{'scope': scope, 'name': dsn}],
                                 account=account, copies=1,
                                 rse_expression=rnd_site, grouping='DATASET',
                                 weight=None, lifetime=None, locked=False,
                                 subscription_id=None, issuer=account)
            monitor.record(timeseries='dbfiller.addreplicationrules', delta=1)
        except InvalidReplicationRule as e:
            print(e)
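# The variant above relies on two helpers that are not shown in this section:
# generatePDF(), which builds a probability distribution over user scopes, and
# getRandomScope(pdf), which draws a user number from it. The sketch below is
# an assumption about their shape, not the original implementation: it builds
# a cumulative distribution over hypothetical, uniform per-scope weights and
# samples it by bisection.
import bisect


def generatePDF(nbusers=100):
    # Hypothetical: assume every user scope is equally likely; a real
    # distribution would weight scopes by observed dataset counts.
    weights = [1.0] * nbusers
    total = float(sum(weights))
    cdf, acc = [], 0.0
    for weight in weights:
        acc += weight / total
        cdf.append(acc)
    return cdf


def getRandomScope(pdf):
    # Invert the cumulative distribution: a uniform draw in [0, 1) lands in
    # the slot of exactly one scope.
    return bisect.bisect_left(pdf, random.random())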
def populateDB(filename=None):
    listrses = list_rses(filters={'deterministic': 1})
    print(listrses)
    listrses = [rse['rse'] for rse in listrses]
    account = 'root'
    nbDatasets = 0
    distribution = []
    dictDistrib = {}
    if not filename:
        if os.getenv('RUCIO_HOME'):
            filename = os.getenv('RUCIO_HOME') + '/etc/data12_8TeV_distribution.txt'
        else:
            filename = '/opt/rucio/etc/data12_8TeV_distribution.txt'

    # Read the dataset distribution: each non-header line maps a cumulative
    # range of dataset counts (lower, upper) to that bin's parameters.
    with open(filename, 'r') as f:
        for line in f:
            if not line.startswith('NBDATASETS'):
                strsplit = line.rstrip('\n').split()
                dictDistrib[(nbDatasets, nbDatasets + int(strsplit[0]))] = strsplit[1:]
                nbDatasets += int(strsplit[0])
                distribution.append([nbDatasets, ] + strsplit[1:])

    # Generate 200000 datasets according to the dataset distribution.
    for i in range(200000):
        rnd = random.random() * nbDatasets
        for lower, upper in dictDistrib:
            if lower < rnd < upper:
                project = dictDistrib[lower, upper][0]
                scope = project
                run_number = random.randint(0, 1000000)
                tag = random.randint(0, 10000)
                stream_name = dictDistrib[lower, upper][1]
                prod_step = dictDistrib[lower, upper][2]
                datatype = dictDistrib[lower, upper][3]
                provenance = dictDistrib[lower, upper][4]
                group = dictDistrib[lower, upper][5]
                if group == '/atlas/role=production':
                    # account = 'atlasprod'
                    account = 'panda'
                    if provenance == 'T0':
                        group = 'tier0'
                        account = 'tier0'
                    else:
                        group = 'panda'
                else:
                    # account = dictGroups[group]
                    account = 'panda'
                    scope = 'group.%s' % dictGroups[group]
                    group = dictGroups[group]
                nbfiles = int(dictDistrib[lower, upper][6])
                filesize = int(int(dictDistrib[lower, upper][7]) / float(nbfiles))
                nbreplicas = int(dictDistrib[lower, upper][8])
                dataset_meta = {'project': project,
                                'run_number': run_number,
                                'stream_name': stream_name,
                                'prod_step': prod_step,
                                'datatype': datatype,
                                'provenance': provenance,
                                'group': group}
                if group not in ('panda', 'tier0'):
                    # Group datasets additionally carry a reprocessing campaign tag.
                    campaign = int(tag / 1000.)
                    dataset_meta['campaign'] = '%s_repro_%i' % (group, campaign)
                # Draw up to nbreplicas distinct sites, capped at 100 attempts.
                source_rses = []
                if nbreplicas:
                    attempts = 0
                    while len(source_rses) != nbreplicas and attempts != 100:
                        rnd_site = random.choice(listrses)
                        attempts += 1
                        if rnd_site not in source_rses:
                            source_rses.append(rnd_site)
                run_number_string = str(run_number).rjust(7, '0')
                dsn = '%s.%s.%s.%s.%s.%s' % (project, run_number_string,
                                             stream_name, prod_step, datatype, tag)
                print('%i Creating %s:%s with %i files of size %i located at %i sites'
                      % (i, scope, dsn, nbfiles, filesize, len(source_rses)))
                stime1 = time.time()
                add_identifier(scope=scope, name=dsn, type='dataset', issuer=account,
                               statuses={'monotonic': True}, meta=dataset_meta)
                stime2 = time.time()
                print('Time to generate a dataset : %s' % str(stime2 - stime1))
                monitor.record(timeseries='dbfiller.addnewdataset', delta=1)
                files = ['file_%s' % uuid() for _ in range(nbfiles)]
                listfiles = []
                for fname in files:
                    listfiles.append({'scope': scope, 'name': fname, 'size': filesize})
                    for source_rse in source_rses:
                        add_file_replica(source_rse, scope, fname, filesize, issuer=account)
                stime3 = time.time()
                print('Time to create replicas : %s' % str(stime3 - stime2))
                monitor.record(timeseries='dbfiller.addreplicas', delta=nbfiles * len(source_rses))
                attach_identifier(scope, name=dsn, dids=listfiles, issuer=account)
                stime4 = time.time()
                print('Time to attach files : %s' % str(stime4 - stime3))
                monitor.record(timeseries='dbfiller.addnewfile', delta=nbfiles)
                # One rule per site pins the dataset at every chosen location.
                for source_rse in source_rses:
                    try:
                        add_replication_rule(dids=[{'scope': scope, 'name': dsn}],
                                             account=account, copies=1,
                                             rse_expression=source_rse,
                                             grouping='DATASET', weight=None,
                                             lifetime=None, locked=False,
                                             subscription_id=None, issuer='root')
                        monitor.record(timeseries='dbfiller.addreplicationrules', delta=1)
                    except InvalidReplicationRule as e:
                        print(e)
                stime5 = time.time()
                print('Time to add replication rules : %s' % str(stime5 - stime4))
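# For reference, the code above expects each non-header line of
# data12_8TeV_distribution.txt to carry ten whitespace-separated columns:
# a dataset count for the bin, then project, stream_name, prod_step, datatype,
# provenance, group, nbfiles, total dataset size in bytes, and nbreplicas.
# The row below is a hypothetical illustration of that layout, not a line
# from the real file:
#
#   1250 data12_8TeV physics_Muons merge NTUP_SMWZ T0 /atlas/role=production 53 4134000000 2
#
# Sampling draws rnd uniformly in [0, nbDatasets) and selects the bin whose
# cumulative (lower, upper) range contains it, so bins holding more datasets
# are proportionally more likely to be picked; _sample_bin is a hypothetical
# standalone rendering of that lookup.
def _sample_bin(dictDistrib, nbDatasets):
    rnd = random.random() * nbDatasets
    for lower, upper in dictDistrib:
        if lower < rnd < upper:
            return dictDistrib[lower, upper]
    return None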
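# A minimal driver, assuming this script runs inside a configured Rucio
# environment where the helpers above are importable; populateDB() falls back
# to $RUCIO_HOME/etc/data12_8TeV_distribution.txt when no filename is given.
if __name__ == '__main__':
    populateDB()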