def populateDB():
    """Fill the catalogue with synthetic ``mc12_8TeV`` datasets.

    For every entry of the hard-coded distribution table, 180 * 30 datasets
    are attempted.  The number of files and the per-file size of each dataset
    are drawn from a Gaussian centred on the entry's averages, with a sigma of
    one tenth of the mean.  Each dataset is registered, its files get one
    replica on each of up to ``nbreplicas`` distinct, randomly chosen RSEs,
    the files are attached to the dataset, and one replication rule per
    chosen RSE is added.

    A ``RucioException`` raised while creating a dataset is printed and that
    dataset is abandoned; an ``InvalidReplicationRule`` is printed and the
    remaining rules are still attempted.

    The syntax is valid on both Python 2 and Python 3.
    """
    listrses = list_rses({'T1': '1'})
    # Same text as the former ``print len(listrses), listrses`` statement.
    print('%s %s' % (len(listrses), listrses))
    account = 'root'
    project = 'mc12_8TeV'
    scope = project

    dictDistrib = [
        {'datatype': 'HITS', 'prodstep': 'merge', 'nbfiles': 302, 'totfilesize': 225394185112, 'nbreplicas': 1},
        {'datatype': 'HITS', 'prodstep': 'simul', 'nbfiles': 620, 'totfilesize': 97930909866, 'nbreplicas': 1},
        {'datatype': 'EVNT', 'prodstep': 'evgen', 'nbfiles': 324, 'totfilesize': 7809298802, 'nbreplicas': 3},
        {'datatype': 'AOD', 'prodstep': 'merge', 'nbfiles': 52, 'totfilesize': 106942334943, 'nbreplicas': 4},
        {'datatype': 'AOD', 'prodstep': 'recon', 'nbfiles': 858, 'totfilesize': 182186965627, 'nbreplicas': 1},
    ]

    for d in dictDistrib:
        # Per-entry values do not change across the day/index loops.
        prod_step = d['prodstep']
        datatype = d['datatype']
        mean_nbfiles = int(d['nbfiles'])
        mean_filesize = int(int(d['totfilesize']) / float(mean_nbfiles))
        nbreplicas = int(d['nbreplicas'])

        for day in range(180):
            for i in range(30):
                # ``//`` keeps the integer sigma the original Python 2
                # ``nbfiles/10`` produced, regardless of interpreter version.
                nbfiles = int(random.gauss(mean_nbfiles, mean_nbfiles // 10))
                filesize = int(random.gauss(mean_filesize, mean_filesize // 10))
                if not nbreplicas:
                    continue

                dataset_meta = {'project': project, 'stream_name': 'dummy', 'prod_step': prod_step, 'datatype': datatype}

                # Pick up to ``nbreplicas`` distinct RSEs; give up after 100
                # draws so a short RSE list cannot loop forever.
                source_rses = []
                for _ in range(100):
                    if len(source_rses) == nbreplicas:
                        break
                    rnd_site = random.choice(listrses)
                    if rnd_site not in source_rses:
                        source_rses.append(rnd_site)

                try:
                    dsn = '%s.%s.%s.%i.%i' % (project, prod_step, datatype, day, i)
                    print('%i Creating %s with %i files of size %i located at %i sites' % (i, dsn, nbfiles, filesize, len(source_rses)))
                    add_identifier(scope=scope, name=dsn, type='dataset', issuer=account, statuses={'monotonic': True}, meta=dataset_meta)
                    file_names = ['file_%s' % uuid() for _ in range(nbfiles)]
                    listfiles = []
                    for file_name in file_names:
                        listfiles.append({'scope': scope, 'name': file_name, 'size': filesize})
                        for source_rse in source_rses:
                            add_file_replica(source_rse, scope, file_name, filesize, issuer=account)
                    attach_identifier(scope, name=dsn, dids=listfiles, issuer=account)
                    for source_rse in source_rses:
                        try:
                            add_replication_rule(dids=[{'scope': scope, 'name': dsn}], account=account, copies=1, rse_expression=source_rse,
                                                 grouping='DATASET', weight=None, lifetime=None, locked=False, subscription_id=None, issuer='root')
                        except InvalidReplicationRule as e:
                            print(e)
                except RucioException as e:
                    print(e)
def populateDB():
    """Populate the catalogue with synthetic ``mc12_8TeV`` datasets.

    Walks a fixed table of (datatype, production step) profiles and, for each
    profile, attempts 180 * 30 datasets.  File count and file size are sampled
    from a Gaussian around the profile averages.  Every dataset is registered,
    its files are replicated on the randomly selected RSEs, attached to the
    dataset, and covered by one replication rule per selected RSE.

    ``RucioException`` and ``InvalidReplicationRule`` are printed, not raised.
    """
    t1_rses = list_rses({'T1': '1'})
    print(len(t1_rses), t1_rses)
    issuer_account = 'root'
    project_name = 'mc12_8TeV'

    profiles = [
        {'datatype': 'HITS', 'prodstep': 'merge', 'nbfiles': 302, 'totfilesize': 225394185112, 'nbreplicas': 1},
        {'datatype': 'HITS', 'prodstep': 'simul', 'nbfiles': 620, 'totfilesize': 97930909866, 'nbreplicas': 1},
        {'datatype': 'EVNT', 'prodstep': 'evgen', 'nbfiles': 324, 'totfilesize': 7809298802, 'nbreplicas': 3},
        {'datatype': 'AOD', 'prodstep': 'merge', 'nbfiles': 52, 'totfilesize': 106942334943, 'nbreplicas': 4},
        {'datatype': 'AOD', 'prodstep': 'recon', 'nbfiles': 858, 'totfilesize': 182186965627, 'nbreplicas': 1},
    ]

    for profile in profiles:
        # Values that depend only on the profile.
        step = profile['prodstep']
        dtype = profile['datatype']
        mean_file_count = int(profile['nbfiles'])
        mean_file_size = int(int(profile['totfilesize']) / float(mean_file_count))
        wanted_replicas = int(profile['nbreplicas'])

        for day in range(0, 180):
            for seq in range(0, 30):
                dataset_scope = project_name
                # Sample the dataset shape (count first, then size).
                file_count = int(random.gauss(mean_file_count, mean_file_count / 10))
                file_size = int(random.gauss(mean_file_size, mean_file_size / 10))
                metadata = {'project': project_name, 'stream_name': 'dummy', 'prod_step': step, 'datatype': dtype}

                if not wanted_replicas:
                    continue

                # Draw at most 100 times to collect the wanted number of
                # distinct sites.
                chosen_rses = []
                for _attempt in range(100):
                    if len(chosen_rses) == wanted_replicas:
                        break
                    candidate = random.choice(t1_rses)
                    if candidate not in chosen_rses:
                        chosen_rses.append(candidate)

                try:
                    dataset_name = '%s.%s.%s.%i.%i' % (project_name, step, dtype, day, seq)
                    print('%i Creating %s with %i files of size %i located at %i sites' % (seq, dataset_name, file_count, file_size, len(chosen_rses)))
                    add_identifier(scope=dataset_scope, name=dataset_name, type='dataset', issuer=issuer_account,
                                   statuses={'monotonic': True}, meta=metadata)

                    # All names are generated up front, then replicas are
                    # registered file by file.
                    file_names = ['file_%s' % uuid() for _ in range(file_count)]
                    attached = []
                    for file_name in file_names:
                        attached.append({'scope': dataset_scope, 'name': file_name, 'size': file_size})
                        for rse in chosen_rses:
                            add_file_replica(rse, dataset_scope, file_name, file_size, issuer=issuer_account)
                    attach_identifier(dataset_scope, name=dataset_name, dids=attached, issuer=issuer_account)

                    for rse in chosen_rses:
                        try:
                            add_replication_rule(dids=[{'scope': dataset_scope, 'name': dataset_name}],
                                                 account=issuer_account,
                                                 copies=1,
                                                 rse_expression=rse,
                                                 grouping='DATASET',
                                                 weight=None,
                                                 lifetime=None,
                                                 locked=False,
                                                 subscription_id=None,
                                                 issuer='root')
                        except InvalidReplicationRule as rule_error:
                            print(rule_error)
                except RucioException as rucio_error:
                    print(rucio_error)
# Example #3
0
def populateDB(filename=None):
    """Create 20000 synthetic user datasets, each on one random RSE.

    For every dataset a scope number is drawn with ``getRandomScope`` from the
    object returned by ``generatePDF``; scope and account are derived from
    that number.  The dataset gets 53 files of 78000000 bytes, one replica per
    file on a randomly chosen deterministic RSE, and one replication rule on
    that RSE.  Progress counters are sent through ``monitor.record``.

    An ``InvalidReplicationRule`` is printed and the loop continues.

    :param filename: Not used by this function.
    """
    # Build a real list: on Python 3 ``map`` returns an iterator, and
    # ``random.choice`` needs a sequence supporting len() and indexing.
    listrses = [rse['rse'] for rse in list_rses(filters={'deterministic': 1})]

    pdf = generatePDF()

    nbfiles = 53
    filesize = 78000000

    # Generate 20000 datasets according to the scope distribution
    for index in range(20000):
        scope_nb = getRandomScope(pdf)
        scope = 'user.user%i' % (scope_nb)
        account = 'user%i' % (scope_nb)
        print(scope)
        # The dataset name is prefixed with the scope string.
        dsn = '%s.%s' % (scope, uuid())
        rnd_site = random.choice(listrses)
        print('%i Creating %s with %i files of size %i located at %s' %
              (index, dsn, nbfiles, filesize, rnd_site))
        add_identifier(scope=scope,
                       name=dsn,
                       type='dataset',
                       issuer=account,
                       statuses={'monotonic': True})
        monitor.record(timeseries='dbfiller.addnewdataset', delta=1)

        file_names = ['file_%s' % uuid() for _ in range(nbfiles)]
        listfiles = []
        for file_name in file_names:
            listfiles.append({'scope': scope, 'name': file_name, 'size': filesize})
            add_file_replica(rnd_site, scope, file_name, filesize, issuer=account)
        monitor.record(timeseries='dbfiller.addreplicas', delta=nbfiles)

        attach_identifier(scope, name=dsn, dids=listfiles, issuer=account)
        monitor.record(timeseries='dbfiller.addnewfile', delta=nbfiles)

        try:
            add_replication_rule(dids=[{'scope': scope, 'name': dsn}],
                                 account=account,
                                 copies=1,
                                 rse_expression=rnd_site,
                                 grouping='DATASET',
                                 weight=None,
                                 lifetime=None,
                                 locked=False,
                                 subscription_id=None,
                                 issuer=account)
            monitor.record(timeseries='dbfiller.addreplicationrules', delta=1)
        except InvalidReplicationRule as e:
            print(e)
def populateDB(filename=None):
    """Create 20000 synthetic user datasets, each on one random RSE.

    For every dataset a scope number is drawn with ``getRandomScope`` from the
    object returned by ``generatePDF``; scope and account are derived from
    that number.  The dataset gets 53 files of 78000000 bytes, one replica per
    file on a randomly chosen deterministic RSE, and one replication rule on
    that RSE.  Progress counters are sent through ``monitor.record``.

    An ``InvalidReplicationRule`` is printed and the loop continues.

    The syntax is valid on both Python 2 and Python 3.

    :param filename: Not used by this function.
    """
    # A list comprehension instead of ``map``: on Python 3 ``map`` returns an
    # iterator, which ``random.choice`` cannot index.
    listrses = [rse['rse'] for rse in list_rses(filters={'deterministic': 1})]

    pdf = generatePDF()

    nbfiles = 53
    filesize = 78000000

    # Generate 20000 datasets according to the scope distribution
    for index in range(20000):
        scope_nb = getRandomScope(pdf)
        scope = 'user.user%i' % (scope_nb)
        account = 'user%i' % (scope_nb)
        print(scope)
        # The dataset name is prefixed with the scope string.
        dsn = '%s.%s' % (scope, uuid())
        rnd_site = random.choice(listrses)
        print('%i Creating %s with %i files of size %i located at %s' % (index, dsn, nbfiles, filesize, rnd_site))
        add_identifier(scope=scope, name=dsn, type='dataset', issuer=account, statuses={'monotonic': True})
        monitor.record(timeseries='dbfiller.addnewdataset', delta=1)

        file_names = ['file_%s' % uuid() for _ in range(nbfiles)]
        listfiles = []
        for file_name in file_names:
            listfiles.append({'scope': scope, 'name': file_name, 'size': filesize})
            add_file_replica(rnd_site, scope, file_name, filesize, issuer=account)
        monitor.record(timeseries='dbfiller.addreplicas', delta=nbfiles)

        attach_identifier(scope, name=dsn, dids=listfiles, issuer=account)
        monitor.record(timeseries='dbfiller.addnewfile', delta=nbfiles)

        try:
            add_replication_rule(dids=[{'scope': scope, 'name': dsn}], account=account, copies=1, rse_expression=rnd_site,
                                 grouping='DATASET', weight=None, lifetime=None, locked=False, subscription_id=None, issuer=account)
            monitor.record(timeseries='dbfiller.addreplicationrules', delta=1)
        except InvalidReplicationRule as e:
            print(e)
def populateDB(filename=None):
    """Create 200000 synthetic datasets following a distribution file.

    The distribution file is read line by line.  Lines starting with
    ``NBDATASETS`` and blank lines are skipped.  On every other line the
    first whitespace-separated column is an integer weight; the following
    columns are read, in order, as project, stream name, production step,
    datatype, provenance, group, number of files, total size and number of
    replicas.  Each generated dataset picks a line with probability
    proportional to its weight.

    For each dataset: a random run number and tag are drawn, the dataset is
    registered with its metadata, every file gets a replica on each of up to
    ``nbreplicas`` distinct random deterministic RSEs, the files are attached
    and one replication rule per RSE is added.  Timings are printed and
    counters are sent through ``monitor.record``.  Lines whose replica count
    is 0 produce nothing.

    The syntax is valid on both Python 2 and Python 3.

    :param filename: Path of the distribution file.  When falsy,
        ``$RUCIO_HOME/etc/data12_8TeV_distribution.txt`` is used if
        ``RUCIO_HOME`` is set, else
        ``/opt/rucio/etc/data12_8TeV_distribution.txt``.
    """
    listrses = list_rses(filters={'deterministic': 1})
    print(listrses)
    # A real list is needed: ``random.choice`` cannot index a ``map`` iterator.
    listrses = [rse['rse'] for rse in listrses]
    nbDatasets = 0
    dictDistrib = {}

    if not filename:
        rucio_home = os.getenv('RUCIO_HOME')
        if rucio_home:
            filename = rucio_home + '/etc/data12_8TeV_distribution.txt'
        else:
            filename = '/opt/rucio/etc/data12_8TeV_distribution.txt'

    # Get the dataset distribution: map each half-open interval
    # [cumulative, cumulative + weight) to the remaining columns of its line.
    with open(filename, 'r') as f:
        for line in f:
            if line.startswith('NBDATASETS'):
                continue
            strsplit = line.split()
            if not strsplit:
                # Blank line: nothing to record.
                continue
            weight = int(strsplit[0])
            dictDistrib[(nbDatasets, nbDatasets + weight)] = strsplit[1:]
            nbDatasets += weight

    # Generate 200000 datasets according to the dataset distribution
    for i in range(200000):
        rnd = random.random() * nbDatasets

        # Intervals are disjoint, so at most one of them contains ``rnd``.
        fields = None
        for (lower, upper), candidate in dictDistrib.items():
            if lower <= rnd < upper:
                fields = candidate
                break
        if fields is None:
            continue

        project, stream_name, prod_step, datatype, provenance, group = fields[:6]
        scope = project
        run_number = random.randint(0, 1000000)
        tag = random.randint(0, 10000)

        if group == '/atlas/role=production':
            if provenance == 'T0':
                group = account = 'tier0'
            else:
                group = account = 'panda'
        else:
            account = 'panda'
            group = dictGroups[group]
            scope = 'group.%s' % (group)

        nbfiles = int(fields[6])
        filesize = int(int(fields[7]) / float(nbfiles))
        nbreplicas = int(fields[8])

        dataset_meta = {
            'project': project,
            'run_number': run_number,
            'stream_name': stream_name,
            'prod_step': prod_step,
            'datatype': datatype,
            'provenance': provenance,
            'group': group,
        }
        if group not in ('panda', 'tier0'):
            # Only these datasets carry a campaign, derived from the tag.
            dataset_meta['campaign'] = '%s_repro_%i' % (group, tag // 1000)

        if not nbreplicas:
            continue

        # Pick up to ``nbreplicas`` distinct RSEs; give up after 100 draws.
        source_rses = []
        for _ in range(100):
            if len(source_rses) == nbreplicas:
                break
            rnd_site = random.choice(listrses)
            if rnd_site not in source_rses:
                source_rses.append(rnd_site)

        run_number_string = str(run_number).rjust(7, '0')
        dsn = '%s.%s.%s.%s.%s.%s' % (project, run_number_string,
                                     stream_name, prod_step, datatype, tag)
        print('%i Creating %s:%s with %i files of size %i located at %i sites' % (
            i, scope, dsn, nbfiles, filesize, len(source_rses)))

        stime1 = time.time()
        add_identifier(scope=scope,
                       name=dsn,
                       type='dataset',
                       issuer=account,
                       statuses={'monotonic': True},
                       meta=dataset_meta)
        stime2 = time.time()
        print('Time to generate a dataset : %s' % str(stime2 - stime1))
        monitor.record(timeseries='dbfiller.addnewdataset', delta=1)

        file_names = ['file_%s' % uuid() for _ in range(nbfiles)]
        listfiles = []
        for file_name in file_names:
            listfiles.append({
                'scope': scope,
                'name': file_name,
                'size': filesize
            })
            for source_rse in source_rses:
                add_file_replica(source_rse,
                                 scope,
                                 file_name,
                                 filesize,
                                 issuer=account)
        stime3 = time.time()
        print('Time to create replicas : %s' % str(stime3 - stime2))
        monitor.record(timeseries='dbfiller.addreplicas',
                       delta=nbfiles * len(source_rses))

        attach_identifier(scope,
                          name=dsn,
                          dids=listfiles,
                          issuer=account)
        stime4 = time.time()
        print('Time to attach files : %s' % str(stime4 - stime3))
        monitor.record(timeseries='dbfiller.addnewfile', delta=nbfiles)

        for source_rse in source_rses:
            try:
                add_replication_rule(dids=[{'scope': scope, 'name': dsn}],
                                     account=account,
                                     copies=1,
                                     rse_expression=source_rse,
                                     grouping='DATASET',
                                     weight=None,
                                     lifetime=None,
                                     locked=False,
                                     subscription_id=None,
                                     issuer='root')
                monitor.record(timeseries='dbfiller.addreplicationrules',
                               delta=1)
            except InvalidReplicationRule as e:
                print(e)
        stime5 = time.time()
        # This interval covers rule creation, not file attachment.
        print('Time to add replication rules : %s' % str(stime5 - stime4))
def populateDB(filename=None):
    """Create 200000 synthetic datasets following a distribution file.

    The distribution file is read line by line.  Lines starting with
    ``NBDATASETS`` and blank lines are skipped.  On every other line the
    first whitespace-separated column is an integer weight; the following
    columns are read, in order, as project, stream name, production step,
    datatype, provenance, group, number of files, total size and number of
    replicas.  Each generated dataset picks a line with probability
    proportional to its weight.

    For each dataset: a random run number and tag are drawn, the dataset is
    registered with its metadata, every file gets a replica on each of up to
    ``nbreplicas`` distinct random deterministic RSEs, the files are attached
    and one replication rule per RSE is added.  Timings are printed and
    counters are sent through ``monitor.record``.  Lines whose replica count
    is 0 produce nothing.

    The syntax is valid on both Python 2 and Python 3.

    :param filename: Path of the distribution file.  When falsy,
        ``$RUCIO_HOME/etc/data12_8TeV_distribution.txt`` is used if
        ``RUCIO_HOME`` is set, else
        ``/opt/rucio/etc/data12_8TeV_distribution.txt``.
    """
    listrses = list_rses(filters={'deterministic': 1})
    print(listrses)
    # A real list is needed: ``random.choice`` cannot index a ``map`` iterator.
    listrses = [rse['rse'] for rse in listrses]
    nbDatasets = 0
    dictDistrib = {}

    if not filename:
        rucio_home = os.getenv('RUCIO_HOME')
        if rucio_home:
            filename = rucio_home + '/etc/data12_8TeV_distribution.txt'
        else:
            filename = '/opt/rucio/etc/data12_8TeV_distribution.txt'

    # Get the dataset distribution: map each half-open interval
    # [cumulative, cumulative + weight) to the remaining columns of its line.
    with open(filename, 'r') as f:
        for line in f:
            if line.startswith('NBDATASETS'):
                continue
            strsplit = line.split()
            if not strsplit:
                # Blank line: nothing to record.
                continue
            weight = int(strsplit[0])
            dictDistrib[(nbDatasets, nbDatasets + weight)] = strsplit[1:]
            nbDatasets += weight

    # Generate 200000 datasets according to the dataset distribution
    for i in range(200000):
        rnd = random.random() * nbDatasets

        # Intervals are disjoint, so at most one of them contains ``rnd``.
        fields = None
        for (lower, upper), candidate in dictDistrib.items():
            if lower <= rnd < upper:
                fields = candidate
                break
        if fields is None:
            continue

        project, stream_name, prod_step, datatype, provenance, group = fields[:6]
        scope = project
        run_number = random.randint(0, 1000000)
        tag = random.randint(0, 10000)

        if group == '/atlas/role=production':
            if provenance == 'T0':
                group = account = 'tier0'
            else:
                group = account = 'panda'
        else:
            account = 'panda'
            group = dictGroups[group]
            scope = 'group.%s' % (group)

        nbfiles = int(fields[6])
        filesize = int(int(fields[7]) / float(nbfiles))
        nbreplicas = int(fields[8])

        dataset_meta = {'project': project, 'run_number': run_number, 'stream_name': stream_name, 'prod_step': prod_step,
                        'datatype': datatype, 'provenance': provenance, 'group': group}
        if group not in ('panda', 'tier0'):
            # Only these datasets carry a campaign, derived from the tag.
            dataset_meta['campaign'] = '%s_repro_%i' % (group, tag // 1000)

        if not nbreplicas:
            continue

        # Pick up to ``nbreplicas`` distinct RSEs; give up after 100 draws.
        source_rses = []
        for _ in range(100):
            if len(source_rses) == nbreplicas:
                break
            rnd_site = random.choice(listrses)
            if rnd_site not in source_rses:
                source_rses.append(rnd_site)

        run_number_string = str(run_number).rjust(7, '0')
        dsn = '%s.%s.%s.%s.%s.%s' % (project, run_number_string, stream_name, prod_step, datatype, tag)
        print('%i Creating %s:%s with %i files of size %i located at %i sites' % (i, scope, dsn, nbfiles, filesize, len(source_rses)))

        stime1 = time.time()
        add_identifier(scope=scope, name=dsn, type='dataset', issuer=account, statuses={'monotonic': True}, meta=dataset_meta)
        stime2 = time.time()
        print('Time to generate a dataset : %s' % str(stime2 - stime1))
        monitor.record(timeseries='dbfiller.addnewdataset', delta=1)

        file_names = ['file_%s' % uuid() for _ in range(nbfiles)]
        listfiles = []
        for file_name in file_names:
            listfiles.append({'scope': scope, 'name': file_name, 'size': filesize})
            for source_rse in source_rses:
                add_file_replica(source_rse, scope, file_name, filesize, issuer=account)
        stime3 = time.time()
        print('Time to create replicas : %s' % str(stime3 - stime2))
        monitor.record(timeseries='dbfiller.addreplicas', delta=nbfiles * len(source_rses))

        attach_identifier(scope, name=dsn, dids=listfiles, issuer=account)
        stime4 = time.time()
        print('Time to attach files : %s' % str(stime4 - stime3))
        monitor.record(timeseries='dbfiller.addnewfile', delta=nbfiles)

        for source_rse in source_rses:
            try:
                add_replication_rule(dids=[{'scope': scope, 'name': dsn}], account=account, copies=1, rse_expression=source_rse,
                                     grouping='DATASET', weight=None, lifetime=None, locked=False, subscription_id=None, issuer='root')
                monitor.record(timeseries='dbfiller.addreplicationrules', delta=1)
            except InvalidReplicationRule as e:
                print(e)
        stime5 = time.time()
        # This interval covers rule creation, not file attachment.
        print('Time to add replication rules : %s' % str(stime5 - stime4))