Example #1
def run(metadata,timestamp_right_boundary=None):
    """
    Returns:
        Number of enqueued items.
    """

    if metadata.count() < 1:
        return 0

    f=metadata.get_one_file()
    selection_filename=sdpostpipelineutils.get_attached_parameter__global([f],'selection_filename') # note that if no files are found at all for this selection (no matter the status), then the filename will be blank
    selection_file=sdpostpipelineutils.get_attached_parameter__global([f],'selection_file') # note that if no files are found at all for this selection (no matter the status), then 'selection_file' will be blank
    

    metadata=sdsimplefilter.run(metadata,'status',sdconst.TRANSFER_STATUS_NEW,'keep')

    count=metadata.count() # how many files to be inserted
    total_size=metadata.size

    if count>0:

        sdlog.info("SDENQUEU-102","Add insertion_group_id..")

        insertion_group_id=sdsqlutils.nextval('insertion_group_id','history') # unique identifier shared by all files inserted during this run
        po=sdpipelineprocessing.ProcessingObject(add_insertion_group_id,insertion_group_id)
        metadata=sdpipelineprocessing.run_pipeline(metadata,po)

        if sdconfig.progress:
            sdprogress.ProgressThread.start(sleep=0.1,running_message='',end_message='') # spinner start

        sdlog.info("SDENQUEU-103","Insert files and datasets..")

        po=sdpipelineprocessing.ProcessingObject(add_files)
        metadata=sdpipelineprocessing.run_pipeline(metadata,po)

        sdlog.info("SDENQUEU-104","Fill timestamp..")

        fix_timestamp()

        sddb.conn.commit() # final commit (we do all insertion/update in one transaction).

        if sdconfig.progress:
            sdprogress.ProgressThread.stop() # spinner stop

        histo_crea_date=sdtime.search_api_datetime_format_to_sqlite_datetime_format(timestamp_right_boundary) if timestamp_right_boundary is not None else None

        sdhistory.add_history_line(action=sdconst.ACTION_ADD,selection_file=selection_file,insertion_group_id=insertion_group_id,crea_date=histo_crea_date)

    sdlog.info("SDENQUEU-001","%i new file(s) added (total size=%i,selection=%s)"%(count,total_size,selection_filename))

    return count
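
The crea_date handling above converts a search-api timestamp to SQLite's datetime format via sdtime.search_api_datetime_format_to_sqlite_datetime_format. Below is a minimal stand-alone sketch of that kind of conversion, assuming the search API returns ISO 8601 timestamps ending in 'Z' and SQLite expects 'YYYY-MM-DD HH:MM:SS' (both formats are assumptions, not taken from this code):

from datetime import datetime

def search_api_to_sqlite_datetime(timestamp):
    # Assumed formats: ISO 8601 'Z' timestamp in, SQLite datetime string out.
    dt = datetime.strptime(timestamp, "%Y-%m-%dT%H:%M:%SZ")
    return dt.strftime("%Y-%m-%d %H:%M:%S")

print(search_api_to_sqlite_datetime("2017-01-01T12:30:00Z"))  # -> 2017-01-01 12:30:00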
Example #2
def uniq(metadata):

    if metadata.count() < 1:
        return metadata

    # retrieve global flag
    f=metadata.get_one_file()
    keep_replica=sdpostpipelineutils.get_attached_parameter__global([f],'keep_replica')
    functional_id_keyname=sdpostpipelineutils.get_functional_identifier_name(f)

    if keep_replica=='true':
        # Keep replica.
        # In this case, we remove type-A duplicates, but we keep type-B duplicates (i.e. replicas)

        # uniq key => id (i.e. including datanode)

        sdlog.info("SSHRINKU-001","Remove duplicate..")

        metadata=sdrmdup.run(metadata,functional_id_keyname)
    else:
        # Do not keep replica.
        # In this case, we remove type-A and type-B duplicates by randomly keeping one candidate

        # uniq key => instance_id (i.e. excluding datanode)

        sdlog.info("SSHRINKU-002","Remove duplicate and replicate..")

        metadata=sdrmduprep.run(metadata,functional_id_keyname)

    return metadata
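
To make the two branches concrete, here is a minimal, self-contained sketch (not the actual sdrmdup/sdrmduprep code) of deduplication by a chosen key: deduplicating on the full id (which includes the data node, so replicas survive) versus on instance_id (which excludes the data node, so replicas collapse to one candidate). The sample ids follow the common ESGF 'instance_id|data_node' convention, which is assumed here for illustration:

def dedup_by_key(files, key):
    # Keep the first file seen for each value of 'key' (illustrative only).
    seen = set()
    result = []
    for f in files:
        if f[key] not in seen:
            seen.add(f[key])
            result.append(f)
    return result

files = [
    {'id': 'ds1.file1.nc|node-a.example.org', 'instance_id': 'ds1.file1.nc'},
    {'id': 'ds1.file1.nc|node-b.example.org', 'instance_id': 'ds1.file1.nc'},  # replica on another data node
]

print(len(dedup_by_key(files, 'id')))           # 2: replicas kept (key includes the data node)
print(len(dedup_by_key(files, 'instance_id')))  # 1: replicas removed (key excludes the data node)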
Example #3
def run(metadata):
    """
    Set files' status to "delete"

    Returns:
        Number of deleted items.

    Note:
        - this function only changes the status (data and metadata will be removed later by the daemon)
    """

    if metadata.count() < 1:
        return 0

    f=metadata.get_one_file()
    selection_filename=sdpostpipelineutils.get_attached_parameter__global([f],'selection_filename') # note that if no files are found at all for this selection (no matter the status), then the filename will be blank

    # TODO: merge both to improve perf
    metadata=sdsimplefilter.run(metadata,'status',sdconst.TRANSFER_STATUS_NEW,'remove')
    metadata=sdsimplefilter.run(metadata,'status',sdconst.TRANSFER_STATUS_DELETE,'remove')

    count=metadata.count()

    if count>0:
        po=sdpipelineprocessing.ProcessingObject(delete)
        metadata=sdpipelineprocessing.run_pipeline(metadata,po)
        sddb.conn.commit() # final commit (we do all update in one transaction).

        sdhistorydao.add_history_line(sdconst.ACTION_DELETE,selection_filename)

        sdlog.info("SDDELETE-929","%i files marked for deletion (selection=%s)"%(count,selection_filename))

    return count
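
Several of these examples rely on sdsimplefilter.run to keep or remove files by status. A minimal sketch of such a keep/remove filter, written from scratch for illustration (the real sdsimplefilter operates on metadata objects rather than plain lists):

def simple_filter(files, key, value, mode):
    # Keep or drop files whose 'key' equals 'value' (illustrative only).
    if mode == 'keep':
        return [f for f in files if f.get(key) == value]
    if mode == 'remove':
        return [f for f in files if f.get(key) != value]
    raise ValueError("mode must be 'keep' or 'remove'")

files = [{'status': 'new'}, {'status': 'done'}, {'status': 'delete'}]
print(simple_filter(files, 'status', 'new', 'remove'))  # [{'status': 'done'}, {'status': 'delete'}]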
Example #4
def transform_url(files):

    url_replace = sdpostpipelineutils.get_attached_parameter__global(
        files, 'url_replace')

    if url_replace is not None:
        (from_string, to_string) = parse_rule('url_replace', url_replace)
        for f in files:
            f['url'] = f['url'].replace(from_string, to_string)

    return files
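
For context, a rough sketch of how such a url_replace rule could be parsed and applied. The 'from|to' rule syntax and the parse_rule signature are assumptions made for illustration, not taken from the source:

def parse_rule(name, value, sep='|'):
    # Hypothetical parser: split the rule value into (from_string, to_string).
    from_string, to_string = value.split(sep, 1)
    return from_string, to_string

files = [{'url': 'http://old-node.example.org/thredds/fileServer/data.nc'}]
from_string, to_string = parse_rule('url_replace', 'old-node.example.org|new-node.example.org')
for f in files:
    f['url'] = f['url'].replace(from_string, to_string)

print(files[0]['url'])  # http://new-node.example.org/thredds/fileServer/data.nc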
Example #5
def run(metadata):
    """
    Set files' status to "delete"

    Returns:
        Number of deleted items.

    Note:
        - this function only changes the status (data and metadata will be removed later by the daemon)
    """

    if metadata.count() < 1:
        return 0

    f = metadata.get_one_file()
    selection_filename = sdpostpipelineutils.get_attached_parameter__global(
        [f], 'selection_filename'
    )  # note that if no files are found at all for this selection (no matter the status), then the filename will be blank

    # TODO: merge both to improve perf
    metadata = sdsimplefilter.run(metadata, 'status',
                                  sdconst.TRANSFER_STATUS_NEW, 'remove')
    metadata = sdsimplefilter.run(metadata, 'status',
                                  sdconst.TRANSFER_STATUS_DELETE, 'remove')

    count = metadata.count()

    if count > 0:
        po = sdpipelineprocessing.ProcessingObject(delete)
        metadata = sdpipelineprocessing.run_pipeline(metadata, po)
        sddb.conn.commit(
        )  # final commit (we do all update in one transaction).

        sdhistorydao.add_history_line(sdconst.ACTION_DELETE,
                                      selection_filename)

        sdlog.info(
            "SDDELETE-929", "%i files marked for deletion (selection=%s)" %
            (count, selection_filename))

    return count
Example #6
def run(files):
    """
    Set files' status to "delete"

    Note:
        - this function only changes the status (data and metadata will be removed later by the daemon)
    """
    selection_filename=sdpostpipelineutils.get_attached_parameter__global(files,'selection_filename')

    files=sdsimplefilter.run(files,'status',sdconst.TRANSFER_STATUS_NEW,'remove')
    files=sdsimplefilter.run(files,'status',sdconst.TRANSFER_STATUS_DELETE,'remove')

    count=len(files)

    if count>0:
        for file in files:
            sddeletefile.deferred_delete(file['file_functional_id'])

        sddao.add_history_line(sdconst.ACTION_DELETE,selection_filename)

        sdlog.info("SDDELETE-929","%i files marked for deletion (selection=%s)"%(count,selection_filename))

    return count
Example #7
def run(files):
    if is_nearestpost_enabled(files):
        # In this case, we remove duplicates by keeping the nearest

        files=sdnearestpost.run(files)
    else:
        keep_replica=sdpostpipelineutils.get_attached_parameter__global(files,'keep_replica')
        if keep_replica=='true':
            # Keep replica.
            # In this case, we remove type-A duplicates, but we keep type-B duplicates (i.e. replicas)

            # uniq key => id (i.e. including datanode)

            files=sduniq.run(files,keep_replica=True)
        else:
            # Do not keep replica.
            # In this case, we remove type-A and type-B duplicates by randomly keeping one candidate
    
            # uniq key => instance_id (i.e. excluding datanode)

            files=sduniq.run(files)

    return files
Example #8
def run(files):
    selection_filename=sdpostpipelineutils.get_attached_parameter__global(files,'selection_filename') # note that if no files are found at all for this selection (no matter the status), then the filename will be blank
    files=sdsimplefilter.run(files,'status',sdconst.TRANSFER_STATUS_NEW,'keep')

    count=len(files) # how many files to be inserted
    total_size=sum(int(f['size']) for f in files)

    if count>0:
        insertion_group_id=sdsqlutils.nextval('insertion_group_id','history') # unique identifier shared by all files inserted during this run
        files=add_insertion_group_id(files,insertion_group_id)

        # TODO: maybe add a way to disable the progress spinner (may be useful when using the 'upgrade' action)
        ProgressThread.start(sleep=0.1,running_message='',end_message='') # spinner start

        add_files(files)

        ProgressThread.stop() # spinner stop

        sddao.add_history_line(sdconst.ACTION_ADD,selection_filename,insertion_group_id)

    sdlog.info("SDENQUEU-001","%i new files added (total size=%i,selection=%s)"%(count,total_size,selection_filename))

    return count
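
sdsqlutils.nextval is used above to obtain one insertion_group_id shared by every file inserted during the run. A minimal sketch of such a helper over SQLite, assuming it simply returns the column's current maximum plus one (the real implementation may differ):

import sqlite3

def nextval(column, table, conn):
    # Illustrative sketch: max(column)+1, or 1 for an empty table.
    (current,) = conn.execute("SELECT MAX(%s) FROM %s" % (column, table)).fetchone()
    return 1 if current is None else current + 1

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE history (insertion_group_id INTEGER, action TEXT)")
conn.execute("INSERT INTO history VALUES (7, 'add')")
print(nextval('insertion_group_id', 'history', conn))  # 8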
Example #9
def pexec(args):
    import sdsearch, sdpporder, sddb, syndautils, sdconst, sdpostpipelineutils, sdhistorydao, sddeferredbefore, sddomainutils

    if args.order_name=='cdf':
        selection_filename=None

        # use search-api operator to build datasets list
        stream=syndautils.get_stream(subcommand=args.subcommand,selection_file=args.selection_file,no_default=args.no_default)
        sddeferredbefore.add_forced_parameter(stream,'type','Dataset')

        dataset_found_count=0
        order_variable_count=0
        order_dataset_count=0
        for facets_group in stream: # we need to process each facets_group one by one because of TAG45345JK3J53K
            
            metadata=sdsearch.run(stream=[facets_group],post_pipeline_mode='dataset') # TAGJ43KJ234JK

            dataset_found_count+=metadata.count()

            if metadata.count() > 0:

                # WART
                # (gets overwritten at each iteration, but not a big deal as always the same value)
                if selection_filename is None: # keep the first value found (i.e. if the last facets_group is empty but earlier ones are not, do not overwrite the value with None)

                    dataset=metadata.get_one_file()
                    selection_filename=sdpostpipelineutils.get_attached_parameter__global([dataset],'selection_filename') # note that if no files are found at all for this selection (no matter the status), then the filename will be blank

                for d in metadata.get_files(): # warning: load list in memory
                    if d['status']==sdconst.DATASET_STATUS_COMPLETE:

                        # TAG45J4K45JK

                        # first, send cdf variable order
                        # (note: total number of variable event is given by: "total+=#variable for each ds")
                        for v in d['variable']:
                            if v in facets_group['variable']: # TAG45345JK3J53K (we check here that the variable has been asked for in the first place)
                                order_variable_count+=1

                                # hack
                                if sddomainutils.is_one_var_per_ds(d['project']): # maybe move this test at TAG45J4K45JK line, and replace 'EVENT_CDF_VARIABLE_O' by a dataset level event (note however that the choice about passing 'EVENT_CDF_VARIABLE_O' event as variable or dataset is arbitrary, both work. But passing as variable is a bit strange as variable appears in both dataset_pattern and variable columns)
                                    e_names=[sdconst.EVENT_CDF_INT_VARIABLE_O, sdconst.EVENT_CDF_COR_VARIABLE_O]

                                    # this case is a bit awkward as we have 'variable' in both dataset_pattern and variable columns..

                                else:
                                    e_names=[sdconst.EVENT_CDF_INT_VARIABLE_N, sdconst.EVENT_CDF_COR_VARIABLE_N]

                                for e_name in e_names:
                                    sdpporder.submit(e_name,d['project'],d['model'],d['local_path'],variable=v,commit=False)

                        # second, send cdf dataset order
                        if d['project'] in sdconst.PROJECT_WITH_ONE_VARIABLE_PER_DATASET:

                            # we do not trigger 'dataset' level event in this case
                            pass
                        else:                        

                            order_dataset_count+=1

                            e_names=[sdconst.EVENT_CDF_INT_DATASET, sdconst.EVENT_CDF_COR_DATASET]
                            for e_name in e_names:
                                sdpporder.submit(e_name,d['project'],d['model'],d['local_path'],commit=False)

        sddb.conn.commit()

        if dataset_found_count>0:
            if order_dataset_count==0 and order_variable_count==0:
                print_stderr("Data not ready (data must be already downloaded before performing pexec task): operation cancelled")   
            else:
                sdhistorydao.add_history_line(sdconst.ACTION_PEXEC,selection_filename)

                print_stderr("Post-processing task successfully submitted (order_dataset_count=%d,order_variable_count=%d)"%(order_dataset_count,order_variable_count))
        else:
            print_stderr('Data not found')

    elif args.order_name=='cds':
        selection_filename = None

        # use search-api operator to build datasets list
        stream = syndautils.get_stream(subcommand=args.subcommand, selection_file=args.selection_file, no_default=args.no_default)
        sddeferredbefore.add_forced_parameter(stream, 'type', 'Dataset')

        dataset_found_count = 0
        order_variable_count = 0
        for facets_group in stream:  # we need to process each facets_group one by one because of TAG45345JK3J53K

            metadata = sdsearch.run(stream=[facets_group], post_pipeline_mode='dataset')  # TAGJ43KJ234JK

            dataset_found_count += metadata.count()

            if metadata.count() > 0:

                # WART
                # (gets overwritten at each iteration, but not a big deal as always the same value)
                if selection_filename is None:  # keep the first value found (i.e. if the last facets_group is empty but earlier ones are not, do not overwrite the value with None)

                    dataset = metadata.get_one_file()
                    selection_filename = sdpostpipelineutils.get_attached_parameter__global([dataset], 'selection_filename')  # note that if no files are found at all for this selection (no matter the status), then the filename will be blank

                for d in metadata.get_files():  # warning: load list in memory
                    if d['status'] == sdconst.DATASET_STATUS_COMPLETE:

                        # TAG45J4K45JK

                        # send cds variable order
                        # (note: total number of variable event is given by: "total+=#variable for each ds")
                        for v in d['variable']:
                            if v in facets_group['variable']:  # TAG45345JK3J53K (we check here that the variable has been asked for in the first place)
                                order_variable_count += 1
                                sdpporder.submit(sdconst.EVENT_CDS_VARIABLE, d['project'], d['model'], d['local_path'], variable=v, commit=False)

        sddb.conn.commit()

        if dataset_found_count > 0:
            if order_variable_count == 0:
                print_stderr("Data not ready (data must be already downloaded before performing pexec task): operation cancelled")
            else:
                sdhistorydao.add_history_line(sdconst.ACTION_PEXEC, selection_filename)

                print_stderr(
                    "Post-processing task successfully submitted (order_variable_count=%d)" % (order_variable_count))
        else:
            print_stderr('Data not found')

    else:
        print_stderr("Invalid order name ('%s')"%args.order_name)
        return 1

    return 0
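
The pexec examples submit many orders with commit=False and issue a single sddb.conn.commit() at the end, so the whole batch lands in one transaction. A minimal sketch of that pattern using the standard sqlite3 module; the table, columns and event names below are made up for illustration:

import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE ppq (event TEXT, project TEXT, variable TEXT)")

def submit(event, project, variable, commit=False):
    # Queue one post-processing order; commit only when asked (illustrative sketch).
    conn.execute("INSERT INTO ppq VALUES (?, ?, ?)", (event, project, variable))
    if commit:
        conn.commit()

for v in ('tas', 'pr', 'psl'):
    submit('variable_order', 'CMIP5', v)  # no per-order commit

conn.commit()  # one commit for the whole batch, as pexec() does with sddb.conn.commit()
print(conn.execute("SELECT COUNT(*) FROM ppq").fetchone()[0])  # 3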
Example #10
def run(metadata, timestamp_right_boundary=None):
    """
    Returns:
        Number of enqueued items.
    """

    if metadata.count() < 1:
        return 0

    f = metadata.get_one_file()
    selection_filename = sdpostpipelineutils.get_attached_parameter__global(
        [f], 'selection_filename'
    )  # note that if no files are found at all for this selection (no matter the status), then the filename will be blank
    selection_file = sdpostpipelineutils.get_attached_parameter__global(
        [f], 'selection_file'
    )  # note that if no files are found at all for this selection (no matter the status), then 'selection_file' will be blank

    metadata = sdsimplefilter.run(metadata, 'status',
                                  sdconst.TRANSFER_STATUS_NEW, 'keep')

    count = metadata.count()  # how many files to be inserted
    total_size = metadata.size

    if count > 0:

        sdlog.info("SDENQUEU-102", "Add insertion_group_id..")

        insertion_group_id = sdsqlutils.nextval(
            'insertion_group_id', 'history'
        )  # unique identifier shared by all files inserted during this run
        po = sdpipelineprocessing.ProcessingObject(add_insertion_group_id,
                                                   insertion_group_id)
        metadata = sdpipelineprocessing.run_pipeline(metadata, po)

        if sdconfig.progress:
            sdprogress.ProgressThread.start(sleep=0.1,
                                            running_message='',
                                            end_message='')  # spinner start

        sdlog.info("SDENQUEU-103", "Insert files and datasets..")

        po = sdpipelineprocessing.ProcessingObject(add_files)
        metadata = sdpipelineprocessing.run_pipeline(metadata, po)

        sdlog.info("SDENQUEU-104", "Fill timestamp..")

        fix_timestamp()

        sddb.conn.commit(
        )  # final commit (we do all insertion/update in one transaction).

        if sdconfig.progress:
            sdprogress.ProgressThread.stop()  # spinner stop

        histo_crea_date = sdtime.search_api_datetime_format_to_sqlite_datetime_format(
            timestamp_right_boundary
        ) if timestamp_right_boundary is not None else None

        sdhistory.add_history_line(action=sdconst.ACTION_ADD,
                                   selection_file=selection_file,
                                   insertion_group_id=insertion_group_id,
                                   crea_date=histo_crea_date)

    sdlog.info(
        "SDENQUEU-001", "%i new file(s) added (total size=%i,selection=%s)" %
        (count, total_size, selection_filename))

    return count
Example #11
def pexec(args):
    import sdsearch, sdpporder, sddb, syndautils, sdconst, sdpostpipelineutils, sdhistorydao, sddeferredbefore, sddomainutils

    if args.order_name == 'cdf':
        selection_filename = None

        # use search-api operator to build datasets list
        stream = syndautils.get_stream(subcommand=args.subcommand,
                                       selection_file=args.selection_file,
                                       no_default=args.no_default)
        sddeferredbefore.add_forced_parameter(stream, 'type', 'Dataset')

        dataset_found_count = 0
        order_variable_count = 0
        order_dataset_count = 0
        for facets_group in stream:  # we need to process each facets_group one by one because of TAG45345JK3J53K

            metadata = sdsearch.run(
                stream=[facets_group],
                post_pipeline_mode='dataset')  # TAGJ43KJ234JK

            dataset_found_count += metadata.count()

            if metadata.count() > 0:

                # WART
                # (gets overwritten at each iteration, but not a big deal as always the same value)
                if selection_filename is None:  # keep the first value found (i.e. if the last facets_group is empty but earlier ones are not, do not overwrite the value with None)

                    dataset = metadata.get_one_file()
                    selection_filename = sdpostpipelineutils.get_attached_parameter__global(
                        [dataset], 'selection_filename'
                    )  # note that if no files are found at all for this selection (no matter the status), then the filename will be blank

                for d in metadata.get_files():  # warning: load list in memory
                    if d['status'] == sdconst.DATASET_STATUS_COMPLETE:

                        # TAG45J4K45JK

                        # first, send cdf variable order
                        # (note: total number of variable event is given by: "total+=#variable for each ds")
                        for v in d['variable']:
                            if v in facets_group[
                                    'variable']:  # TAG45345JK3J53K (we check here that the variable has been asked for in the first place)
                                order_variable_count += 1

                                # hack
                                if sddomainutils.is_one_var_per_ds(
                                        d['project']
                                ):  # maybe move this test at TAG45J4K45JK line, and replace 'EVENT_CDF_VARIABLE_O' by a dataset level event (note however that the choice about passing 'EVENT_CDF_VARIABLE_O' event as variable or dataset is arbitrary, both work. But passing as variable is a bit strange as variable appears in both dataset_pattern and variable columns)
                                    e_names = [
                                        sdconst.EVENT_CDF_INT_VARIABLE_O,
                                        sdconst.EVENT_CDF_COR_VARIABLE_O
                                    ]

                                    # this case is a bit awkward as we have 'variable' in both dataset_pattern and variable columns..

                                else:
                                    e_names = [
                                        sdconst.EVENT_CDF_INT_VARIABLE_N,
                                        sdconst.EVENT_CDF_COR_VARIABLE_N
                                    ]

                                for e_name in e_names:
                                    sdpporder.submit(e_name,
                                                     d['project'],
                                                     d['model'],
                                                     d['local_path'],
                                                     variable=v,
                                                     commit=False)

                        # second, send cdf dataset order
                        if d['project'] in sdconst.PROJECT_WITH_ONE_VARIABLE_PER_DATASET:

                            # we do not trigger 'dataset' level event in this case
                            pass
                        else:

                            order_dataset_count += 1

                            e_names = [
                                sdconst.EVENT_CDF_INT_DATASET,
                                sdconst.EVENT_CDF_COR_DATASET
                            ]
                            for e_name in e_names:
                                sdpporder.submit(e_name,
                                                 d['project'],
                                                 d['model'],
                                                 d['local_path'],
                                                 commit=False)

        sddb.conn.commit()

        if dataset_found_count > 0:
            if order_dataset_count == 0 and order_variable_count == 0:
                print_stderr(
                    "Data not ready (data must be already downloaded before performing pexec task): operation cancelled"
                )
            else:
                sdhistorydao.add_history_line(sdconst.ACTION_PEXEC,
                                              selection_filename)

                print_stderr(
                    "Post-processing task successfully submitted (order_dataset_count=%d,order_variable_count=%d)"
                    % (order_dataset_count, order_variable_count))
        else:
            print_stderr('Data not found')

    elif args.order_name == 'cds':
        selection_filename = None

        # use search-api operator to build datasets list
        stream = syndautils.get_stream(subcommand=args.subcommand,
                                       selection_file=args.selection_file,
                                       no_default=args.no_default)
        sddeferredbefore.add_forced_parameter(stream, 'type', 'Dataset')

        dataset_found_count = 0
        order_variable_count = 0
        for facets_group in stream:  # we need to process each facets_group one by one because of TAG45345JK3J53K

            metadata = sdsearch.run(
                stream=[facets_group],
                post_pipeline_mode='dataset')  # TAGJ43KJ234JK

            dataset_found_count += metadata.count()

            if metadata.count() > 0:

                # WART
                # (gets overwritten at each iteration, but not a big deal as always the same value)
                if selection_filename is None:  # keep the first value found (i.e. if the last facets_group is empty but earlier ones are not, do not overwrite the value with None)

                    dataset = metadata.get_one_file()
                    selection_filename = sdpostpipelineutils.get_attached_parameter__global(
                        [dataset], 'selection_filename'
                    )  # note that if no files are found at all for this selection (no matter the status), then the filename will be blank

                for d in metadata.get_files():  # warning: load list in memory
                    if d['status'] == sdconst.DATASET_STATUS_COMPLETE:

                        # TAG45J4K45JK

                        # send cds variable order
                        # (note: total number of variable event is given by: "total+=#variable for each ds")
                        for v in d['variable']:
                            if v in facets_group[
                                    'variable']:  # TAG45345JK3J53K (we check here that the variable has been asked for in the first place)
                                order_variable_count += 1
                                sdpporder.submit(sdconst.EVENT_CDS_VARIABLE,
                                                 d['project'],
                                                 d['model'],
                                                 d['local_path'],
                                                 variable=v,
                                                 commit=False)

        sddb.conn.commit()

        if dataset_found_count > 0:
            if order_variable_count == 0:
                print_stderr(
                    "Data not ready (data must be already downloaded before performing pexec task): operation cancelled"
                )
            else:
                sdhistorydao.add_history_line(sdconst.ACTION_PEXEC,
                                              selection_filename)

                print_stderr(
                    "Post-processing task successfully submitted (order_variable_count=%d)"
                    % (order_variable_count))
        else:
            print_stderr('Data not found')

    else:
        print_stderr("Invalid order name ('%s')" % args.order_name)
        return 1

    return 0