Example #1
def naive_test():
    '''run naive integration test'''
    to_remove = [
        'tuner_search_space.json', 'tuner_result.txt', 'assessor_result.txt'
    ]
    to_remove = list(map(lambda file: 'naive_test/' + file, to_remove))
    remove_files(to_remove)

    proc = subprocess.run(
        ['nnictl', 'create', '--config', 'naive_test/local.yml'])
    assert proc.returncode == 0, '`nnictl create` failed with code %d' % proc.returncode

    print('Spawning trials...')

    nnimanager_log_path = get_nni_log_path(EXPERIMENT_URL)
    current_trial = 0

    for _ in range(120):
        time.sleep(1)

        tuner_status = read_last_line('naive_test/tuner_result.txt')
        assessor_status = read_last_line('naive_test/assessor_result.txt')
        experiment_status = is_experiment_done(nnimanager_log_path)

        assert tuner_status != 'ERROR', 'Tuner exited with error'
        assert assessor_status != 'ERROR', 'Assessor exited with error'

        if experiment_status:
            break

        if tuner_status is not None:
            for line in open('naive_test/tuner_result.txt'):
                if line.strip() == 'ERROR':
                    break
                trial = int(line.split(' ')[0])
                if trial > current_trial:
                    current_trial = trial
                    print('Trial #%d done' % trial)

    assert experiment_status, 'Failed to finish in 2 min'

    ss1 = json.load(open('naive_test/search_space.json'))
    ss2 = json.load(open('naive_test/tuner_search_space.json'))
    assert ss1 == ss2, 'Tuner got wrong search space'

    tuner_result = set(open('naive_test/tuner_result.txt'))
    expected = set(open('naive_test/expected_tuner_result.txt'))
    # Trials may complete before NNI gets assessor's result,
    # so it is possible to have more final results than expected
    assert tuner_result.issuperset(expected), 'Bad tuner result'

    assessor_result = set(open('naive_test/assessor_result.txt'))
    expected = set(open('naive_test/expected_assessor_result.txt'))
    assert assessor_result == expected, 'Bad assessor result'

    subprocess.run(['nnictl', 'stop'])
    snooze()
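
A note on this example: the remove_files helper it relies on is not shown, but the test expects it to tolerate files that do not exist yet (the result files only appear once a run has produced them). A minimal sketch under that assumption:

import os

def remove_files(paths):
    # Hypothetical helper: delete each listed file, skipping ones that are absent.
    for path in paths:
        if os.path.isfile(path):
            os.remove(path)
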
Example #2
def remove_rename(args):
    print('Crop start')
    start = time.time()  # record the start time
    # save_path check
    utils.check_directory(args.save_path)

    # read files & sorting
    annotation_files = os.listdir(args.annotation_path)
    images_files = os.listdir(args.image_path)
    annotation_files_sort = sorted(annotation_files)
    images_files_sort = sorted(images_files)
    assert len(annotation_files_sort) == len(images_files_sort), \
        'File counts do not match anno : {0}, images : {1}'.format(
            len(annotation_files), len(images_files))

    orgin_ano = []
    orgin_img = []
    re_ano = []
    re_img = []
    for i in annotation_files_sort:
        annotation_file = utils.tag_remove_parser(i)
        orgin_ano.append(i)
        re_ano.append(annotation_file)

    for i in images_files_sort:
        images_file = utils.tag_remove_parser(i)
        orgin_img.append(i)
        re_img.append(images_file)

    orgin_ano = np.array(orgin_ano)
    orgin_img = np.array(orgin_img)
    re_ano = np.array(re_ano)
    re_img = np.array(re_img)

    # Remove file
    utils.remove_files(re_ano, re_img, orgin_img, args.image_path)
    utils.remove_files(re_img, re_ano, orgin_ano, args.annotation_path)

    # Modify file name
    for i in annotation_files_sort:
        annotation_file = utils.tag_remove_parser(i)
        os.rename(os.path.join(args.annotation_path, i),
                  os.path.join(args.annotation_path, annotation_file + '.xml'))

    for i in images_files_sort:
        images_file = utils.tag_remove_parser(i)
        os.rename(os.path.join(args.image_path, i),
                  os.path.join(args.image_path, images_file + '.jpg'))

    print("Preprocessing time :", time.time() - start)
Example #3
    def error_cleanup(self, input_schema_name, input_table_name, run_id, path=None, conn_metadata=None, conn_source=None, conn_target=None):
        method_name = self.class_name + ": " + "error_cleanup"
        print_hdr = "[" + method_name + ": " + self.data_path + ": " + str(self.load_id) + "] - "
        print(print_hdr + "Entered")

        if path is None:
            path = self.config_list['misc_hdfsStagingPath']
        remove_files(path, input_schema_name, input_table_name)

        if input_table_name is not None:
            self.update_control(input_schema_name, input_table_name, self.CONTROL_STATUS_ERROR, run_id)

        if conn_metadata is not None and not conn_metadata.closed:
            conn_metadata.close()

        if conn_source is not None and not conn_source.closed:
            conn_source.close()

        if conn_target is not None:
            conn_target.close()
Example #4
    def error_cleanup(self,
                      input_schema_name,
                      input_table_name,
                      run_id,
                      path=None,
                      conn_metadata=None,
                      conn_source=None,
                      conn_target=None,
                      target_path=None):
        method_name = self.class_name + ": " + "error_cleanup"
        print_hdr = "[" + method_name + ": " + self.data_path + ": " + str(
            self.load_id) + "] - "
        print(logdt.now().strftime('[%Y-%m-%d %H:%M:%S] ') + print_hdr +
              "Entered")

        if path is None:
            path = self.config_list['misc_hdfsStagingPath']
        if self.data_path.find("GP2HDFS") <> -1 or self.data_path.find(
                "HDFS2MIR") <> -1:
            remove_files(path, input_schema_name, input_table_name)

        if self.data_path.find("KFK2Hive") <> -1:
            if path is not None and target_path is not None:
                move_hdfs_files(path, target_path)

        if self.data_path.find("SRC2Hive") <> -1:
            if path is not None and target_path is not None:
                move_files(path, (target_path + input_schema_name))

        if input_table_name is not None:
            self.update_control(input_schema_name, input_table_name,
                                self.CONTROL_STATUS_ERROR, run_id)

        if conn_metadata is not None and not conn_metadata.closed:
            conn_metadata.close()

        if conn_source is not None and not conn_source.closed:
            conn_source.close()

        if conn_target is not None:
            conn_target.close()
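
Both error_cleanup variants call remove_files(path, schema, table) to drop the staging data left behind for one table. The helper is not shown; a hypothetical sketch, assuming the data sits on HDFS under <path>/<schema>/<table> and the hdfs CLI is available:

import subprocess

def remove_files(path, schema_name, table_name):
    # Assumed layout: staging data for a table lives under path/schema/table on HDFS.
    staging_dir = '{0}/{1}/{2}'.format(path.rstrip('/'), schema_name, table_name)
    subprocess.call(['hdfs', 'dfs', '-rm', '-r', '-f', staging_dir])
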
Example #5
def save_report(data, outfile):
    # delete previous report
    if not remove_files([outfile]):
        raise OverloadError('Unable to delete previous default '
                            'validation report: {}.'.format(outfile))

    report = []
    for k, v in data.iteritems():
        report.append('\n{} - barcode dups:'.format(k))
        dups = []
        for f, p in v:
            dups.append('\tfile: {} -- record no:{}'.format(f, p))
        report.append('\n'.join(sorted(dups)))

    if report == []:
        report = ['No errors found']
    try:
        with open(outfile, 'w') as file:
            file.write('\n'.join(report))
    except IOError as e:
        raise OverloadError(
            'Unable to create a new default validation report. '
            'Error: {}'.format(e))
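
Here (and in Examples #10 and #11 below) remove_files is treated as returning a success flag rather than raising. A plausible sketch of that variant, assuming it simply reports whether every existing file could be deleted:

import os

def remove_files(paths):
    # Return False if any existing file cannot be deleted, True otherwise.
    try:
        for path in paths:
            if os.path.isfile(path):
                os.remove(path)
        return True
    except OSError:
        return False
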
Example #6
    sheet = sheet.active
    data = return_data(sheet, ['A', 'B'], '')
    return_my_sudents(data, argvs[3], argvs[4])

if argvs[1] == '8':
    submission_folder = argvs[2]
    sandbox = argvs[3]
    output_file = argvs[4]
    command = 'make ' + argvs[5]
    check_homework(submission_folder, sandbox, command, output_file)

if argvs[1] == '9':
    source_folder = argvs[2]
    student_list = argvs[3]
    get_students(source_folder, student_list, 'left_to_grade', False)

if argvs[1] == '10':
    submission_folder = argvs[2]
    basefile = argvs[3]
    language = 'c'
    send_moss_dir_mode(submission_folder, basefile, language)

if argvs[1] == '1000':
    remove_files()

# Set the grade column
# gradeCol = 11
# Make all naughty ones red in the .xlsx
# num = above_threshold('Catalog.xlsx', data, 0, gradeCol)
# print(num)
Example #7
    def remove_files(self):
        ut.remove_files(self.file_names)
Example #8
def main():

    options, logger = parse_args(argv)
    check_executables_in_path(options, logger)

    outdir = path.abspath(options.out_dir)
    options.hmm_out_dir = '%s/hmmsearchresults' % (outdir)
    options.res_dir = '%s/retrievedFragments' % (outdir)
    options.final_gene_dir = '%s/predictedGenes' % (outdir)
    options.assembly_dir = '%s/spades_assembly' % (outdir)
    if not options.tmp_dir:
        options.tmp_dir = '%s/tmpdir' % (outdir)

    if path.isdir(options.out_dir) and not options.force:
        msg = ('The directory {0} already exists. To overwrite use the'
               ' --force flag').format(options.out_dir)
        logger.error(msg)
        logger.info('Exiting pipeline')
        exit()
    elif path.isdir(options.out_dir) and options.force:
        if not utils.remove_files(options.final_gene_dir, options.res_dir):
            logger.info('Exiting pipeline')
            exit()
    else:
        utils.create_dir(options.out_dir)

    for infile in options.infiles:
        if not path.isfile(infile):
            msg = 'The provided input file {0} does not exist.'.format(infile)
            logger.critical(msg)
            logger.info('Exiting pipeline')
            exit()

    check_arguments(options, logger)

    utils.create_dir(options.hmm_out_dir)
    utils.create_dir(options.tmp_dir)
    utils.create_dir(options.final_gene_dir)

    if options.meta:
        utils.create_dir(options.res_dir)
        if not options.no_quality_filtering:
            options.trimmed_dir = '%s/trimmedReads' % (path.abspath(
                options.res_dir))
            utils.create_dir(options.trimmed_dir)

    summaryFile = '%s/results_summary.txt' % (outdir)
    Results = ResultsSummary(summaryFile, len(options.infiles),
                             options.hmm_model)

    logger.info('Starting fARGene')
    logger.info('Starting pipeline, planning to analyze %s files',
                len(options.infiles))
    logger.info('Running on %s processes' % str(options.processes))

    if not options.meta:
        parse_fasta_input(options, Results, logger)
        Results.write_summary(False)
        numGenes = Results.retrievedSequences
        retrieved = 'possible genes'
    else:
        options.protein = False
        parse_fastq_input(options, Results, logger)
        Results.write_summary(True)
        numGenes = Results.retrievedContigs
        retrieved = 'retrieved contigs'
    logger.info('Done with pipeline')

    msg = ('fARGene is done.\n'
           'Total number of {}: {}\n'
           'Total number of predicted ORFS longer than {} nt: {}\n'
           'Output can be found in {}').format(retrieved, numGenes,
                                               options.min_orf_length,
                                               Results.predictedOrfs, outdir)
    logger.info(msg)
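
In the --force branch above, utils.remove_files receives two output directories and its boolean result decides whether the pipeline continues, so it presumably clears whole directories. A sketch under that assumption (the signature is a guess):

import os
import shutil

def remove_files(*directories):
    # Assumed behaviour: wipe each existing output directory,
    # reporting False if anything cannot be removed.
    try:
        for directory in directories:
            if os.path.isdir(directory):
                shutil.rmtree(directory)
        return True
    except OSError:
        return False
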
Example #9
if __name__ == '__main__':

    ### Change items here ##############################################
    # get structurals
    datadir = '/home/jagust/graph/data/mri1.5/tr220'
    anatstr = 'B*/raw/B*_anat.nii.gz'
    anatomicals = get_files_old_only(datadir, anatstr)
    ####################################################################

    # run dartel on cohort
    
    gms = utils.get_files(datadir, 'B*/despike_ants_realign/coreg_masks/aparcaseg.nii.gz')
    wms = utils.get_files(datadir, 'B*/despike_ants_realign/coreg_masks/B*_WM_mask.nii.gz')
    gms.sort()
    wms.sort()
    files = []
    pth, nme, ext = filemanip.split_filename(gms[0])
    datestr = utils.make_datestr()
    tmplt_nme = 'dartel_%s'%(datestr)
    templatedir = '/home/jagust/graph/data/mri1.5/tr220/template'
    dout = spm_dartel_make(gms, wms, templatedir, tmplt_nme)
    
    template = get_files_old_only(datadir, 'B*/anat/vbm8/%s*' % (tmplt_nme))

    templatedir, exists = utils.make_dir(datadir,'template')
    newtemplate = utils.copy_files(template, templatedir)
    utils.remove_files(template)
    flowfieldstmp = utils.get_files(datadir, '*/anat/vbm8/*%s*' % (tmplt_nme))
    flowfields = move_flowfields(flowfieldstmp)
    dartellog = write_dartel_log(newtemplate, flowfields)
Example #10
def run_processing(files, system, library, agent, api_type, api_name, template,
                   output_directory, progbar, current_process_label):
    """
    args:
        template: instance of NYPLOrderTemplate class
    """

    # agent argument is a 3-letter code

    module_logger.debug('PVR process launched.')

    # tokens and sessions are opened on this level

    # determine destination API
    if api_type == 'Platform API':
        module_logger.debug('Creating Platform API session.')
        try:
            session = open_platform_session(api_name)
        except OverloadError:
            raise
    elif api_type == 'Z3950':
        module_logger.debug(
            'retrieving Z3950 settings for {}'.format(api_name))
        user_data = shelve.open(USER_DATA)
        target = user_data['Z3950s'][api_name]
        user_data.close()

    elif api_type == 'Sierra API':
        module_logger.debug('Connecting to Sierra API')

    # clean-up batch metadata & stats
    if not template:
        template_name = None
    else:
        template_name = template.tName
    module_logger.debug('Opening BATCH_META.')
    batch = shelve.open(BATCH_META, writeback=True)
    module_logger.debug('BATCH_META has been emptied from previous content.')
    timestamp = datetime.now()
    batch['timestamp'] = timestamp
    batch['system'] = system
    batch['library'] = library
    batch['agent'] = agent
    batch['template'] = template_name
    batch['file_names'] = files
    batch.close()
    module_logger.debug('BATCH_META new data: {}, {}, {}, {}, {}, {}'.format(
        timestamp, system, library, agent, template_name, files))

    stats = shelve.open(BATCH_STATS, writeback=True)
    stats.clear()

    if not remove_files(BARCODES):
        module_logger.error(
            'Unable to empty BARCODES storage at location {}'.format(BARCODES))
        raise OverloadError('Unable to delete barcodes from previous batch.')
    else:
        module_logger.debug(
            'BATCH_STATS has been emptied from previous content.')

    # determine output mrc file name handles
    if agent == 'cat':
        date_today = date.today().strftime('%y%m%d')
        fh_dups = os.path.join(output_directory,
                               '{}.DUP-0.mrc'.format(date_today))
        fh_new = os.path.join(output_directory,
                              '{}.NEW-0.mrc'.format(date_today))

        # delete existing files to start over from scratch
        if not remove_files([fh_new, fh_dups]):
            module_logger.warning(
                'Unable to delete PVF output files from previous batch.')
            raise OverloadError(
                'Unable to delete output files from previous batch.')

    elif agent in ('sel', 'acq'):
        # remove mrc extension if it exists
        tail = os.path.split(files[0])[1]
        if tail[-4:] == '.mrc':
            tail = tail[:-4]
        tail = '{}.PRC-0.mrc'.format(tail)
        fh = os.path.join(output_directory, tail)

        # delete existing files to start over from scratch
        if not remove_files([fh]):
            module_logger.warning(
                'Unable to delete PVF output files from previous batch.')
            raise OverloadError(
                'Unable to delete output files from previous batch.')

    # create reference index
    module_logger.debug('Creating vendor index data for {}-{}'.format(
        system, agent))
    if agent == 'cat':
        rules = './rules/cat_rules.xml'
        vx = vendor_index(rules, system)  # wrap in exception?
    elif agent in ('sel', 'acq'):
        if system == 'nypl':
            query_matchpoints = dict()
            try:
                if template.match1st == 'sierra_id':
                    query_matchpoints['primary'] = ('id', template.match1st)
                else:
                    query_matchpoints['primary'] = ('tag', template.match1st)
                if template.match2nd is not None:
                    if template.match2nd == 'sierra_id':
                        query_matchpoints['secondary'] = ('id',
                                                          template.match2nd)
                    else:
                        query_matchpoints['secondary'] = ('tag',
                                                          template.match2nd)
                if template.match3rd is not None:
                    if template.match3rd == 'sierra_id':
                        query_matchpoints['tertiary'] = ('id',
                                                         template.match3rd)
                    else:
                        query_matchpoints['tertiary'] = ('tag',
                                                         template.match3rd)
            except NoResultFound:
                raise OverloadError('Unable to find template {}.\n'
                                    'Please verify it exists.'.format(
                                        template.tName))
            except AttributeError:
                raise OverloadError('Error while applying order template.')
        else:
            raise OverloadError(
                'selection workflow for BPL not implemented yet')

    # run queries and results analysis for each bib in each file
    n = 0
    f = 0
    for file in files:
        f += 1
        module_logger.debug(
            'Opening new MARC reader for file: {}'.format(file))
        reader = read_marc21(file)

        current_process_label.set('querying...')
        for bib in reader:
            n += 1

            if agent == 'cat':
                vendor = identify_vendor(bib, vx)

                try:
                    query_matchpoints = get_query_matchpoint(vendor, vx)
                    module_logger.debug(
                        'Cat vendor index has following query matchpoints: '
                        '{} for vendor {}.'.format(query_matchpoints, vendor))

                except KeyError:
                    module_logger.critical(
                        'Unable to match vendor {} with data '
                        'in cat vendor index'.format(vendor))
            elif agent in ('sel', 'acq'):
                # vendor code
                if system == 'nypl':
                    vendor = template.vendor
                    if vendor is None:
                        # do not apply but keep for stats
                        vendor = 'UNKNOWN'

            if vendor == 'UNKNOWN':
                module_logger.debug(
                    'Encountered unidentified vendor in record # : {} '
                    'in file {} (system={}, library={}, agent={})'.format(
                        n, file, system, library, agent))

            # determine vendor bib meta
            meta_in = VendorBibMeta(bib, vendor=vendor, dstLibrary=library)
            module_logger.info('Vendor bib meta: {}'.format(str(meta_in)))

            # store barcodes found in vendor files for verification
            module_logger.debug('Storing barcodes for verification.')
            with open(BARCODES, 'a') as barcodes_file:
                for b in meta_in.barcodes:
                    barcodes_file.write(b + '\n')

            # Platform API workflow
            if api_type == 'Platform API':
                matchpoint = query_matchpoints['primary'][1]
                module_logger.debug(
                    'Using primary matchpoint: {}.'.format(matchpoint))
                try:
                    result = run_platform_queries(api_type, session, meta_in,
                                                  matchpoint)

                except APITokenExpiredError:
                    module_logger.info(
                        'Platform token expired. '
                        'Requesting new one and opening new session.')
                    session = open_platform_session(api_name)
                    result = platform_queries_manager(api_type, session,
                                                      meta_in, matchpoint)

                # run_platform_queries returns a tuple (status, response in json)
                meta_out = []

                if result[0] == 'hit':
                    meta_out = platform2meta(result[1])

                elif result[0] == 'nohit':
                    # requery with alternative matchpoint
                    if 'secondary' in query_matchpoints:
                        matchpoint = query_matchpoints['secondary'][1]
                        module_logger.debug(
                            'Using secondary matchpoint: {}.'.format(
                                matchpoint))

                        # run platform request for the secondary matchpoint
                        try:
                            result = run_platform_queries(
                                api_type, session, meta_in, matchpoint)

                        except APITokenExpiredError:
                            module_logger.info(
                                'Requesting new Platform token. '
                                'Opening new session.')

                            session = open_platform_session(api_name)
                            result = run_platform_queries(
                                api_type, session, meta_in, matchpoint)
                            # other exceptions raised in run_platform_queries

                        if result[0] == 'hit':
                            meta_out = platform2meta(result[1])
                        elif result[0] == 'nohit':
                            # run query for the 3rd matchpoint
                            if 'tertiary' in query_matchpoints:
                                matchpoint = query_matchpoints['tertiary'][1]
                                module_logger.debug(
                                    'Using tertiary matchpoint: {}.'.format(
                                        matchpoint))

                                # run platform request for the tertiary
                                # matchpoint
                                try:
                                    result = run_platform_queries(
                                        api_type, session, meta_in, matchpoint)

                                except APITokenExpiredError:
                                    module_logger.info(
                                        'Requesting new Platform token. '
                                        'Opening new session.')

                                    session = open_platform_session(api_name)
                                    result = run_platform_queries(
                                        api_type, session, meta_in, matchpoint)
                                if result[0] == 'hit':
                                    meta_out = platform2meta(result[1])
                                elif result[0] == 'error':
                                    raise OverloadError(
                                        'Platform server error.')
                        elif result[0] == 'error':
                            raise OverloadError('Platform server error.')
                    else:
                        module_logger.debug(
                            'No secondary matchpoint specified. '
                            'Ending queries.')
                elif result[0] == 'error':
                    raise OverloadError('Platform server error.')

            # queries performed via Z3950
            elif api_type == 'Z3950':
                meta_out = []
                matchpoint = query_matchpoints['primary'][1]
                module_logger.debug(
                    'Using primary matchpoint: {}.'.format(matchpoint))
                status, bibs = z3950_query_manager(target, meta_in, matchpoint)
                if status == 'hit':
                    meta_out = bibs2meta(bibs)
                elif status == 'nohit' and \
                        'secondary' in query_matchpoints:
                    matchpoint = query_matchpoints['secondary'][1]
                    module_logger.debug(
                        'Using secondary matchpoint: {}'.format(matchpoint))
                    status, bibs = z3950_query_manager(target, meta_in,
                                                       matchpoint)
                    if status == 'hit':
                        meta_out = bibs2meta(bibs)
                    elif status == 'nohit' and \
                            'tertiary' in query_matchpoints:
                        matchpoint = query_matchpoints['tertiary'][1]
                        module_logger.debug(
                            'Using tertiary matchpoint: {}'.format(matchpoint))
                        status, bibs = z3950_query_manager(
                            target, meta_in, matchpoint)
                        if status == 'hit':
                            meta_out = bibs2meta(bibs)
                module_logger.info('Retrieved bibs meta: {}'.format(meta_out))

            # queries performed via Sierra API
            elif api_type == 'Sierra API':
                module_logger.error('Sierra API is not implemented yet.')
                raise OverloadError('Sierra API is not implemented yet.')
            else:
                module_logger.error('Invalid api_type')
                raise OverloadError('Invalid api_type encountered.')

            if system == 'nypl':
                analysis = PVR_NYPLReport(agent, meta_in, meta_out)
            elif system == 'bpl':
                analysis = PVR_BPLReport(agent, meta_in, meta_out)

            module_logger.debug('Analyzing query results and vendor bib')
            analysis = analysis.to_dict()

            # apply patches if needed
            try:
                bib = patches.bib_patches(system, library, agent, vendor, bib)
            except AssertionError as e:
                module_logger.warning(
                    'Unable to patch bib. Error: {}'.format(e))
                analysis['callNo_match'] = False

            module_logger.info('PVF analysis results: {}'.format(analysis))

            # save analysis to shelf for statistical purposes
            stats[str(n)] = analysis

            # output processed records according to analysis
            # add Sierra bib id if matched

            # enforce utf-8 encoding in MARC leader
            bib.leader = bib.leader[:9] + 'a' + bib.leader[10:]

            sierra_id_present = check_sierra_id_presence(system, bib)
            module_logger.debug(
                'Checking if vendor bib has Sierra ID provided: '
                '{}'.format(sierra_id_present))

            if not sierra_id_present and \
                    analysis['target_sierraId'] is not None:

                try:
                    module_logger.info(
                        'Adding target Sierra id ({}) MARC field '
                        'to vendor record {}.'.format(
                            analysis['vendor_id'],
                            analysis['target_sierraId']))
                    bib.add_field(
                        create_target_id_field(system,
                                               analysis['target_sierraId']))

                except ValueError as e:
                    module_logger.error(e)
                    raise OverloadError(e)

            # add fields from bib & order templates
            module_logger.debug(
                'Adding template field(s) to the vendor record.')

            if agent == 'cat':
                templates = vx[vendor].get('bib_template')
                module_logger.debug('Selected CAT templates for {}: {}'.format(
                    vendor, templates))
                for catTemp in templates:
                    # skip if present or always add
                    if catTemp['tag'] == '949' and \
                            analysis['action'] == 'attach':
                        pass
                    elif catTemp['option'] == 'skip':
                        if catTemp['tag'] not in bib:
                            module_logger.debug('Field {} not present, adding '
                                                'from template'.format(
                                                    catTemp['tag']))
                            new_field = create_field_from_template(catTemp)
                            bib.add_field(new_field)
                        else:
                            module_logger.debug(
                                'Field {} found. Skipping.'.format(
                                    catTemp['tag']))
                    elif catTemp['option'] == 'add':
                        module_logger.debug(
                            'Field {} being added without checking '
                            'if already present'.format(catTemp['tag']))
                        new_field = create_field_from_template(catTemp)
                        bib.add_field(new_field)

            elif agent in ('sel', 'acq'):
                # batch template details should be retrieved instead for the
                # whole batch = no need to pull it for each bib

                new_fields = []
                if '960' in bib:
                    for t960 in bib.get_fields('960'):
                        new_field = db_template_to_960(template, t960)
                        if new_field:
                            new_fields.append(new_field)
                    bib.remove_fields('960')
                else:
                    new_field = db_template_to_960(template, None)
                    if new_field:
                        new_fields.append(new_field)

                # add modified fields back to record
                for field in new_fields:
                    bib.add_field(field)

                new_fields = []
                if '961' in bib:
                    for t961 in bib.get_fields('961'):
                        new_field = db_template_to_961(template, t961)
                        if new_field:
                            new_fields.append(new_field)
                    # remove existing fields
                    # (will be replaced by modified ones)
                    bib.remove_fields('961')
                else:
                    new_field = db_template_to_961(template, None)
                    if new_field:
                        new_fields.append(new_field)

                # add modified fields to bib
                for field in new_fields:
                    bib.add_field(field)

                if template.bibFormat and \
                        not sierra_command_tag(bib) and \
                        agent == 'sel':
                    new_field = db_template_to_949(template.bibFormat)
                    bib.add_field(new_field)
                    # it's safer for acquisition to skip command in 949 -
                    # there are conflicts with Import Invoices load table

            # apply bibliographic default location to NYPL brief records
            if system == 'nypl' and agent == 'sel':
                bib = set_nypl_sierra_bib_default_location(library, bib)

            # append to appropriate output file
            if agent == 'cat':
                if analysis['action'] == 'attach':
                    module_logger.debug(
                        'Appending vendor record to the dup file.')
                    write_marc21(fh_dups, bib)
                else:
                    module_logger.debug(
                        'Appending vendor record to the new file.')
                    write_marc21(fh_new, bib)
            else:
                module_logger.debug('Appending vendor record to a prc file.')
                write_marc21(fh, bib)

            # update progbar
            progbar['value'] = n
            progbar.update()

    # dedup new cataloging file
    if agent == 'cat' and os.path.isfile(fh_new):
        current_process_label.set('deduping...')

        dups, combined_count, deduped_fh = dedup_marc_file(fh_new, progbar)

        batch = shelve.open(BATCH_META, writeback=True)
        batch['duplicate_bibs'] = '{} dups merged into {} bibs'.format(
            dups, combined_count)
        batch.close()

        # delete original file and rename deduped
        if deduped_fh is not None:
            try:
                os.remove(fh_new)
                os.rename(deduped_fh, fh_new)
            except WindowsError:
                raise OverloadError('Unable to manipulate deduped file')

    # validate integrity of processed files for cataloging
    files_out = []
    if agent == 'cat':
        if os.path.isfile(fh_dups):
            files_out.append(fh_dups)
        if os.path.isfile(fh_new):
            files_out.append(fh_new)

        valid, missing_barcodes = validate_processed_files_integrity(
            files_out, BARCODES)
        module_logger.debug(
            'Integrity validation: {}, missing_barcodes: {}'.format(
                valid, missing_barcodes))
        if not valid:
            module_logger.error(
                'Barcodes integrity error: {}'.format(missing_barcodes))

    batch = shelve.open(BATCH_META, writeback=True)
    processing_time = datetime.now() - batch['timestamp']

    module_logger.info(
        'Batch processing stats: system={}, library={}, agent={}, user={}, '
        'used template={}, file count={}, files={}, record count={}, '
        'processing time={}'.format(system, library, agent, USER_NAME,
                                    template_name, f,
                                    [os.path.split(file)[1]
                                     for file in files], n, processing_time))
    batch['processing_time'] = processing_time
    batch['processed_files'] = f
    batch['processed_bibs'] = n
    if agent == 'cat':
        batch['processed_integrity'] = valid
        batch['missing_barcodes'] = missing_barcodes
    batch.close()
    stats.close()

    # clean-up
    # close any open session if Platform or Sierra API has been used
    if api_type in ('Platform API', 'Sierra API') and session is not None:
        session.close()
        module_logger.debug('Closing API session.')

    if agent == 'cat' and not valid:
        raise OverloadError(
            'Duplicate or missing barcodes found in processed files.')
Example #11
def validate_files(system, agent, files, marcval=False, locval=False):

    valid_files = True
    # mandatory, default validation

    try:
        dup_barcodes = default.barcode_duplicates(files, system)
        if dup_barcodes != {}:
            valid_files = False
        default.save_report(dup_barcodes, DVAL_REP)
    except OverloadError as e:
        module_logger.error('Unable to create default validation report. '
                            'Error: {}'.format(e))
        raise OverloadError(e)

    # MARCEdit MARC syntax validation
    if marcval:
        module_logger.debug('Running MARCEdit validation.')
        # make sure MARCEdit is installed on the machine
        val_engine = marcedit.get_engine()
        if val_engine is None:
            # display error message
            raise OverloadError(
                'Failed to locate cmarcedit.exe or marcrules.txt\n'
                'files of MARCEdit program. Unable to complete\n'
                'MARC validation. Please uncheck the box if you\n'
                'still want to proceed.')
        else:
            cme = val_engine[0]
            rules = val_engine[1]
            report_q = MVAL_REP
            overwrite = True
            for file in files:
                file_q = file
                success_process = marcedit.validate(cme, file_q, report_q,
                                                    rules, overwrite)
                overwrite = False
                if success_process:
                    result = marcedit.validation_check(MVAL_REP)
                    if not result[0]:
                        valid_files = False
                else:
                    valid_files = False
                    raise OverloadError(
                        'Encountered a problem with the file:\n'
                        '{}.\nNot able to validate in MARCEdit'.format(file))

    # delete previous local spec report
    if not remove_files([LSPEC_REP]):
        module_logger.error(
            'Unable to delete previous local spec validation report.')
        raise OverloadError(
            'Unable to remove previous local spec validation report.')

    # local specification validation
    if locval:
        module_logger.debug('Local specs validation launch.')

        # define local specs rules for each system, agent, and vendor
        try:
            rules = './rules/vendor_specs.xml'
            specs = local_specs.local_specs(system, agent, rules)
        except AttributeError as e:
            module_logger.error('Unable to parse local specs rules. '
                                'Error: {}'.format(e))
            raise OverloadError(e)

        # run the local specs validation
        locval_passed, report = local_specs.local_specs_validation(
            system, files, specs)
        if not locval_passed:
            valid_files = False

        # save the report to a file so the last batch is always remembered.
        try:
            with open(LSPEC_REP, 'w') as file:
                file.write(report)
        except IOError as e:
            module_logger.error(
                'Encountered error while creating local specs validation'
                ' report. Error: {}'.format(e))
            raise OverloadError(
                'Unable to create local spec validation\nreport.')

    return valid_files