Esempio n. 1
0
def main():
    """Build the validation report for one mvol record.

    Command-line arguments:
        record_number: identifier interpolated into every CSV file name.
        -g: optional directory holding the input CSVs; when omitted a
            default directory derived from ``record_number`` is used.

    Rows of ``mvol-<record_number>-issues.csv`` (first row skipped) are
    matched by their third column against the first column of
    ``mvol-<record_number>.csv`` (first row skipped).  One record per
    issue is added to the ``Output`` created for
    ``mvol-<record_number>-report.csv``.

    Returns:
        0 on completion, 131 if interrupted by ``KeyboardInterrupt``.
        Invalid arguments terminate through ``parser.error``.
    """
    try:
        parser = ArgumentParser(description="", epilog="")
        parser.add_argument("record_number", help="", action="store", type=str)
        parser.add_argument("-g", help="", action='store')
        args = parser.parse_args()
        record_number = args.record_number

        # Validate explicitly: ``assert`` statements vanish under
        # ``python -O``.  Raw string avoids the invalid ``\w`` escape.
        if not re_compile(r'^\w{4,}[-]?\w{1,}?$').match(record_number):
            parser.error("invalid record number: %s" % record_number)

        if args.g:
            root_path = abspath(args.g)
            if not exists(root_path):
                parser.error("directory does not exist: %s" % root_path)
        else:
            root_path = join(abspath('/media/sf_shared_with_ubuntu_guest_os'),
                             'mvol-%s' % record_number)

        scrc_input = Input(join(root_path,
                                'mvol-%s-issues.csv' % record_number))
        ondisk_input = Input(join(root_path, 'mvol-%s.csv' % record_number))
        # Drop the first row of each input (treated as the header).
        scrc_rows = list(scrc_input.data)[1:]
        ondisk_rows = list(ondisk_input.data)[1:]

        report = Output('mvol-%s-report.csv' % record_number)
        report.add_record(csv_headers)

        for scrc_row in scrc_rows:
            was_it_digitized = 'Y'
            scrc_id = scrc_row[2]
            # First matching on-disk row, or None.  Indexing ``[0]`` on an
            # empty match list raised IndexError and made the "missing"
            # branch below unreachable.
            ondisk_record = next(
                (row for row in ondisk_rows if row[0] == scrc_id), None)
            if ondisk_record is None:
                stderr.write("{scrc_id} does not appear on-disk.\n".
                             format(scrc_id=scrc_id))
                # Lead with the id so this row has the same five columns
                # as the rows written for issues that were found.
                report.add_record([scrc_id, 'N', 'n/a', 'n/a', 'n/a'])
                continue

            has_pdf = ondisk_record[8]
            if ondisk_record[6] == 'Y' or ondisk_record[7] == 'Y':
                has_structural_metadata = 'Y'
            else:
                has_structural_metadata = 'N'
            if was_it_digitized == 'Y' and has_structural_metadata == 'Y' \
               and has_pdf == 'Y':
                is_it_valid = 'Y'
            else:
                is_it_valid = 'N'

            # Format the file name before joining so a '%' inside
            # root_path cannot be mistaken for a format directive.
            issue_input = Input(join(root_path, 'issues',
                                     '%s-pages.csv' % scrc_id))
            # Any page row flagged 'N' in columns 2-4 invalidates the issue.
            invalid_pages = [(x[2], x[3], x[4]) for x in issue_input.data
                             if x[2] == 'N' or x[3] == 'N' or x[4] == 'N']
            if invalid_pages:
                is_it_valid = 'N'

            report.add_record([
                scrc_id, was_it_digitized, has_structural_metadata, has_pdf,
                is_it_valid,
            ])
        return 0
    except KeyboardInterrupt:
        return 131
def main():
    """Build the validation report for one mvol record.

    Command-line arguments:
        record_number: identifier interpolated into every CSV file name.
        -g: optional directory holding the input CSVs; when omitted a
            default directory derived from ``record_number`` is used.

    Rows of ``mvol-<record_number>-issues.csv`` (first row skipped) are
    matched by their third column against the first column of
    ``mvol-<record_number>.csv`` (first row skipped).  One record per
    issue is added to the ``Output`` created for
    ``mvol-<record_number>-report.csv``.

    Returns:
        0 on completion, 131 if interrupted by ``KeyboardInterrupt``.
        Invalid arguments terminate through ``parser.error``.
    """
    try:
        parser = ArgumentParser(description="", epilog="")
        parser.add_argument("record_number", help="", action="store", type=str)
        parser.add_argument("-g", help="", action='store')
        args = parser.parse_args()
        record_number = args.record_number

        # Validate explicitly: ``assert`` statements vanish under
        # ``python -O``.  Raw string avoids the invalid ``\w`` escape.
        if not re_compile(r'^\w{4,}[-]?\w{1,}?$').match(record_number):
            parser.error("invalid record number: %s" % record_number)

        if args.g:
            root_path = abspath(args.g)
            if not exists(root_path):
                parser.error("directory does not exist: %s" % root_path)
        else:
            root_path = join(abspath('/media/sf_shared_with_ubuntu_guest_os'),
                             'mvol-%s' % record_number)

        scrc_input = Input(join(root_path,
                                'mvol-%s-issues.csv' % record_number))
        ondisk_input = Input(join(root_path, 'mvol-%s.csv' % record_number))
        # Drop the first row of each input (treated as the header).
        scrc_rows = list(scrc_input.data)[1:]
        ondisk_rows = list(ondisk_input.data)[1:]

        report = Output('mvol-%s-report.csv' % record_number)
        report.add_record(csv_headers)

        for scrc_row in scrc_rows:
            was_it_digitized = 'Y'
            scrc_id = scrc_row[2]
            # First matching on-disk row, or None.  Indexing ``[0]`` on an
            # empty match list raised IndexError and made the "missing"
            # branch below unreachable.
            ondisk_record = next(
                (row for row in ondisk_rows if row[0] == scrc_id), None)
            if ondisk_record is None:
                stderr.write("{scrc_id} does not appear on-disk.\n".
                             format(scrc_id=scrc_id))
                # Lead with the id so this row has the same five columns
                # as the rows written for issues that were found.
                report.add_record([scrc_id, 'N', 'n/a', 'n/a', 'n/a'])
                continue

            has_pdf = ondisk_record[8]
            if ondisk_record[6] == 'Y' or ondisk_record[7] == 'Y':
                has_structural_metadata = 'Y'
            else:
                has_structural_metadata = 'N'
            if was_it_digitized == 'Y' and has_structural_metadata == 'Y' \
               and has_pdf == 'Y':
                is_it_valid = 'Y'
            else:
                is_it_valid = 'N'

            # Format the file name before joining so a '%' inside
            # root_path cannot be mistaken for a format directive.
            issue_input = Input(join(root_path, 'issues',
                                     '%s-pages.csv' % scrc_id))
            # Any page row flagged 'N' in columns 2-4 invalidates the issue.
            invalid_pages = [(x[2], x[3], x[4]) for x in issue_input.data
                             if x[2] == 'N' or x[3] == 'N' or x[4] == 'N']
            if invalid_pages:
                is_it_valid = 'N'

            report.add_record([
                scrc_id, was_it_digitized, has_structural_metadata, has_pdf,
                is_it_valid,
            ])
        return 0
    except KeyboardInterrupt:
        return 131
Esempio n. 3
0
def main():
    """Group listed files into mvol items and write summary CSVs.

    Command-line arguments:
        record_number: interpolated into the main CSV name
            (``mvol-<record_number>.csv``).
        file_list: path to a text file with one file path per line.

    Each listed file is wrapped in an ``Item``; when its path contains an
    ``mvol/XXXX/XXXX/XXXX`` segment it is attached, as a page or as a
    representation, to the grouping item for that identifier.  Afterwards
    one record per grouped item is added to the main ``Output`` and one
    record per page to a per-item ``<identifier>-pages.csv`` ``Output``.

    Returns:
        0 on completion, 1 if the file list cannot be read, 131 if
        interrupted by ``KeyboardInterrupt``.
    """
    repo_root = '/media/repo/repository/tr/'
    try:
        parser = ArgumentParser(description="", epilog="")
        parser.add_argument("record_number", help="", action="store")
        parser.add_argument("file_list", help="", action="store")
        args = parser.parse_args()
        g = Grouping()
        try:
            # Context manager closes the listing even if reading fails.
            with open(args.file_list, 'r') as listing:
                files_list = listing.readlines()
        except (IOError, OSError, UnicodeError):
            stderr.write("could not open file {flist}\n".
                         format(flist=args.file_list))
            return 1

        # Compile loop-invariant patterns once instead of per file.
        identifier_pattern = re_compile(r'(mvol)/(\w{4})/(\w{4})/(\w{4})')
        page_patterns = [re_compile(ppart) for ppart in page_num_file_parts]

        for n_file in files_list:
            fullpath_dir = dirname(n_file)
            relpath_dir = relpath(fullpath_dir, repo_root)
            accession = relpath_dir.split('/')[0]
            # The result was never used; the call is kept in case the
            # constructor has side effects -- TODO confirm and drop.
            AccessionDirectory(join(repo_root, accession), repo_root)
            a = Item(n_file)
            setattr(a, 'accession', accession)
            raw_file_path = a.get_file_path()
            the_file_path = raw_file_path.strip()
            canonical_file_path = \
                the_file_path.split(accession)[1].lstrip('/')
            matches = identifier_pattern.search(canonical_file_path)
            if not matches:
                logging.error("could not match file {filename}".
                              format(filename=the_file_path))
                continue

            identifier = '-'.join([matches.group(1), matches.group(2),
                                   matches.group(3), matches.group(4)])
            files_required = generate_pattern_list(identifier)
            item = g.does_it_need_a_new_item(identifier)
            for tup in files_required:
                label = tup[0]
                if not re_compile(tup[1]).search(raw_file_path):
                    continue
                is_page = False
                for page_pattern in page_patterns:
                    page_search = page_pattern.search(raw_file_path)
                    if page_search:
                        is_page = True
                        item.add_page(
                            a, label, page_search.group(1),
                            splitext(basename(raw_file_path))[0])
                if not is_page:
                    item.add_representation(a, label)
            g.sort_items()

        mvol_csv = "mvol-%s.csv" % args.record_number
        main_csv = Output(mvol_csv)
        main_csv.add_record(csv_headers)
        for n in g.items:
            if 'Thum' in n.identifier or 'tif' in n.identifier \
               or 'test' in n.identifier:
                continue
            id_parts = n.identifier.split('-')
            # ``pages`` is optional on an item; fall back to an empty list
            # so the len()/sorted() calls below cannot raise.
            pages = getattr(n, 'pages', None) or []
            has_pages = 'Y' if pages else 'N'
            has_new_struct = 'N'
            has_old_struct = 'N'
            struct_objids = []
            txt_objids = []
            new_mdata = getattr(n, 'struct', None)
            old_mdata = getattr(n, 'txt', None)

            if getattr(n, 'pdf', None):
                has_pdf = 'Y'
            else:
                has_pdf = 'N'
                # The previous code referenced ``errors`` and ``rep``,
                # neither of which is defined in this function.
                logging.error("{identifier} is missing pdf".
                              format(identifier=n.identifier))

            # New-style ("struct") metadata wins over old-style ("txt");
            # each may be a single Item or a list of Items.
            if new_mdata and isinstance(new_mdata, (list, Item)):
                has_new_struct = 'Y'
                mdata = new_mdata
            elif old_mdata and isinstance(old_mdata, (list, Item)):
                has_old_struct = 'Y'
                mdata = old_mdata
            else:
                mdata = []
            if isinstance(mdata, Item):
                mdata = [mdata]
            # Rebuilt for every item so a value from a previous iteration
            # can never leak into this one.
            struct_mdata_files = [x.get_file_path().strip() for x in mdata]

            for struct_mdata_file in struct_mdata_files:
                try:
                    with open(struct_mdata_file, 'r') as fp:
                        lines = fp.readlines()
                except UnicodeDecodeError:
                    stderr.write("{mfile} couldn't be opened\n".
                                 format(mfile=struct_mdata_file))
                    continue
                # Keep only three-column tab-separated lines and skip the
                # first of them.
                lines = [x for x in lines if len(x.split('\t')) == 3]
                objids = [x.replace('\t', '').strip('\n')
                          for x in lines[1:]]
                if has_new_struct == 'Y':
                    struct_objids = objids
                elif has_old_struct == 'Y':
                    txt_objids = objids

            record = [n.identifier, id_parts[0], id_parts[1],
                      id_parts[2], id_parts[3], ','.join(list(n.accessions)),
                      has_old_struct, has_new_struct, has_pdf,
                      has_pages, len(txt_objids),
                      len(struct_objids), len(pages)]
            main_csv.add_record(record)

            issue_csv_file_name = "%s-pages.csv" % n.identifier
            issue_csv = Output(issue_csv_file_name)
            issue_csv.add_record(page_csv_headers)
            for page in sorted(pages):
                # The third lookup used an undefined name ``p``.
                if getattr(page, 'pos', None) or \
                   getattr(page, 'alto', None) or \
                   getattr(page, 'xml', None):
                    ocr_there = "Y"
                else:
                    ocr_there = "N"
                jpeg_there = "Y" if getattr(page, 'jpeg', None) else "N"
                tiff_there = "Y" if getattr(page, 'tiff', None) else "N"
                try:
                    # Zero-pad the part after the first underscore to
                    # eight characters.
                    x = page.objectpage.split('_')[1].lstrip('0')
                    object_identifier = x.zfill(8)
                except (AttributeError, IndexError, TypeError):
                    # No underscore-separated suffix: use the value as is.
                    object_identifier = page.objectpage
                page_record = [object_identifier,
                               page.objectpage, n.identifier,
                               ocr_there, jpeg_there, tiff_there]
                issue_csv.add_record(page_record)
        return 0
    except KeyboardInterrupt:
        return 131