def validate_single_filename(abspath, filename, item, errors):
    """
    Checks if file is accessible and matches item metadata
    """
    if not os.path.isfile(abspath):
        errors.add(
            "File [{abspath}] is not accessible".format(abspath=abspath))
    if not utils.isfile_case_sensitive(abspath):
        errors.add(
            "File [{abspath}] is not accessible in case-sensitive mode".format(
                abspath=abspath))

    booktype = item.get("booktype")
    validate_periodical_filename(filename, item, errors)
    validate_short_desription_filename(filename, item, errors)
    validate_etiquette_filename(filename, item, errors)

    if booktype in MULTIENTRY_BOOKTYPES:
        return

    metadata = utils.extract_metadata_from_file(filename)
    #validating optional author, edition, tome
    #in case when item specifies value, but filename does not
    optional_meta_fields = ["author"]
    if booktype:
        optional_meta_fields += [
            "edition",
            "volume",
            #For serial books, no number is present in metadata
            #Temporary disable check here
            #"number",
            "part"
        ]
    for meta_field in optional_meta_fields:
        if ((item.has(meta_field)) and (meta_field not in metadata)):
            errors.add(
                "Field {meta_field} is not specified in filename [{filename}]".
                format(meta_field=meta_field, filename=filename))

    meta_keywords = metadata.get("keywords", {})
    source_file = item.get("source_file")
    if ((const.META_INCOMPLETE in meta_keywords) and
            (source_file != "_problems.bib")):
        errors.add("Incomplete entries should be stored in _problems.bib")

    searches = utils.make_searches_from_metadata(metadata)
    for search_key, search_func in searches.items():
        if not search_func(item):
            errors.add(
                "Item is not searchable by {search_key} extracted from filename {abspath}.\n"
                " Item has: {item_value}\n"
                " Search has: {search_value}".format(
                    search_key=search_key,
                    item_value=item.get(search_key),
                    search_value=metadata[search_key],
                    abspath=abspath))
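#A minimal sketch of what utils.isfile_case_sensitive might look like; this is
#an illustrative assumption, not the repo's actual implementation. On
#case-insensitive filesystems os.path.isfile() matches regardless of letter
#case, so the basename is additionally compared against the real directory
#listing returned by os.listdir().
import os

def isfile_case_sensitive_sketch(abspath):
    directory, basename = os.path.split(abspath)
    return os.path.isfile(abspath) and (basename in os.listdir(directory))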
def check_url_validity(item, errors):
    """
    Checks url for validity
    """
    url = item.get("url")
    item_id = item.get("id")
    if url is None:
        return
    for number, single_url in enumerate(url):
        if not utils.is_url_valid(single_url, item):
            errors.add("Field url with value [{single_url}] and number {number} is wrong".format(
                single_url=single_url,
                number=number
            ))
        if not utils.is_url_self_served(single_url, item):
            continue
        match = utils.SELF_SERVED_URL_REGEXP.match(single_url)
        if not match:
            errors.add("Self-served url [{single_url}] doesn't match SELF_SERVED_URL_REGEXP".format(
                single_url=single_url
            ))
            continue
        if (match.group("item_id") != item_id):
            errors.add("Wrong item_id specified in self-served url")
            continue

        single_filename, single_filesize = utils.get_file_info_from_url(single_url, item)
        metadata = utils.extract_metadata_from_file(single_filename)
        #guard against a missing owner before splitting,
        #otherwise metadata.get("owner") can be None and .split() would fail
        owner_field = metadata.get("owner")
        if not owner_field:
            errors.add("Owner specification expected for self-served url #{number} (url={url}, filename={filename})".format(
                number=number,
                url=single_url,
                filename=single_filename
            ))
            continue
        owners = owner_field.split("+")
        for owner in owners:
            owner_fullname = config.parser.bookkeepers.get(owner)
            if owner_fullname:
                annotation = item.get("annotation")
                if ((not annotation) or (owner_fullname not in annotation)):
                    errors.add("Owner fullname ({owner_fullname}) should be present in annotation".format(
                        owner_fullname=owner_fullname
                    ))
def check_url_validity(item, errors):
    """
    Checks url for validity
    """
    url = item.get("url")
    item_id = item.get("id")
    booktype = item.get("booktype")
    if url is None:
        return
    for number, single_url in enumerate(url):
        if not utils.is_url_valid(single_url, item):
            errors.add("Field url with value [{single_url}] and number {number} is wrong".format(
                single_url=single_url,
                number=number
            ))
        match = utils.SELF_SERVED_URL_REGEXP.match(single_url)
        if not match:
            continue
        #inproceedings can have self-served url pointing
        #to entire full proceedings book
        #TODO: invent something like PARTIAL_BOOKTYPES
        if (booktype == "inproceedings"):
            continue
        if (match.group("item_id") != item_id):
            errors.add("Wrong item_id specified in self-served url")
            continue

        single_filename, single_filesize = utils.get_file_info_from_url(single_url, item)
        metadata = utils.extract_metadata_from_file(single_filename)
        owner = metadata.get("owner")
        if owner is None:
            errors.add("Owner specification expected for self-served url #{number} (url={url}, filename={filename})".format(
                number=number,
                url=single_url,
                filename=single_filename
            ))
            continue
        owner_fullname = config.parser.bookkeepers.get(owner)
        if owner_fullname:
            annotation = item.get("annotation")
            if ((not annotation) or (owner_fullname not in annotation)):
                errors.add("Owner fullname ({owner_fullname}) should be present in annotation".format(
                    owner_fullname=owner_fullname
                ))
def validate_url_validity(item, errors):
    """
    Checks url for validity
    """
    url = item.get("url")
    item_id = item.get("id")
    if url is None:
        return
    for number, single_url in enumerate(url):
        if not utils.is_url_valid(single_url, item):
            errors.add("Field url with value [{single_url}] and number {number} is wrong".format(
                single_url=single_url,
                number=number
            ))
        if not utils.is_url_self_served(single_url, item):
            continue
        match = utils.SELF_SERVED_URL_REGEXP.match(single_url)
        if not match:
            errors.add("Self-served url [{single_url}] doesn't match SELF_SERVED_URL_REGEXP".format(
                single_url=single_url
            ))
            continue
        if (match.group("item_id") != item_id):
            errors.add("Wrong item_id specified in self-served url")
            continue

        single_filename, single_filesize = utils.get_file_info_from_url(single_url, item)
        metadata = utils.extract_metadata_from_file(single_filename)
        #guard against a missing owner before splitting,
        #otherwise metadata.get("owner") can be None and .split() would fail
        owner_field = metadata.get("owner")
        if not owner_field:
            errors.add("Owner specification expected for self-served url #{number} (url={url}, filename={filename})".format(
                number=number,
                url=single_url,
                filename=single_filename
            ))
            continue
        owners = owner_field.split("+")
        for owner in owners:
            owner_fullname = config.parser.bookkeepers.get(owner)
            if owner_fullname:
                note = item.get("note")
                if ((not note) or (owner_fullname not in note)):
                    errors.add("Owner fullname ({owner_fullname}) should be present in note".format(
                        owner_fullname=owner_fullname
                    ))
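#Purely illustrative assumption of the shape that SELF_SERVED_URL_REGEXP
#could have; the real pattern is defined in utils and is not reproduced here.
#The only property the checks above rely on is a named "item_id" group that
#can be compared against the item's id.
import re

EXAMPLE_SELF_SERVED_URL_REGEXP = re.compile(
    r"https?://(?:www\.)?example\.org/books/(?P<item_id>[\w\-]+)\.pdf$"
)
#e.g. EXAMPLE_SELF_SERVED_URL_REGEXP.match(
#    "https://example.org/books/some_item_id.pdf").group("item_id") == "some_item_id"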
def check_single_filename(abspath, filename, item, errors):
    """
    Checks if file is accessible and matches item metadata
    """
    if not os.path.isfile(abspath):
        errors.add("File [{abspath}] is not accessible".format(
            abspath=abspath
        ))
    if not utils.isfile_case_sensitive(abspath):
        errors.add("File [{abspath}] is not accessible in case-sensitive mode".format(
            abspath=abspath
        ))

    booktype = item.get("booktype")
    check_periodical_filename(filename, item, errors)
    check_short_desription_filename(filename, item, errors)
    check_etiquette_filename(filename, item, errors)

    if booktype in MULTIENTRY_BOOKTYPES:
        return

    metadata = utils.extract_metadata_from_file(filename)
    #validating optional author, edition, tome
    #in case when item specifies value, but filename does not
    optional_meta_fields = [
        "author"
    ]
    if booktype:
        optional_meta_fields += [
            "edition",
            "volume",
            #For serial books, no number is present in metadata
            #Temporary disable check here
            #"number",
            "part"
        ]
    for meta_field in optional_meta_fields:
        if (
            (item.has(meta_field)) and
            (meta_field not in metadata)
        ):
            errors.add("Field {meta_field} is not specified in filename [{filename}]".format(
                meta_field=meta_field,
                filename=filename
            ))

    meta_keywords = metadata.get("keywords", {})
    source_file = item.get("source_file")
    if (
        (const.META_INCOMPLETE in meta_keywords) and
        (source_file != "_problems.bib")
    ):
        errors.add("Incomplete entries should be stored in _problems.bib")

    searches = utils.make_searches_from_metadata(metadata)
    for search_key, search_func in searches.items():
        if not search_func(item):
            errors.add(
                "Item is not searchable by {search_key} extracted from filename {abspath}.\n"
                " Item has: {item_value}\n"
                " Search has: {search_value}".format(
                    search_key=search_key,
                    item_value=item.get(search_key),
                    search_value=metadata[search_key],
                    abspath=abspath
                ))
def main(
    root=("r", "", "E-library root"),
    check_head=("", False, "Perform HTTP HEAD request to url values")
):
    """
    Validates bibliography against a set of rules
    """
    if (len(root) == 0) or (not os.path.isdir(root)):
        print("Root folder is inaccessible")
        sys.exit(1)
    root = os.path.abspath(root)

    print("Going to process {0} items".format(len(items)))

    SOURCE_REGEXP = re.compile(r"(?P<basename>[_\-\w\.]+)\.bib:\d+")
    MULTILANG_FILES = {"proceedings-spb", "proceedings-rothenfelser", "_missing", "_problems"}

    VALID_BOOKTYPES = {
        "book", "mvbook", "inproceedings", "proceedings",
        "reference", "mvreference", "periodical", "unpublished",
        "thesis", "article"
    }
    NON_MULTIVOLUME_BOOKTYPES = {"article", "periodical"}
    MULTIVOLUME_BOOKTYPES = {"mvbook", "mvreference"}
    #don't validate filename for the given entrytypes
    MULTIENTRY_BOOKTYPES = {"proceedings", "inproceedings"}
    SHORTHAND_LIMIT = 25

    #magic constant
    LAST_ORIGINAL_YEAR = 1937
    NON_ORIGINAL_KEYWORDS = {"reissue", "research"}
    RESEARCH_BOOKTYPES = {"book", "mvbook"}
    UNPUBLISHED_NOTE_PREFIX = "Unpublished manuscript"

    erroneous_entries = 0
    errors_count = 0
    for item in items:
        errors = []

        #datamodel validation
        author = item.get("author")
        booktype = item.get("booktype")
        booktitle = item.get("booktitle")
        commentator = item.get("commentator")
        edition = item.get("edition")
        filename = item.get("filename")
        id = item.get("id")
        isbn = item.get("isbn")
        institution = item.get("institution")
        journaltitle = item.get("journaltitle")
        keywords = set(item.get("keywords")) if item.get("keywords") else None
        langid = item.get("langid")
        location = item.get("location")
        note = item.get("note")
        number = item.get("number")
        origlanguage = item.get("origlanguage")
        publisher = item.get("publisher")
        series = item.get("series")
        shorthand = item.get("shorthand")
        source = item.get("source")
        title = item.get("title")
        translator = item.get("translator")
        type = item.get("type")
        url = item.get("url")
        volume = item.get("volume")
        volumes = item.get("volumes")
        year = item.get("year")
        year_from = item.get("year_from")
        year_to = item.get("year_to")
        year_circa = item.get("year_circa")
        added_on = item.get("added_on")

        match = SOURCE_REGEXP.match(source)
        if not match:
            raise RuntimeError("Failed to parse 'source' for item ({id})".format(
                id=id
            ))
        source_basename = match.group("basename")

        parser_obligatory = [id, booktype, source, year_from, year_to, year_circa]
        none_checker = lambda obj: obj is not None
        if not all(map(none_checker, parser_obligatory)):
            raise RuntimeError("Parser hasn't generated all required auxiliary fields {fields}".format(
                fields=parser_obligatory
            ))

        general_obligatory = [langid, year, title, added_on]
        if not all(general_obligatory):
            errors.append("Item doesn't define one of [langid, year, title, added_on]")

        translation_obligatory = [origlanguage, translator]
        if not utils.all_or_none(translation_obligatory):
            errors.append("All of [origlanguage, translator] must be present for translations")

        series_obligatory = [series, number]
        if not utils.all_or_none(series_obligatory):
            errors.append("All of [series, number] must be present for serial books")

        if not any([author, shorthand]):
            errors.append("'author' or 'shorthand' must be present")

        if (publisher is not None) and (location is None):
            errors.append("If publisher is present, location must be present")

        #booktype validation
        booktype = booktype.lower()
        if booktype not in VALID_BOOKTYPES:
            errors.append("Invalid booktype ({booktype})".format(
                booktype=booktype
            ))

        if (booktype not in NON_MULTIVOLUME_BOOKTYPES):
            if (volume is not None) and (volumes is None):
errors.append("If volume present, volumes must be present") if (booktype in MULTIVOLUME_BOOKTYPES): if volumes is None: errors.append("volumes must be present for @{0}".format(booktype)) if (booktype == "article"): if journaltitle is None: errors.append("journaltitle must be present for @article") if (booktype == "inproceedings"): if booktitle is None: errors.append("bootitle must be present for @inprocessing") if (booktype == "thesis"): if url is None: errors.append("url must be present for @thesis") if type is None: errors.append("type must be present for @thesis") if institution is None: errors.append("institution must be present for @thesis") #data validation #author validation empty #booktitle validation empty #commentator if commentator is not None: if (keywords is None) or ("commentary" not in keywords): errors.append("Keywords should contain 'commentary' when commentator specified") #filename validation if edition is not None: #edition should be greater than 1 if edition <= 1: errors.append("Wrong edition {edition}".format( edition=edition )) if volume is not None: #volume should be positive integer if volume <= 0: errors.append("Wrong volume {volume}".format( volume=volume )) if volumes is not None: if volume > volumes: errors.append("Volume ({volume}) can't be greater than volumes ({volumes})".format( volume=volume, volumes=volumes )) #filename validation if (filename is not None) and (booktype not in MULTIENTRY_BOOKTYPES): for filename_ in filename: #filename starts with "/" which will mix os.path.join up abspath = os.path.join(root, filename_[1:]) #each filename should be accessible if not os.path.isfile(abspath): errors.append("File {filename_} is not accessible".format( filename_=filename_ )) #item should be searchable by its filename metadata metadata = utils.extract_metadata_from_file(filename_) #validating optional author, edition, tome #in case when item specifies value, but filename doesn't if not utils.all_or_none([metadata.get("author", None), author]): errors.append("File {filename_} and entry have different author specifications".format( filename_=filename_ )) if not utils.all_or_none([metadata.get("edition", None), edition]): errors.append("File {filename_} and entry have different edition specifications".format( filename_=filename_ )) if not utils.all_or_none([metadata.get("tome", None), any([volume, volumes])]): errors.append("File {filename_} and entry have different volume specifications".format( filename_=filename_ )) meta_keywords = metadata.get("keywords", None) if meta_keywords is not None: if ("incomplete" not in meta_keywords) and (source_basename == "_problems"): errors.append("Incomplete books should be stored in _problems.bib") meta_keywords.discard("incomplete") if len(meta_keywords) > 0: if keywords is None: errors.append("No keywords specified (should be {meta_keywords}".format( meta_keywords=meta_keywords )) elif not keywords >= meta_keywords: errors.append("Item keywords {keywords} do not match metadata keywords {meta_keywords}".format( keywords=keywords, meta_keywords=meta_keywords )) search_ = utils.create_search_from_metadata(metadata) if not search_(item): errors.append("File {filename_} is not searchable by extracted params".format( filename_=filename_ )) #id validation empty if len(item_index["id"][id]) != 1: errors.append("Id is not unique") #isbn validation if isbn is not None: for isbn_ in isbn: correct, msg = utils.is_isbn_valid(isbn_) if not correct: errors.append("ISBN {isbn_} isn't valid: {msg}".format( isbn_=isbn_, msg=msg )) 
        #institution validation empty
        #journaltitle validation empty

        #keywords validation
        #if item was issued after LAST_ORIGINAL_YEAR, it should define keywords
        if (year_from > LAST_ORIGINAL_YEAR) and (booktype in RESEARCH_BOOKTYPES):
            if (keywords is None) or (len(keywords & NON_ORIGINAL_KEYWORDS) == 0):
                errors.append("Item was issued after {last_year}, but keywords don't define any of {keywords}".format(
                    last_year=LAST_ORIGINAL_YEAR,
                    keywords=NON_ORIGINAL_KEYWORDS
                ))
        if keywords is not None:
            if ("translation" in keywords) and not all([translator, origlanguage]):
                errors.append("When 'translation' keyword is specified, translator and origlanguage should be present")
            if ("commentary" in keywords) and not commentator:
                errors.append("When 'commentary' keyword is specified, commentator should be present")

        #langid validation
        if source_basename not in MULTILANG_FILES:
            source_lang = const.LONG_LANG_MAP[source_basename]
            #item language should match source language
            if langid != source_lang:
                errors.append("Source language ({source_lang}) doesn't match item language ({langid})".format(
                    source_lang=source_lang,
                    langid=langid
                ))

        #location validation empty

        #note validation
        note_unpublished = (note is not None) and (note.startswith(UNPUBLISHED_NOTE_PREFIX))
        booktype_unpublished = (booktype == "unpublished")
        if not utils.all_or_none([note_unpublished, booktype_unpublished]):
            errors.append("For unpublished books, note should begin with [{note_prefix}] and booktype should be {booktype}".format(
                booktype="unpublished",
                note_prefix=UNPUBLISHED_NOTE_PREFIX
            ))

        #number validation empty
        #origlanguage validation empty
        #publisher validation empty
        #series validation empty

        #shorthand validation
        if shorthand is not None:
            length = len(shorthand)
            if length > SHORTHAND_LIMIT:
                errors.append("The length of shorthand ({length}) should not exceed limit ({limit})".format(
                    length=length,
                    limit=SHORTHAND_LIMIT
                ))
            if (author is None) and (not title.startswith(shorthand)):
                errors.append("Title ({title}) should begin with shorthand ({shorthand})".format(
                    title=title,
                    shorthand=shorthand
                ))

        #source validation empty

        #title validation
        if title is not None:
            if ("  " in title):
                errors.append("Consecutive spaces in title")
            if ("\t" in title):
                errors.append("Tabs in title")
            if title.startswith(" ") or title.endswith(" "):
                errors.append("Title isn't stripped")

        #translator validation
        if translator is not None:
            if (keywords is None) or ("translation" not in keywords):
                errors.append("Keywords should contain 'translation' when 'translator' field is specified")

        #type validation empty

        #url validation
        if url is not None:
            correct, msg = utils.is_url_valid(url, check_head)
            if not correct:
                errors.append("URL {url} isn't valid: {msg}".format(
                    url=url,
                    msg=msg
                ))

        #volume validation empty
        #volumes validation empty
        #year validation empty

        #printing errors
        if len(errors) > 0:
            erroneous_entries += 1
            errors_count += len(errors)
            print("Errors for {id} ({source})".format(
                id=id,
                source=source
            ))
            for error in errors:
                print("    " + error)

    print("Found {entries} erroneous entries ({errors} errors)".format(
        entries=erroneous_entries,
        errors=errors_count
    ))
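#Sketch of the all-or-none invariant that the checks above rely on
#(e.g. for [origlanguage, translator] and [series, number]). This assumed
#version is illustrative only; the real utils.all_or_none may differ.
#The invariant holds when the given values are either all truthy or all falsy.
def all_or_none_sketch(values):
    return all(values) or not any(values)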
def main(
    max_count=("c", 100, "Maximum count of filenames to display"),
    root=("r", "", "E-library root")
):
    if (len(root) == 0) or (not os.path.isdir(root)):
        print("Root folder is inaccessible")
        sys.exit(1)
    root = os.path.abspath(root)

    #filename in database is relative, but begins from /
    file_modifier = lambda file_, root=root: os.path.join(root, file_[1:])
    filenames = set(map(file_modifier, item_index["filename"].keys()))

    files = utils.files_in_folder(root, "*.pdf", excludes=EXCLUDED_FOLDERS)
    files_filter = lambda file_: file_ not in filenames
    files = list(filter(files_filter, files))

    print("Going to process {0} items".format(len(items)))
    print("Going to process {0} files".format(len(files)))

    output_count = 0
    output_dict = dict()
    for file_ in files:
        relpath = "/" + os.path.relpath(file_, root)
        metadata = utils.extract_metadata_from_file(file_)
        item_search = utils.create_search_from_metadata(metadata)
        found_items = list(filter(item_search, items))
        found_count = len(found_items)
        if found_count == 0:
            print("Nothing found for file '{relpath}'".format(
                relpath=relpath,
            ))
        elif found_count == 1:
            item = found_items[0]
            if item in output_dict:
                output_dict[item].add(relpath)
            else:
                output_dict[item] = set([relpath])
        else:
            source_getter = lambda item: item.source()
            print("Found multiple items for '{relpath}':\n\t{sources}".format(
                sources=list(map(source_getter, found_items)),
                relpath=relpath
            ))

        output_count += 1
        if output_count >= max_count:
            print("Reached maxcount. Exiting")
            break

    sort_key = lambda pair: pair[0].source()
    for item, paths in sorted(output_dict.items(), key=sort_key):
        print("Filename for {id} ({source}):".format(
            id=item.id(),
            source=item.source(),
        ))
        print("filename = {{{relpath}}}".format(
            relpath=" {0} ".format(config.parser.list_sep).join(sorted(paths))
        ))

    sort_key = lambda item: item.source()
    if len(items) < max_count:
        for item in sorted(items, key=sort_key):
            print("Item isn't digitized: {id} ({source})".format(
                id=item.id(),
                source=item.source()))