def main(
	root=("r", "", "E-library root"),
	check_head=("", False, "Perform HTTP HEAD request to url values")
):
	"""
	Validates bibliography over a bunch of rules.

	Walks the module-level `items` collection (parsed .bib entries),
	checks every entry against datamodel / booktype / field rules and
	prints a per-item error report plus a final summary.

	:param root: e-library root folder; must be an existing directory
	:param check_head: when True, URL validation also issues an HTTP HEAD request
	Exits with status 1 if root is inaccessible.
	Raises RuntimeError when parser-generated auxiliary fields are missing
	(that indicates a parser bug, not a data problem).
	"""
	if (len(root) == 0) or (not os.path.isdir(root)):
		print("Root folder is inaccessible")
		sys.exit(1)
	root = os.path.abspath(root)

	print("Going to process {0} items".format(len(items)))

	# Raw string + escaped dot: the original pattern left "." unescaped,
	# so any character before "bib" matched (e.g. "foo-bib:1").
	SOURCE_REGEXP = re.compile(r"(?P<basename>[_\-\w\.]+)\.bib:\d+")
	# Files that legitimately mix languages — langid check is skipped for them.
	MULTILANG_FILES = {"proceedings-spb", "proceedings-rothenfelser", "_missing", "_problems"}
	VALID_BOOKTYPES = {
		"book", "mvbook", "inproceedings", "proceedings",
		"reference", "mvreference", "periodical", "unpublished",
		"thesis", "article"
	}
	NON_MULTIVOLUME_BOOKTYPES = {"article", "periodical"}
	MULTIVOLUME_BOOKTYPES = {"mvbook", "mvreference"}
	# don't validate filename for the given entrytypes
	MULTIENTRY_BOOKTYPES = {"proceedings", "inproceedings"}
	SHORTHAND_LIMIT = 25

	# magic constant: items issued after this year must carry one of
	# NON_ORIGINAL_KEYWORDS when their booktype is a research booktype
	LAST_ORIGINAL_YEAR = 1937
	NON_ORIGINAL_KEYWORDS = {"reissue", "research"}
	RESEARCH_BOOKTYPES = {"book", "mvbook"}
	UNPUBLISHED_NOTE_PREFIX = "Unpublished manuscript"

	erroneous_entries = 0
	errors_count = 0
	for item in items:
		errors = []

		# datamodel validation: pull every field once up front
		author = item.get("author")
		booktype = item.get("booktype")
		booktitle = item.get("booktitle")
		commentator = item.get("commentator")
		edition = item.get("edition")
		filename = item.get("filename")
		item_id = item.get("id")  # renamed from `id` to avoid shadowing the builtin
		isbn = item.get("isbn")
		institution = item.get("institution")
		journaltitle = item.get("journaltitle")
		keywords = set(item.get("keywords")) if item.get("keywords") else None
		langid = item.get("langid")
		location = item.get("location")
		note = item.get("note")
		number = item.get("number")
		origlanguage = item.get("origlanguage")
		publisher = item.get("publisher")
		series = item.get("series")
		shorthand = item.get("shorthand")
		source = item.get("source")
		title = item.get("title")
		translator = item.get("translator")
		entry_type = item.get("type")  # renamed from `type` to avoid shadowing the builtin
		url = item.get("url")
		volume = item.get("volume")
		volumes = item.get("volumes")
		year = item.get("year")
		year_from = item.get("year_from")
		year_to = item.get("year_to")
		year_circa = item.get("year_circa")
		added_on = item.get("added_on")

		match = SOURCE_REGEXP.match(source)
		if not match:
			raise RuntimeError("Failed to parse 'source' for item ({id})".format(
				id=item_id
			))
		source_basename = match.group("basename")

		# These fields are produced by the parser itself — missing ones mean
		# the parser is broken, so fail hard instead of collecting an error.
		parser_obligatory = [item_id, booktype, source, year_from, year_to, year_circa]
		if not all(field is not None for field in parser_obligatory):
			raise RuntimeError("Parser hasn't generated all required auxiliary fields {fields}".format(
				fields=parser_obligatory
			))

		general_obligatory = [langid, year, title, added_on]
		if not all(general_obligatory):
			# message now lists added_on too — it was checked but not reported
			errors.append("Item doesn't define one of [langid, year, title, added_on]")

		translation_obligatory = [origlanguage, translator]
		if not utils.all_or_none(translation_obligatory):
			errors.append("All of [origlanguage, translator] must be present for translations")

		series_obligatory = [series, number]
		if not utils.all_or_none(series_obligatory):
			errors.append("All of [series, number] must be present for serial books")

		if not any([author, shorthand]):
			errors.append("'author' or 'shorthand' must be present")

		if (publisher is not None) and (location is None):
			errors.append("If publisher present, location must be present")

		# booktype validation
		booktype = booktype.lower()
		if booktype not in VALID_BOOKTYPES:
			errors.append("Invalid booktype ({booktype})".format(
				booktype=booktype
			))

		if booktype not in NON_MULTIVOLUME_BOOKTYPES:
			if (volume is not None) and (volumes is None):
				errors.append("If volume present, volumes must be present")

		if booktype in MULTIVOLUME_BOOKTYPES:
			if volumes is None:
				errors.append("volumes must be present for @{0}".format(booktype))

		if booktype == "article":
			if journaltitle is None:
				errors.append("journaltitle must be present for @article")

		if booktype == "inproceedings":
			if booktitle is None:
				# fixed typo'd message ("bootitle ... @inprocessing")
				errors.append("booktitle must be present for @inproceedings")

		if booktype == "thesis":
			if url is None:
				errors.append("url must be present for @thesis")
			if entry_type is None:
				errors.append("type must be present for @thesis")
			if institution is None:
				errors.append("institution must be present for @thesis")

		# data validation
		# author validation empty
		# booktitle validation empty
		# commentator
		if commentator is not None:
			if (keywords is None) or ("commentary" not in keywords):
				errors.append("Keywords should contain 'commentary' when commentator specified")

		if edition is not None:
			# edition should be greater than 1 (first editions omit the field)
			if edition <= 1:
				errors.append("Wrong edition {edition}".format(
					edition=edition
				))

		if volume is not None:
			# volume should be positive integer
			if volume <= 0:
				errors.append("Wrong volume {volume}".format(
					volume=volume
				))
			# compare only when both are set — avoids None comparison
			if volumes is not None:
				if volume > volumes:
					errors.append("Volume ({volume}) can't be greater than volumes ({volumes})".format(
						volume=volume,
						volumes=volumes
					))

		# filename validation
		if (filename is not None) and (booktype not in MULTIENTRY_BOOKTYPES):
			for filename_ in filename:
				# filename starts with "/" which will mix os.path.join up
				abspath = os.path.join(root, filename_[1:])
				# each filename should be accessible
				if not os.path.isfile(abspath):
					errors.append("File {filename_} is not accessible".format(
						filename_=filename_
					))
				# item should be searchable by its filename metadata
				metadata = utils.extract_metadata_from_file(filename_)
				# validating optional author, edition, tome
				# in case when item specifies value, but filename doesn't
				if not utils.all_or_none([metadata.get("author", None), author]):
					errors.append("File {filename_} and entry have different author specifications".format(
						filename_=filename_
					))
				if not utils.all_or_none([metadata.get("edition", None), edition]):
					errors.append("File {filename_} and entry have different edition specifications".format(
						filename_=filename_
					))
				if not utils.all_or_none([metadata.get("tome", None), any([volume, volumes])]):
					errors.append("File {filename_} and entry have different volume specifications".format(
						filename_=filename_
					))
				meta_keywords = metadata.get("keywords", None)
				if meta_keywords is not None:
					if ("incomplete" not in meta_keywords) and (source_basename == "_problems"):
						errors.append("Incomplete books should be stored in _problems.bib")
					meta_keywords.discard("incomplete")
					if len(meta_keywords) > 0:
						if keywords is None:
							# fixed unbalanced parenthesis in the message
							errors.append("No keywords specified (should be {meta_keywords})".format(
								meta_keywords=meta_keywords
							))
						elif not keywords >= meta_keywords:
							errors.append("Item keywords {keywords} do not match metadata keywords {meta_keywords}".format(
								keywords=keywords,
								meta_keywords=meta_keywords
							))
				search_ = utils.create_search_from_metadata(metadata)
				if not search_(item):
					errors.append("File {filename_} is not searchable by extracted params".format(
						filename_=filename_
					))

		# id validation
		if len(item_index["id"][item_id]) != 1:
			errors.append("Id is not unique")

		# isbn validation
		if isbn is not None:
			for isbn_ in isbn:
				correct, msg = utils.is_isbn_valid(isbn_)
				if not correct:
					errors.append("ISBN {isbn_} isn't valid: {msg}".format(
						isbn_=isbn_,
						msg=msg
					))

		# institution validation empty
		# journaltitle validation empty

		# keywords validation
		# if item was issued after LAST_ORIGINAL_YEAR, it should define keywords
		# (dead `if True:` wrapper removed)
		if (year_from > LAST_ORIGINAL_YEAR) and (booktype in RESEARCH_BOOKTYPES):
			if (keywords is None) or (len(keywords & NON_ORIGINAL_KEYWORDS) == 0):
				errors.append("Item was issued after {last_year}, but keywords don't define any of {keywords}".format(
					last_year=LAST_ORIGINAL_YEAR,
					keywords=NON_ORIGINAL_KEYWORDS
				))
		if keywords is not None:
			if ("translation" in keywords) and not all([translator, origlanguage]):
				errors.append("When 'translation' keyword specified, translator and origlanguage should be present")
			if ("commentary" in keywords) and not commentator:
				errors.append("When 'commentary' keyword specified, commentator should be present")

		# langid validation
		if source_basename not in MULTILANG_FILES:
			source_lang = const.LONG_LANG_MAP[source_basename]
			# item language should match source language
			if langid != source_lang:
				errors.append("Source language ({source_lang}) doesn't match item language ({langid})".format(
					source_lang=source_lang,
					langid=langid
				))

		# location validation empty

		# note validation: unpublished note prefix and @unpublished booktype
		# must appear together or not at all
		note_unpublished = (note is not None) and (note.startswith(UNPUBLISHED_NOTE_PREFIX))
		booktype_unpublished = (booktype == "unpublished")
		if not utils.all_or_none([note_unpublished, booktype_unpublished]):
			errors.append("For unpublished books, note should begin with [{note_prefix}] and booktype should be {booktype}".format(
				booktype="unpublished",
				note_prefix=UNPUBLISHED_NOTE_PREFIX
			))

		# number validation empty
		# origlanguage validation empty
		# publisher validation empty
		# series validation empty

		# shorthand validation
		if shorthand is not None:
			length = len(shorthand)
			if length > SHORTHAND_LIMIT:
				errors.append("The length of shorthand ({length}) should not exceed limit ({limit})".format(
					length=length,
					limit=SHORTHAND_LIMIT
				))
			if (author is None) and (not title.startswith(shorthand)):
				# fixed grammar in message ("begin with from shorthand")
				errors.append("Title ({title}) should begin with shorthand ({shorthand})".format(
					title=title,
					shorthand=shorthand
				))

		# source validation empty

		# title validation
		if title is not None:
			if ("  " in title):
				errors.append("Consecutive spaces in title")
			if ("\t" in title):
				errors.append("Tabs in title")
			if title.startswith(" ") or title.endswith(" "):
				errors.append("Title isn't stripped")

		# translator validation
		if translator is not None:
			if (keywords is None) or ("translation" not in keywords):
				errors.append("Keywords should contain 'translation' when 'translator' field specified")

		# type validation empty

		# url validation
		if url is not None:
			correct, msg = utils.is_url_valid(url, check_head)
			if not correct:
				errors.append("URL {url} isn't valid: {msg}".format(
					url=url,
					msg=msg
				))

		# volume validation empty
		# volumes validation empty
		# year validation empty

		# printing errors
		if len(errors) > 0:
			erroneous_entries += 1
			errors_count += len(errors)
			print("Errors for {id} ({source})".format(
				id=item_id,
				source=source
			))
			for error in errors:
				print(" " + error)

	print("Found {entries} erroneous entries ({errors} errors)".format(
		entries=erroneous_entries,
		errors=errors_count
	))
def main(
	max_count=("c", 100, "Maximum count of filenames to display"),
	root=("r", "", "E-library root")
):
	"""
	Suggests filename values for not-yet-digitized items.

	Scans the library tree for PDF files not referenced by any entry,
	matches each file back to an item via its filename metadata, and
	prints ready-to-paste `filename = {...}` lines for unique matches.
	"""
	if (len(root) == 0) or (not os.path.isdir(root)):
		print("Root folder is inaccessible")
		sys.exit(1)
	root = os.path.abspath(root)

	# filename in database is relative, but begins from /
	known_paths = {
		os.path.join(root, db_path[1:])
		for db_path in item_index["filename"].keys()
	}
	candidates = [
		pdf_path
		for pdf_path in utils.files_in_folder(root, "*.pdf", excludes=EXCLUDED_FOLDERS)
		if pdf_path not in known_paths
	]
	print("Going to process {0} items".format(len(items)))
	print("Going to process {0} files".format(len(candidates)))

	processed = 0
	matches_by_item = {}
	for pdf_path in candidates:
		relpath = "/" + os.path.relpath(pdf_path, root)
		metadata = utils.extract_metadata_from_file(pdf_path)
		item_search = utils.create_search_from_metadata(metadata)
		found_items = [candidate for candidate in items if item_search(candidate)]

		if not found_items:
			print("Nothing found for file '{relpath}'".format(
				relpath=relpath,
			))
		elif len(found_items) == 1:
			# accumulate every relpath that resolved to this single item
			matched = found_items[0]
			matches_by_item.setdefault(matched, set()).add(relpath)
		else:
			print("Found multiple items for '{relpath}':\n\t{sources}".format(
				sources=[candidate.source() for candidate in found_items],
				relpath=relpath
			))

		processed += 1
		if processed >= max_count:
			print("Reached maxcount. Exiting")
			break

	# one `filename = {...}` suggestion per uniquely matched item,
	# ordered by the item's source location
	for matched, paths in sorted(matches_by_item.items(), key=lambda pair: pair[0].source()):
		print("Filename for {id} ({source}):".format(
			id=matched.id(),
			source=matched.source(),
		))
		separator = " {0} ".format(config.parser.list_sep)
		print("filename = {{{relpath}}}".format(
			relpath=separator.join(sorted(paths))
		))

	if len(items) < max_count:
		for entry in sorted(items, key=lambda entry: entry.source()):
			print("Item isn't digitized: {id} ({source})".format(
				id=entry.id(),
				source=entry.source()))