Beispiel #1
0
 def test_findDuplicateTitles_Lowercase(self):
     # Two entries whose titles differ only in letter case must be grouped together,
     # keyed by the lowercased title.
     mixedCaseTitle = 'Towards a New Test Environment'
     lowerCaseTitle = mixedCaseTitle.lower()
     bibSource = getStringEntries([
         {FIELD_TITLE: mixedCaseTitle},
         {FIELD_TITLE: lowerCaseTitle},
     ])

     # Expected: both generated entry keys (suffixes 0 and 1) under the lowercased title.
     expectedKeys = ['{}{}'.format(DEFAULT_KEY_START, index) for index in range(2)]
     expectedTitle2keys = {lowerCaseTitle: expectedKeys}

     # Reduce the found entries to their keys so they can be compared with the expectation.
     foundTitle2keys = {}
     for foundTitle, foundEntries in nanny.findDuplicateTitles(parse(bibSource)).items():
         foundTitle2keys[foundTitle] = [foundEntry.key for foundEntry in foundEntries]

     self.assertEqual(foundTitle2keys, expectedTitle2keys)
Beispiel #2
0
 def test_findDuplicateTitles_Braces2(self):
     # A title with brace-protected capitals and the same title without any braces
     # must be grouped together, keyed by the lowercased brace-free title.
     bracedTitle = 'Towards a {N}ew {T}est {E}nvironment'
     plainTitle = bracedTitle
     for brace in ('{', '}'):
         plainTitle = plainTitle.replace(brace, '')
     bibSource = getStringEntries([
         {FIELD_TITLE: bracedTitle},
         {FIELD_TITLE: plainTitle},
     ])

     # Expected: both generated entry keys (suffixes 0 and 1) under the normalized title.
     expectedKeys = ['{}{}'.format(DEFAULT_KEY_START, index) for index in range(2)]
     expectedTitle2keys = {plainTitle.lower(): expectedKeys}

     # Reduce the found entries to their keys so they can be compared with the expectation.
     foundTitle2keys = {}
     for foundTitle, foundEntries in nanny.findDuplicateTitles(parse(bibSource)).items():
         foundTitle2keys[foundTitle] = [foundEntry.key for foundEntry in foundEntries]

     self.assertEqual(foundTitle2keys, expectedTitle2keys)
def checkConsistency(entries, config):
    """Run all consistency checks that are enabled in ``config`` and print the findings.

    Every check is guarded by one attribute of ``config``. Checks that are not
    implemented yet only print ``NOT_IMPLEMENTED_PATTERN`` with the name of the check.
    Implemented checks print a headline, one line per finding and a blank line, and
    stay silent when there is nothing to report.

    :param entries: the bibliography entries to check; handed to the ``nanny`` helpers
        and indexed by entry key when reporting unsecured title characters
    :param config: object providing the attributes read below (``duplicateKeys``,
        ``duplicateTitles``, ``duplicateTitlesIgnoredTypes``, ``anyMissingFields``,
        ``missingRequiredFields``, ``missingOptionalFields``, ``unsecuredTitleChars``,
        ``unnecessaryBraces``, ``badPageNumbers``, ``inconsistentConferences``,
        ``incompleteNames``, ``ambiguousNames``, ``allcapsNames``,
        ``inconsistentLocations``, ``inconsistentInferrableInfo``)
    :return: None; all results are written to standard output
    """
    # Check for Duplicates #
    # Duplicate keys
    if config.duplicateKeys:
        print(NOT_IMPLEMENTED_PATTERN.format("Duplicate Keys"))
        # Sketch of the intended implementation:
        # duplicateKeys = nanny.findDuplicateKeys(entries)
        # if duplicateKeys:
        #     print(HEADLINE_PATTERN.format("Duplicate Keys"))
        #     for duplicateKey in duplicateKeys:
        #         print("Found duplicate key: {}".format(duplicateKey))
        #     print()

    # Duplicate titles
    # Todo: Add handling of acceptable cases, such as different editions of a book, preprints and talks.
    if config.duplicateTitles:
        title2duplicateEntries = nanny.findDuplicateTitles(entries, config.duplicateTitlesIgnoredTypes)
        if title2duplicateEntries:
            print(HEADLINE_PATTERN.format("Duplicate Titles"))
            # Only the grouped entries are needed; the title shown is the first entry's own title.
            for duplicateTitleEntries in title2duplicateEntries.values():
                keysString = getEnumerationString(duplicateTitleEntries)
                firstTitle = duplicateTitleEntries[0][nanny.FIELD_TITLE]
                print("Entries {} have the same title: {}".format(keysString, firstTitle))
            print()

    # Missing fields #
    if config.anyMissingFields:
        key2availability = nanny.getFieldAvailabilities(entries)
        if key2availability:
            # Collect the report before printing anything, so that neither the headline nor a bare
            # "Entry ..." line appears when all missing fields belong to a category that is switched off.
            missingFieldReports = []
            for key, availability in key2availability.items():
                missingRequiredFields = availability[nanny.FIELD_IS_REQUIRED_MISSING]
                missingOptionalFields = availability[nanny.FIELD_IS_OPTIONAL_MISSING]

                details = []
                if config.missingRequiredFields and missingRequiredFields:
                    details.append(("  Required missing: ", ', '.join(missingRequiredFields)))
                if config.missingOptionalFields and missingOptionalFields:
                    details.append(("  Optional missing: ", ', '.join(missingOptionalFields)))
                if details:
                    missingFieldReports.append((key, details))

            if missingFieldReports:
                print(HEADLINE_PATTERN.format("Missing fields"))
                for key, details in missingFieldReports:
                    print("Entry {}".format(key))
                    for label, fieldNames in details:
                        print(label, fieldNames)
                print()

    # Bad Formatting #
    # Unsecured uppercase characters in titles
    # Todo: Identify over-eager use of curly braces, e.g. across multiple words
    # Todo: Add option to prefer braces around full words instead of single characters
    # Todo: Improve search of unsecured characters to not break when double braces are used
    if config.unsecuredTitleChars:
        key2unsecuredChars = nanny.findUnsecuredUppercase(entries, field="title")
        if key2unsecuredChars:
            print(HEADLINE_PATTERN.format("Titles with uppercase characters that are not secured by curly braces"))
            for key in key2unsecuredChars:
                title = entries[key][nanny.FIELD_TITLE]
                print("Entry {} has unsecured uppercase characters: {}".format(key, title))
            print()

    # Unnecessary curly braces
    if config.unnecessaryBraces:
        print(NOT_IMPLEMENTED_PATTERN.format("unnecessary curly braces"))

    # Bad page numbers
    if config.badPageNumbers:
        badPageNumberEntries = nanny.findBadPageNumbers(entries, tolerateSingleHyphens=False)
        if badPageNumberEntries:
            print(HEADLINE_PATTERN.format("Entries with badly formatted page numbers"))
            for entry in badPageNumberEntries:
                print("Entry {} has bad page number format: {}".format(entry.key, entry[nanny.FIELD_PAGES]))
            print()

    # Inconsistent Formatting #
    # Inconsistent names for conferences
    if config.inconsistentConferences:
        print(NOT_IMPLEMENTED_PATTERN.format("inconsistent names for conferences"))

    # Incomplete name formatting (e.g. first name is initials only or missing middle names found in other entry)
    if config.incompleteNames:
        print(NOT_IMPLEMENTED_PATTERN.format("incomplete name formatting"))

    # Ambiguous name formatting (i.e. not following the "LAST, FIRST and LAST, FIRST" format)
    if config.ambiguousNames:
        print(NOT_IMPLEMENTED_PATTERN.format("ambigous name formatting"))

    # All-caps name formatting
    if config.allcapsNames:
        # One separate report section per person-name field.
        for field in nanny.PERSON_NAME_FIELDS:
            entrykey2CapsNames = nanny.findAllCapsName(entries, field)
            if entrykey2CapsNames:
                print(HEADLINE_PATTERN.format("{}s whose names are all-caps".format(field.capitalize())))
                for key, capsnames in entrykey2CapsNames.items():
                    for capsname in capsnames:
                        print("Entry {} has {}s which are all-caps: {}".format(key, field, capsname.pretty()))
                print()

    # Inconsistent location names
    if config.inconsistentLocations:
        print(NOT_IMPLEMENTED_PATTERN.format("inconsistent location names"))

    # Inconsistent inferrable information
    if config.inconsistentInferrableInfo:
        print(NOT_IMPLEMENTED_PATTERN.format("inconsistent inferrable information"))
Beispiel #4
0
def checkConsistency(entries, config):
    """Run all consistency checks that are enabled in ``config`` and print the findings.

    Every check is guarded by one attribute of ``config``. Checks that are not
    implemented yet only print ``NOT_IMPLEMENTED_PATTERN`` with the name of the
    check. Implemented checks print a headline, one line per finding and a blank
    line, and stay silent when there is nothing to report.

    :param entries: the bibliography entries to check; handed to the ``nanny``
        helpers and indexed by entry key when reporting unsecured title characters
    :param config: object providing the attributes read below (``duplicateKeys``,
        ``duplicateTitles``, ``duplicateTitlesIgnoredTypes``, ``anyMissingFields``,
        ``missingRequiredFields``, ``missingOptionalFields``,
        ``unsecuredTitleChars``, ``unnecessaryBraces``, ``badPageNumbers``,
        ``inconsistentConferences``, ``incompleteNames``, ``ambiguousNames``,
        ``allcapsNames``, ``inconsistentLocations``, ``inconsistentInferrableInfo``)
    :return: None; all results are written to standard output
    """
    # Check for Duplicates #
    # Duplicate keys
    if config.duplicateKeys:
        print(NOT_IMPLEMENTED_PATTERN.format("Duplicate Keys"))
        # Sketch of the intended implementation:
        # duplicateKeys = nanny.findDuplicateKeys(entries)
        # if duplicateKeys:
        #     print(HEADLINE_PATTERN.format("Duplicate Keys"))
        #     for duplicateKey in duplicateKeys:
        #         print("Found duplicate key: {}".format(duplicateKey))
        #     print()

    # Duplicate titles
    # Todo: Add handling of acceptable cases, such as different editions of a book, preprints and talks.
    if config.duplicateTitles:
        title2duplicateEntries = nanny.findDuplicateTitles(
            entries, config.duplicateTitlesIgnoredTypes)
        if title2duplicateEntries:
            print(HEADLINE_PATTERN.format("Duplicate Titles"))
            # Only the grouped entries are needed; the title shown is the
            # first entry's own title.
            for duplicateTitleEntries in title2duplicateEntries.values():
                keysString = getEnumerationString(duplicateTitleEntries)
                firstTitle = duplicateTitleEntries[0][nanny.FIELD_TITLE]
                print("Entries {} have the same title: {}".format(
                    keysString, firstTitle))
            print()

    # Missing fields #
    if config.anyMissingFields:
        key2availability = nanny.getFieldAvailabilities(entries)
        if key2availability:
            # Collect the report before printing anything, so that neither the
            # headline nor a bare "Entry ..." line appears when all missing
            # fields belong to a category that is switched off.
            missingFieldReports = []
            for key, availability in key2availability.items():
                missingRequiredFields = availability[
                    nanny.FIELD_IS_REQUIRED_MISSING]
                missingOptionalFields = availability[
                    nanny.FIELD_IS_OPTIONAL_MISSING]

                details = []
                if config.missingRequiredFields and missingRequiredFields:
                    details.append(("  Required missing: ",
                                    ', '.join(missingRequiredFields)))
                if config.missingOptionalFields and missingOptionalFields:
                    details.append(("  Optional missing: ",
                                    ', '.join(missingOptionalFields)))
                if details:
                    missingFieldReports.append((key, details))

            if missingFieldReports:
                print(HEADLINE_PATTERN.format("Missing fields"))
                for key, details in missingFieldReports:
                    print("Entry {}".format(key))
                    for label, fieldNames in details:
                        print(label, fieldNames)
                print()

    # Bad Formatting #
    # Unsecured uppercase characters in titles
    # Todo: Identify over-eager use of curly braces, e.g. across multiple words
    # Todo: Add option to prefer braces around full words instead of single characters
    # Todo: Improve search of unsecured characters to not break when double braces are used
    if config.unsecuredTitleChars:
        key2unsecuredChars = nanny.findUnsecuredUppercase(entries,
                                                          field="title")
        if key2unsecuredChars:
            print(
                HEADLINE_PATTERN.format(
                    "Titles with uppercase characters that are not secured by curly braces"
                ))
            for key in key2unsecuredChars:
                title = entries[key][nanny.FIELD_TITLE]
                print("Entry {} has unsecured uppercase characters: {}".format(
                    key, title))
            print()

    # Unnecessary curly braces
    if config.unnecessaryBraces:
        print(NOT_IMPLEMENTED_PATTERN.format("unnecessary curly braces"))

    # Bad page numbers
    if config.badPageNumbers:
        badPageNumberEntries = nanny.findBadPageNumbers(
            entries, tolerateSingleHyphens=False)
        if badPageNumberEntries:
            print(
                HEADLINE_PATTERN.format(
                    "Entries with badly formatted page numbers"))
            for entry in badPageNumberEntries:
                print("Entry {} has bad page number format: {}".format(
                    entry.key, entry[nanny.FIELD_PAGES]))
            print()

    # Inconsistent Formatting #
    # Inconsistent names for conferences
    if config.inconsistentConferences:
        print(
            NOT_IMPLEMENTED_PATTERN.format(
                "inconsistent names for conferences"))

    # Incomplete name formatting (e.g. first name is initials only or missing middle names found in other entry)
    if config.incompleteNames:
        print(NOT_IMPLEMENTED_PATTERN.format("incomplete name formatting"))

    # Ambiguous name formatting (i.e. not following the "LAST, FIRST and LAST, FIRST" format)
    if config.ambiguousNames:
        print(NOT_IMPLEMENTED_PATTERN.format("ambigous name formatting"))

    # All-caps name formatting
    if config.allcapsNames:
        # One separate report section per person-name field.
        for field in nanny.PERSON_NAME_FIELDS:
            entrykey2CapsNames = nanny.findAllCapsName(entries, field)
            if entrykey2CapsNames:
                print(
                    HEADLINE_PATTERN.format(
                        "{}s whose names are all-caps".format(
                            field.capitalize())))
                for key, capsnames in entrykey2CapsNames.items():
                    for capsname in capsnames:
                        print("Entry {} has {}s which are all-caps: {}".format(
                            key, field, capsname.pretty()))
                print()

    # Inconsistent location names
    if config.inconsistentLocations:
        print(NOT_IMPLEMENTED_PATTERN.format("inconsistent location names"))

    # Inconsistent inferrable information
    if config.inconsistentInferrableInfo:
        print(
            NOT_IMPLEMENTED_PATTERN.format(
                "inconsistent inferrable information"))