Example #1
def check_pubdate_without_month_selected(fields1, fields2, final_result, type_check, subfield_list, tag):
    """It checks if a pubdate without month is selected if other dates with month are present"""
    logger.info("        running check_pubdate_without_month_selected")

    # dates in format "YYYY-MM-DD"
    def has_valid_month(date_str):
        try:
            month = int(date_str[5:7])
            return month != 0
        except (TypeError, ValueError):
            return False

    field_with_month = 0
    # I extract only the subfields from the final result list of fields
    final_result_fields = [field[0] for field in final_result]
    for field in [field[0] for field in fields1 + fields2]:
        # non selected field
        if field not in final_result_fields:
            for subfield in field:
                # if I find a date with a valid month I increment the variable
                if subfield[0] in subfield_list:
                    if has_valid_month(subfield[1]):
                        field_with_month += 1
        # selected field
        else:
            for subfield in field:
                # if I find a date with a valid month I return False directly
                if subfield[0] in subfield_list:
                    if has_valid_month(subfield[1]):
                        return False
    if field_with_month > 0:
        manage_check_error(
            'Date without month selected while other one with month is present in field "%s"!' % tag, type_check, logger
        )
    return None
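All of these checks walk bibrecord-style structures. A minimal sketch of the shape they appear to assume (the subfield codes and values below are hypothetical, not taken from the source):

# Hypothetical bibrecord-style data: a field is a tuple whose first element is a
# list of (code, value) subfield pairs; fields1, fields2 and final_result are lists of such fields.
subfields = [('c', '2003-06-12'), ('t', 'date-published')]
field = (subfields, ' ', ' ', '', 7)       # (subfields, ind1, ind2, controlfield, global_position)
fields1 = [field]                          # candidate fields from the first origin
fields2 = []                               # candidate fields from the second origin
final_result = [field]                     # fields kept after the merge
subfield_list = ['c']                      # subfield codes the check inspects

# the "non selected" test above compares subfield lists by membership
final_result_fields = [f[0] for f in final_result]
print(field[0] not in final_result_fields)   # False: this field was selected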
Example #2
def check_longer_string_not_selected(fields1, fields2, final_result, type_check, subfield_list, tag):
    """"""
    logger.info("        running check_longer_string_not_selected")

    cur_max_len = 0
    max_len_field_not_sel = False

    # I extract only the subfields from the final result list of fields
    final_result_fields = [field[0] for field in final_result]
    for field in [field[0] for field in fields1 + fields2]:
        # non selected field
        if field not in final_result_fields:
            for subfield in field:
                # for a non-selected field, check if the subfield is longer than the current maximum
                if subfield[0] in subfield_list:
                    if len(subfield[1]) > cur_max_len:
                        # If so I update the variables
                        cur_max_len = len(subfield[1])
                        max_len_field_not_sel = True

        # selected field
        else:
            for subfield in field:
                # if a selected subfield is longer than the current maximum, the longest value was selected
                if subfield[0] in subfield_list:
                    if len(subfield[1]) > cur_max_len:
                        # If so I update the variables
                        cur_max_len = len(subfield[1])
                        max_len_field_not_sel = False
    if max_len_field_not_sel:
        manage_check_error('Longer field "%s" not selected!' % tag, type_check, logger)
    return None
Example #3
def check_uppercase_string_selected(fields1, fields2, final_result, type_check, subfield_list, tag):
    """"""
    logger.info("        running check_uppercase_string_selected")
    # counter of non-selected fields containing a lowercase string
    notsel_field_lower = 0
    # I extract only the subfields from the final result list of fields
    final_result_fields = [field[0] for field in final_result]
    for field in [field[0] for field in fields1 + fields2]:
        # non selected field
        if field not in final_result_fields:
            for subfield in field:
                # if I find a lowercase string I increment the counter
                if subfield[0] in subfield_list:
                    if not subfield[1].isupper():
                        notsel_field_lower += 1
        # selected field
        else:
            for subfield in field:
                # if I find a lowercase string in the selected field I return False directly
                if subfield[0] in subfield_list:
                    if not subfield[1].isupper():
                        return False
    if notsel_field_lower > 0:
        manage_check_error(
            'Upper case string selected instead of a lower case one in field "%s"!' % tag, type_check, logger
        )
    return None
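Here a "lower case" value is anything str.isupper() rejects, i.e. any string that is not entirely uppercase. A small illustration with hypothetical values:

# str.isupper() is True only when every cased character is uppercase
print('IBID'.isupper())   # True  -> counted as an uppercase string
print('Ibid'.isupper())   # False -> counted as a lowercase string by this check
print('ibid'.isupper())   # False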
Example #4
def check_author_from_shorter_list(fields1, fields2, final_result, type_check, subfield_list, tag):
    """Simply checks that the return list of authors is the longest possible.
        This check relies on the fact that we don't merge authors from different origins, but we simply add subfields
        for the ones we selected.
    """
    logger.info("        running check_author_from_shorter_list")

    # I select the longest list
    longer_list = fields1 if len(fields1) >= len(fields2) else fields2
    # if the returned list is shorter than the longest one, there is a problem
    if len(final_result) < len(longer_list):
        manage_check_error('Longer list of authors not selected in field "%s"!' % tag, type_check, logger)
    return None
Example #5
def check_collections_existence(merged_record, type_check):
    """Function that checks if there is at least one collection"""
    logger.info('      running check_collections_existence')
    try:
        collections_fields = merged_record[FIELD_TO_MARC['collection']]
    except KeyError:
        manage_check_error('No Collection field!', type_check, logger)
        return None
    if len(collections_fields) == 0:
        manage_check_error('No Collection field!', type_check, logger)
    return None
Example #6
def check_one_date_per_type(fields1, fields2, final_result, type_check, subfield_list, tag):
    """Function to check if there are multiple dates of the same type"""
    logger.info("        running check_one_date_per_type")

    # I extract all the dates grouped by date type
    date_types = {}
    for field in final_result:
        date_types.setdefault(bibrecord.field_get_subfield_values(field, subfield_list[0][1])[0], []).append(
            bibrecord.field_get_subfield_values(field, subfield_list[0][0])[0]
        )
    # then I check that these dates are unique per type
    for datet in date_types:
        if len(set(date_types[datet])) > 1:
            manage_check_error('Multiple dates for type "%s" in field "%s".' % (datet, tag), type_check, logger)
    return None
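The grouping above relies on dict.setdefault. A standalone sketch of the same pattern, using hypothetical (value, type) pairs instead of bibrecord fields:

# hypothetical (date value, date type) pairs extracted from the merged date fields
dates = [('2003-06-12', 'date-published'),
         ('2003-06-12', 'date-published'),
         ('2004-01-01', 'date-published')]

date_types = {}
for date_value, date_type in dates:
    date_types.setdefault(date_type, []).append(date_value)

# 'date-published' maps to two distinct values, so the check would report it
for date_type, values in date_types.items():
    print(date_type, len(set(values)) > 1)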
Example #7
def check_duplicate_normalized_author_names(fields1, fields2, final_result, type_check, subfield_list, tag):
    """
    Checks if there are authors with the same normalized name. This will
    prevent the correct matching of authors from one author list to the other.
    """
    logger.info("        running check_duplicate_normalized_author_names")

    author_names = set()
    for field in final_result:
        author = bibrecord.field_get_subfield_values(field, AUTHOR_NORM_NAME_SUBFIELD)[0]
        if author in author_names:
            # duplicated normalized author names are not treated as a fatal error:
            # the merger simply keeps the trusted list, so this is only reported
            manage_check_error(
                'Duplicated normalized author name for "%s" in field "%s".' % (author, tag), type_check, logger
            )
        else:
            author_names.add(author)
    return None
Example #8
def check_pub_year_consistency(merged_record, type_check):
    """Function that checks if the publication year is consistent 
    with the year at the beginning of the bibcode"""
    logger.info('      running check_pub_year_consistency')
    #definition of the list of dates I don't want to check with this function
    dates_to_skip_from_check = ['date-preprint']
    try:
        system_number_fields = merged_record[FIELD_TO_MARC['system number']]
    except KeyError:
        manage_check_error('No System Number field!', type_check, logger)
        return None
    try:
        pub_dates_fields = merged_record[FIELD_TO_MARC['publication date']]
    except KeyError:
        manage_check_error('No Publication Date field!', type_check, logger)
        return None
    #the system number field should be unique, so if there is more than one field, I have a problem (and I cannot proceed)
    if len(system_number_fields) > 1:
        manage_check_error('There are more than one System Numbers!', type_check, logger)
        return None
    system_number = bibrecord.field_get_subfield_values(system_number_fields[0], SYSTEM_NUMBER_SUBFIELD)[0]
    num_dates_checked = 0
    for date_type_string in PUBL_DATE_TYPE_VAL_SUBFIELD:
        #I don't want to check the preprint date
        if date_type_string in dates_to_skip_from_check:
            continue
        #then I have to extract the right date (there can be dates of different types among these fields)
        pubdate = ''
        for field in pub_dates_fields:
            if bibrecord.field_get_subfield_values(field, PUBL_DATE_TYPE_SUBFIELD)[0] == date_type_string:
                pubdate = bibrecord.field_get_subfield_values(field, PUBL_DATE_SUBFIELD)[0]
                break
        if len(pubdate) != 0:
            num_dates_checked += 1
        else:
            continue
        #final part of the check
        if pubdate[0:4] != system_number[0:4]:
            manage_check_error('Year of "%s" not consistent with the main bibcode "%s"!' % (date_type_string, system_number), type_check, logger)
    if num_dates_checked == 0:
        manage_check_error('No dates available for this record!', type_check, logger)    
    return None
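The year comparison itself is only a prefix check between the publication date and the bibcode stored as system number; a short sketch with hypothetical values:

# hypothetical values: an ADS-style bibcode starts with the four-digit publication year
system_number = '2003ApJ...599L..95M'
pubdate = '2004-01-00'
print(pubdate[0:4] != system_number[0:4])   # True -> the check would report an inconsistency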
Example #9
def check_different_keywords_for_same_type(fields1, fields2, final_result, type_check, subfield_list, tag):
    """"""
    logger.info("        running check_different_keywords_for_same_type")

    # I build a data structure for the keywords of the first set
    # where I group the keywords by institution
    keywords_per_institution = {}
    for field in [field[0] for field in fields1]:
        institution = " "
        keyword_string = ""
        for subfield in field:
            if subfield[0] == KEYWORD_ORIGIN_SUBFIELD:
                institution = subfield[1]
            if subfield[0] == KEYWORD_STRING_SUBFIELD:
                keyword_string = subfield[1]
        keywords_per_institution.setdefault(institution, set()).add(keyword_string)
    # for each keyword of the other set, if I already have the same keyword system
    # but not the keyword, then I have a problem
    for field in [field[0] for field in fields2]:
        institution = " "
        keyword_string = ""
        for subfield in field:
            if subfield[0] == KEYWORD_ORIGIN_SUBFIELD:
                institution = subfield[1]
            if subfield[0] == KEYWORD_STRING_SUBFIELD:
                keyword_string = subfield[1]
        if institution in keywords_per_institution:
            # if I have the same institution then the keyword must already be present
            if keyword_string not in keywords_per_institution[institution]:
                manage_check_error(
                    'Different groups with same keyword system don\'t have the same list of keywords (field "%s")!'
                    % tag,
                    type_check,
                    logger,
                )
                break

    return None
Example #10
def check_string_with_unicode_not_selected(fields1, fields2, final_result, type_check, subfield_list, tag):
    """ Function that checks if a string without unicode has been selected instead of one containing unicode.
        If multiple strings have been selected, then only an unicode one is enough to return false.
    """

    def is_unicode(s):
        # decoding the raw byte string with the default codec raises
        # UnicodeDecodeError when it contains non-ASCII bytes
        try:
            s.decode()
            return False
        except UnicodeDecodeError:
            return True

    logger.info("        running check_string_with_unicode_not_selected")
    # counter of non-selected fields containing a unicode string
    notsel_field_unicode = 0
    # I extract only the subfields from the final result list of fields
    final_result_fields = [field[0] for field in final_result]
    for field in [field[0] for field in fields1 + fields2]:
        # non selected field
        if field not in final_result_fields:
            for subfield in field:
                # if I find a unicode string I increment the counter
                if subfield[0] in subfield_list:
                    if is_unicode(subfield[1]):
                        notsel_field_unicode += 1
        # selected field
        else:
            for subfield in field:
                # if I find a unicode string in the selected field I return False directly
                if subfield[0] in subfield_list:
                    if is_unicode(subfield[1]):
                        return False
    if notsel_field_unicode > 0:
        manage_check_error(
            'Field "%s" with unicode string not selected (all the selected fields are not unicode)!' % tag,
            type_check,
            logger,
        )
    return None
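The is_unicode helper exploits the fact that these subfield values are raw byte strings: the default decode only fails when non-ASCII bytes are present. A standalone sketch of the same trick (Python 2 string semantics assumed, hypothetical inputs):

# mirrors the nested helper above: a byte string that cannot be decoded with the
# default codec is treated as containing unicode
def is_unicode(s):
    try:
        s.decode()
        return False
    except UnicodeDecodeError:
        return True

print(is_unicode('caf\xc3\xa9'))   # True: UTF-8 encoded bytes fail the ASCII decode
print(is_unicode('cafe'))          # False: plain ASCII decodes cleanly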
Example #11
def first_author_bibcode_consistency(merged_record, type_check):
    """Function that checks if the last letter of the main bibcode 
    is consistent with the first letter of the first author"""
    logger.info('      running first_author_bibcode_consistency')
    bibstems_to_skip_from_check = ['QB']
    try:
        system_number_fields = merged_record[FIELD_TO_MARC['system number']]
    except KeyError:
        manage_check_error('No System Number field!', type_check, logger)
        return None
    try:
        first_author_fields = merged_record[FIELD_TO_MARC['first author']]
    except KeyError:
        manage_check_error('No First Author field!', type_check, logger)
        return None
    #the system number field should be unique, so if there is more than one field, I have a problem (and I cannot proceed)
    if len(system_number_fields) > 1:
        manage_check_error('There are more than one System Numbers!', type_check, logger)
        return None
    #the first author field should be unique, so if there is more than one field, I have a problem (and I cannot proceed)
    if len(first_author_fields) > 1:
        manage_check_error('There are more than one First Author!', type_check, logger)
        return None
    system_number = bibrecord.field_get_subfield_values(system_number_fields[0], SYSTEM_NUMBER_SUBFIELD)[0]
    first_author = bibrecord.field_get_subfield_values(first_author_fields[0], AUTHOR_NAME_SUBFIELD)[0]
    #If the bibcode has a bibstem to skip, I don't do anything
    for elem in bibstems_to_skip_from_check:
        if system_number[4:4+len(elem)] == elem:
            return None
    if first_author[0].lower() != system_number[-1].lower():
        #if the last letter of the system number is a dot, then I want to give a different message
        if system_number[-1] == '.':
            manage_check_error('The main bibcode "%s" doesn\'t have an initial even if there is a First Author "%s"!' % (system_number, first_author), type_check, logger)
        else:
            manage_check_error('First Author "%s" not consistent with the main bibcode "%s"!' % (first_author, system_number), type_check, logger)
    return None
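The consistency test reduces to comparing the bibcode's last character with the first author's initial, case-insensitively. A short sketch with hypothetical values:

# hypothetical bibcode and first author: the last bibcode character should be the author's initial
system_number = '2003ApJ...599L..95M'
first_author = 'Miller, A.'
print(first_author[0].lower() != system_number[-1].lower())   # False -> consistent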