Beispiel #1
0
 def test_check_altered(self):
     """bibmatch - check altered match

     The first input record must not have an 001 controlfield; after
     matching with modify=1 the first matched record must have one.
     """
     input_records = create_records(self.recxml3)
     # Precondition: no record id on the input record yet.
     self.assertTrue(not record_has_field(input_records[0][0], '001'))
     # match_records is expected to hand back exactly four lists; only the
     # second one (the matched records) is inspected here.
     results = match_records(input_records, modify=1)
     [_unused_a, matchedrecs, _unused_b, _unused_c] = results
     self.assertTrue(record_has_field(matchedrecs[0][0], '001'))
 def test_check_altered(self):
     """bibmatch - check altered match

     Starts from a record lacking controlfield 001 and verifies that the
     record reported as matched (modify=1, verbose=0) carries one.
     """
     source_records = create_records(self.recxml4)
     has_recid_before = record_has_field(source_records[0][0], '001')
     self.assertTrue(not has_recid_before)
     # Four result lists are unpacked; only the matched ones are checked.
     _, matchedrecs, _, _ = match_records(source_records, modify=1, verbose=0)
     has_recid_after = record_has_field(matchedrecs[0][0], '001')
     self.assertTrue(has_recid_after)
 def test_check_altered(self):
     """bibmatch - check altered match

     Verifies that matching with modify=1 puts an 001 controlfield on a
     record that did not have one before.
     """
     from invenio.bibrecord import record_has_field
     records = create_records(self.recxml4)
     first_record = records[0][0]
     self.assertTrue(not record_has_field(first_record, '001'))
     # Run the matcher silently and keep only the list of matched records.
     match_results = match_records(records, modify=1, verbose=0)
     (_new, matchedrecs, _ambiguous, _fuzzy) = match_results
     first_matched = matchedrecs[0][0]
     self.assertTrue(record_has_field(first_matched, '001'))
Beispiel #4
0
def _record_has_id_p(record, recid, rec_oaiid, rec_sysno):
    """Tell whether the record carries any of the given identifiers.

    Checked in order: controlfield 001 against str(recid), the field named
    by OAIID_TAG against rec_oaiid, and the field named by SYSNO_TAG against
    rec_sysno.  A value is only read when the record has the matching tag.

    @return: True as soon as one identifier matches, False otherwise.
    """
    if record_has_field(record, "001"):
        recid_value = record_get_field_value(record, "001", "%", "%")
        if recid_value == str(recid):
            return True

    # The first three characters of the tag constants are the MARC tag; the
    # remaining three are passed on as separate arguments.
    oai_tag = OAIID_TAG[0:3]
    if record_has_field(record, oai_tag):
        oai_value = record_get_field_value(record, oai_tag, OAIID_TAG[3], OAIID_TAG[4], OAIID_TAG[5])
        if oai_value == rec_oaiid:
            return True

    sysno_tag = SYSNO_TAG[0:3]
    if record_has_field(record, sysno_tag):
        sysno_value = record_get_field_value(record, sysno_tag, SYSNO_TAG[3], SYSNO_TAG[4], SYSNO_TAG[5])
        if sysno_value == rec_sysno:
            return True

    return False
Beispiel #5
0
def _record_has_id_p(record, recid, rec_oaiid, rec_sysno):
    """Return True when the record holds one of the supplied IDs.

    Three identifiers are tried in turn: the 001 controlfield (compared to
    str(recid)), the OAIID_TAG field (compared to rec_oaiid) and the
    SYSNO_TAG field (compared to rec_sysno).  Each value lookup is skipped
    when the record lacks the tag.
    """
    # Record id stored in the 001 controlfield.
    if (record_has_field(record, '001')
            and record_get_field_value(record, '001', '%', '%') == str(recid)):
        return True

    # OAI identifier; the tag constant is sliced into tag and three more
    # single-character arguments.
    if (record_has_field(record, OAIID_TAG[0:3])
            and record_get_field_value(record, OAIID_TAG[0:3], OAIID_TAG[3],
                                       OAIID_TAG[4], OAIID_TAG[5]) == rec_oaiid):
        return True

    # System number, same slicing scheme as the OAI tag.
    if (record_has_field(record, SYSNO_TAG[0:3])
            and record_get_field_value(record, SYSNO_TAG[0:3], SYSNO_TAG[3],
                                       SYSNO_TAG[4], SYSNO_TAG[5]) == rec_sysno):
        return True

    return False
Beispiel #6
0
def merge_record_with_template(rec, template_name):
    """Fill the record rec with content taken from the named template.

    Tags that rec does not have are copied over from the template field by
    field.  For tags present in both, each field instance of rec receives
    the subfield codes it lacks, using the first value the template field
    holds for that code.

    @param rec: the record to extend
    @param template_name: name handed to get_record_template
    @return: rec, or None when no template was obtained
    """
    template = get_record_template(template_name)
    if not template:
        return
    template_record = create_record(template)[0]

    for tag in template_record:
        if not record_has_field(rec, tag):
            # Tag unknown to the record: add every template field for it.
            for template_field in template_record[tag]:
                record_add_field(rec, tag, template_field[1], template_field[2],
                                 subfields=template_field[0])
            continue

        # Tag already present: only complete the existing field instances.
        for template_field in template_record[tag]:
            template_codes = field_get_subfield_codes(template_field)
            for record_field in rec[tag]:
                present_codes = field_get_subfield_codes(record_field)
                for code in template_codes:
                    if code in present_codes:
                        continue
                    first_value = field_get_subfield_values(template_field, code)[0]
                    field_add_subfield(record_field, code, first_value)
    return rec
def merge_record_with_template(rec, template_name, is_hp_record=False):
    """Complete the record rec from the named template and return it.

    Template fields whose tag is missing from rec are added to rec; for
    tags both share, subfield codes missing from a field of rec are added
    with the first matching value of the template field.  The subfields of
    rec are reordered through record_order_subfields before returning.

    @param rec: the record to extend
    @param template_name: name handed to get_record_template
    @param is_hp_record: when true, record_make_all_subfields_volatile is
        applied to the template record before merging (holding pen records)
    @return: rec, or None when no template was obtained
    """
    template = get_record_template(template_name)
    if not template:
        return
    template_record = create_record(template)[0]
    # Holding pen records get a template whose subfields are all volatile.
    if is_hp_record:
        record_make_all_subfields_volatile(template_record)

    for tag in template_record:
        if record_has_field(rec, tag):
            for template_field in template_record[tag]:
                wanted_codes = field_get_subfield_codes(template_field)
                for existing_field in rec[tag]:
                    existing_codes = field_get_subfield_codes(existing_field)
                    for code in wanted_codes:
                        if code not in existing_codes:
                            value = field_get_subfield_values(template_field, code)[0]
                            field_add_subfield(existing_field, code, value)
        else:
            for template_field in template_record[tag]:
                record_add_field(rec, tag, template_field[1], template_field[2],
                                 subfields=template_field[0])

    record_order_subfields(rec)
    return rec
Beispiel #8
0
def merge_record_with_template(rec, template_name, is_hp_record=False):
    """Extend the record rec with the contents of a template.

    @param rec: the record to extend
    @param template_name: name handed to get_record_template
    @param is_hp_record: when true, the template record is first passed to
        record_make_all_subfields_volatile (holding pen records)
    @return: rec, or None when no template was obtained
    """
    template = get_record_template(template_name)
    if not template:
        return
    template_bibrec = create_record(template)[0]
    # if the record is a holding pen record make all subfields volatile
    if is_hp_record:
        record_make_all_subfields_volatile(template_bibrec)

    def _copy_fields(tag):
        """Add every template field of the tag to rec."""
        for source_field in template_bibrec[tag]:
            record_add_field(rec, tag, source_field[1], source_field[2],
                             subfields=source_field[0])

    def _complete_fields(tag):
        """Give each field of rec the subfield codes it lacks for the tag."""
        for source_field in template_bibrec[tag]:
            source_codes = field_get_subfield_codes(source_field)
            for target_field in rec[tag]:
                target_codes = field_get_subfield_codes(target_field)
                for code in source_codes:
                    if code not in target_codes:
                        # only the first template value for the code is used
                        field_add_subfield(
                            target_field, code,
                            field_get_subfield_values(source_field, code)[0])

    for field_tag in template_bibrec:
        if record_has_field(rec, field_tag):
            _complete_fields(field_tag)
        else:
            _copy_fields(field_tag)
    return rec
Beispiel #9
0
def update_references(recid, overwrite=True):
    """Update references for a record

    First, we extract references from a record.
    Then, we are not updating the record directly but adding a bibupload
    task in -c mode which takes care of updating the record.

    Parameters:
    * recid: the id of the record
    * overwrite: when False, a record that already has a 999 field is
                 refused instead of being updated

    Raises:
    * RecordHasReferences: the record has a 999 field and overwrite is
                           False, or get_fieldvalues returns something
                           for 999C59 (record has been curated)
    """

    if not overwrite:
        # Check for references in record
        record = get_record(recid)
        if record and record_has_field(record, '999'):
            raise RecordHasReferences('Record has references and overwrite '
                                      'mode is disabled: %s' % recid)

    if get_fieldvalues(recid, '999C59'):
        raise RecordHasReferences('Record has been curated: %s' % recid)

    # Parse references
    references_xml = extract_references_from_record_xml(recid)

    # Save new record to file
    (temp_fd, temp_path) = mkstemp(prefix=CFG_REFEXTRACT_FILENAME,
                                   dir=CFG_TMPSHAREDDIR)
    temp_file = os.fdopen(temp_fd, 'w')
    try:
        temp_file.write(references_xml.encode('utf-8'))
    finally:
        # Release the descriptor even when encoding or writing fails.
        temp_file.close()

    # Update record
    task_low_level_submission('bibupload', 'refextract', '-P', '5', '-c',
                              temp_path)
def update_references(recid, overwrite=True):
    """Update references for a record

    First, we extract references from a record.
    Then, we are not updating the record directly but adding a bibupload
    task in -c mode which takes care of updating the record.

    Parameters:
    * recid: the id of the record
    * overwrite: when False, a record that already has a 999 field is
                 refused instead of being updated

    Raises:
    * RecordHasReferences: the record has a 999 field and overwrite is
                           False, or get_fieldvalues returns something
                           for 999C59 (record has been curated)
    """

    if not overwrite:
        # Check for references in record
        record = get_record(recid)
        if record and record_has_field(record, "999"):
            raise RecordHasReferences("Record has references and overwrite " "mode is disabled: %s" % recid)

    if get_fieldvalues(recid, "999C59"):
        raise RecordHasReferences("Record has been curated: %s" % recid)

    # Parse references; encode before creating the temporary file so that
    # an encoding error does not leave an empty file behind.
    references_xml = extract_references_from_record_xml(recid)
    encoded_references = references_xml.encode("utf-8")

    # Save new record to file
    (temp_fd, temp_path) = mkstemp(prefix=CFG_REFEXTRACT_FILENAME, dir=CFG_TMPSHAREDDIR)
    temp_file = os.fdopen(temp_fd, "w")
    try:
        temp_file.write(encoded_references)
    finally:
        # Always release the descriptor, even if the write fails.
        temp_file.close()

    # Update record
    task_low_level_submission("bibupload", "refextract", "-P", "5", "-c", temp_path)
def inject_recid(data):
    """Build records from the given match texts, making sure each has an 001.

    Every item of data is parsed with create_records and the first record
    is kept.  When it has no 001 controlfield, one is added from the first
    hit of re_matched_recid in the item (second element of that hit).

    @param data: iterable of texts accepted by create_records
    @return: list of the resulting records, in input order
    """
    updated_records = []
    for match in data:
        bibrec = create_records(match)[0][0]
        if record_has_field(bibrec, '001'):
            updated_records.append(bibrec)
            continue
        # No record id yet: take it from the text itself.
        matched_recid = re_matched_recid.findall(match)[0][1]
        record_add_field(bibrec, tag='001', controlfield_value=matched_recid)
        updated_records.append(bibrec)
    return updated_records
def record_get_recid(record):
    """
    Returns the recid (tag 001) of the given record, if found in the database.

    When the record has no 001 field, an arXiv OAI ID is looked for in the
    field configured by CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG and then in 035__z,
    and that ID is used to search for the record.

    @param record: bibrecord structure

    @return: recid as a string if found; 0 when the record has no arXiv OAI
             ID; -1 when CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG is too short; None
             when no query gives exactly one hit.
    """
    def _first_arxiv_oai_id(fieldvalues):
        """Return the first value starting with the arXiv OAI prefix, or None."""
        for fieldvalue in fieldvalues:
            if fieldvalue.startswith("oai:arXiv.org:"):
                return fieldvalue
        return None

    if record_has_field(record, "001"):
        return str(record_get_field_value(record, tag="001"))

    # FIXME: CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG is not set correctly for inspire
    # When OAI config is OK, use bibrecord.record_get_oaiid
    old_oaiid_tag = "035__z"
    try:
        tag = CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[:3]
        ind1 = CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[3]
        ind2 = CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[4]
        code = CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[5]
    except IndexError:
        sys.stderr.write("Invalid CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG")
        return -1

    oai_id = _first_arxiv_oai_id(
        record_get_field_values(record, tag, ind1, ind2, code))
    if oai_id is None:
        # Fall back to the old tag, which is only read when needed.
        oai_id = _first_arxiv_oai_id(
            record_get_field_values(record, old_oaiid_tag[:3],
                                    old_oaiid_tag[3], old_oaiid_tag[4],
                                    old_oaiid_tag[5]))
        if oai_id is None:
            sys.stderr.write("No oai id found for record")
            return 0

    # Try different patterns; the first one with exactly one hit wins.
    queries = ["%s__%s:%s" % (tag, code, oai_id)]
    queries.append("%s__%s:%s" % (old_oaiid_tag[:3], old_oaiid_tag[5], oai_id))
    queries.append("reportnumber:arXiv:%s" % (oai_id.split(":")[-1],))
    for query in queries:
        hits = search_pattern(p=query).tolist()
        if len(hits) == 1:
            return str(hits[0])
    return None
def record_get_recid(record):
    """
    Returns the recid (tag 001) of the given record, if found in the database.

    When the record has no 001 field, an arXiv OAI ID is looked for in the
    field configured by CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG and then in 035__z,
    and that ID is used to search for the record.

    @param record: bibrecord structure

    @return: recid as a string if found; 0 when the record has no arXiv OAI
             ID; -1 when CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG is too short; None
             when no query gives exactly one hit.
    """
    if record_has_field(record, "001"):
        return str(record_get_field_value(record, tag="001"))

    # FIXME: CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG is not set correctly for inspire
    # When OAI config is OK, use bibrecord.record_get_oaiid
    old_oaiid_tag = "035__z"
    try:
        tag = CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[:3]
        ind1 = CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[3]
        ind2 = CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[4]
        code = CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[5]
    except IndexError:
        sys.stderr.write("Invalid CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG")
        return -1

    # Configured tag first, old tag second; the second lookup only happens
    # when the first one yields no arXiv OAI ID.
    tag_specs = (
        (tag, ind1, ind2, code),
        (old_oaiid_tag[:3], old_oaiid_tag[3], old_oaiid_tag[4],
         old_oaiid_tag[5]),
    )
    oai_id = None
    for spec in tag_specs:
        for fieldvalue in record_get_field_values(record, *spec):
            if fieldvalue.startswith("oai:arXiv.org:"):
                oai_id = fieldvalue
                break
        if oai_id is not None:
            break
    if oai_id is None:
        sys.stderr.write("No oai id found for record")
        return 0

    # Try different patterns; the first one with exactly one hit wins.
    queries = [
        "%s__%s:%s" % (tag, code, oai_id),
        "%s__%s:%s" % (old_oaiid_tag[:3], old_oaiid_tag[5], oai_id),
        "reportnumber:arXiv:%s" % (oai_id.split(":")[-1], ),
    ]
    for query in queries:
        hits = search_pattern(p=query).tolist()
        if len(hits) == 1:
            return str(hits[0])
    return None
Beispiel #14
0
def add_recid(record, recid):
    """
    Store the given record-id in the 001 controlfield of the record.

    An existing 001 controlfield is overwritten through
    record_modify_controlfield; otherwise a new one is added.

    @param record: the record to update
    @type record: a bibrecord instance

    @param recid: record-id to be added
    @type recid: int
    """
    recid_value = str(recid)
    if not record_has_field(record, '001'):
        record_add_field(record, '001', controlfield_value=recid_value)
        return
    record_modify_controlfield(record, '001',
                               controlfield_value=recid_value,
                               field_position_global=1)
def add_recid(record, recid):
    """
    Set the 001 controlfield of a record to the given record-id.

    If the record already has an 001 field its value is replaced,
    otherwise the field is created.

    @param record: the record receiving the record-id
    @type record: a bibrecord instance

    @param recid: record-id to be added
    @type recid: int
    """
    already_has_recid = record_has_field(record, '001')
    if already_has_recid:
        # Replace the value of the existing controlfield.
        record_modify_controlfield(record, '001',
                                   field_position_global=1,
                                   controlfield_value=str(recid))
    else:
        record_add_field(record, '001', controlfield_value=str(recid))
Beispiel #16
0
def merge_record_with_template(rec, template_name):
    """Extend the record rec with the contents of the template and return it.

    Tags that rec does not have are copied over from the template field by
    field.  For tags present in both, each field instance of rec receives
    the subfield codes it lacks, using the first value the template field
    holds for that code.

    @param rec: the record to extend
    @param template_name: name handed to get_record_template
    @return: rec, or None when get_record_template yields nothing
    """
    template = get_record_template(template_name)
    if not template:
        # Nothing to merge: do not hand an empty template to create_record.
        return
    template_bibrec = create_record(template)[0]

    for field_tag in template_bibrec:
        if not record_has_field(rec, field_tag):
            # Tag unknown to the record: add every template field for it.
            for field_instance in template_bibrec[field_tag]:
                record_add_field(rec, field_tag, field_instance[1], field_instance[2], subfields=field_instance[0])
        else:
            # Tag already present: only add the subfield codes that are missing.
            for template_field_instance in template_bibrec[field_tag]:
                subfield_codes_template = field_get_subfield_codes(template_field_instance)
                for field_instance in rec[field_tag]:
                    subfield_codes = field_get_subfield_codes(field_instance)
                    for code in subfield_codes_template:
                        if code not in subfield_codes:
                            field_add_subfield(
                                field_instance, code, field_get_subfield_values(template_field_instance, code)[0]
                            )
    return rec
def parse_resultfile(data, recid_patterns=(re_original_id,), recids=None,
                     sysno_patterns=None, preserved_tags=None):
    """
    This function will look for the original recid and any matching recids in a
    BibMatch result file containing references to matching records in comments before
    every record in MARCXML format.

    @param data: iterable of result-file entries, one per record
    @param recid_patterns: compiled patterns tried in order on the MARC text of
        a record without 001 field; the first hit becomes the original recid
    @param recids: optional list parallel to data giving the matching recid
        of each entry; when empty, matches are read from the entry itself
    @param sysno_patterns: not used by this function
    @param preserved_tags: tags whose fields are copied from the original
        record into the result

    @return: list of tuples (original recid, matching recids, matching
        sysnos, preserved fields); the sysno list is always empty here.
    """
    # None stands for "no values"; avoids shared mutable default arguments.
    if recids is None:
        recids = []
    if preserved_tags is None:
        preserved_tags = []

    record_pairs = []
    sysno_gen = get_sysno_generator()
    options = {'text-marc': 1, 'aleph-marc': 0}
    for index, match in enumerate(data):
        original_record_bibrec = create_records(match)[0][0]
        if record_has_field(original_record_bibrec, '001'):
            rec_id = record_get_field_value(original_record_bibrec, '001')
        else:
            # No 001 field: search the textual MARC form for a recid.
            sysno = sysno_gen.next()
            original_record_marc = create_marc_record(original_record_bibrec, sysno, options)
            rec_id = ""
            for pattern in recid_patterns:
                matches = pattern.findall(original_record_marc)
                if len(matches) > 0:
                    rec_id = matches[0]
                    break
        if recids:
            matching_result_recids = [recids[index]]
        else:
            matching_result_recids = re_matched_recid.findall(match)
        matching_result_sysnos = []
        preserved_fields = {}
        for tag in preserved_tags:
            try:
                preserved_fields[tag] = original_record_bibrec[tag]
            except KeyError:
                # The record simply has no field with this tag.
                pass
        record_pairs.append((rec_id, matching_result_recids, matching_result_sysnos, preserved_fields))
    return record_pairs
Beispiel #18
0
def add_field(rec1, rec2, fnum, findex1, findex2):
    """Insert one field of rec2 into rec1 at a position derived from the
    diff of the two records.
    @param rec1: First record (a record dictionary structure)
    @param rec2: Second record (a record dictionary structure)
    @param fnum: a 3 characters long string indicating field tag number
    @param findex1: the rec1 field position in the group of fields it belongs,
        or None when the position has to be worked out
    @param findex2: the rec2 field position in the group of fields it belongs
    """
    new_field = rec2[fnum][findex2]

    if findex1 is not None:
        # findex1 points at an existing field of rec1: insert before it
        record_add_fields(rec1, fnum, [new_field], findex1)
    elif not record_has_field(rec1, fnum):
        # rec1 has no field with this tag yet: insert at the beginning
        record_add_fields(rec1, fnum, [new_field])
    else:
        # The tag exists in rec1 but no position was given: diff the two
        # field groups, keep only the list of diffs by indicators (without
        # the 'c') and combine them into one list.
        diffs_by_indicators = record_field_diff(rec1[fnum], rec2[fnum], fnum,
                                                match_subfields)[1]
        combined = _combine_diffs(diffs_by_indicators)

        # Walk the diff up to the entry of the field being added and
        # remember the last rec1 position seen before it.
        previous_index = -1
        for pair in combined:
            if pair[1] == findex2:
                break
            if pair[0] is not None:
                previous_index = pair[0]
        # insert one position after that field
        record_add_fields(rec1, fnum, [new_field], previous_index + 1)
Beispiel #19
0
def add_field(rec1, rec2, fnum, findex1, findex2):
    """Copy the field rec2[fnum][findex2] into rec1, choosing the position
    from the diffing of rec1 with rec2.
    @param rec1: First record (a record dictionary structure)
    @param rec2: Second record (a record dictionary structure)
    @param fnum: a 3 characters long string indicating field tag number
    @param findex1: the rec1 field position in the group of fields it belongs
    @param findex2: the rec2 field position in the group of fields it belongs
    """
    fields_to_add = [rec2[fnum][findex2]]

    # Case 1: an explicit rec1 position was given; insert before that field.
    if findex1 is not None:
        record_add_fields(rec1, fnum, fields_to_add, findex1)
        return

    # Case 2: the tag does not exist in rec1; insert at the beginning.
    if not record_has_field(rec1, fnum):
        record_add_fields(rec1, fnum, fields_to_add)
        return

    # Case 3: the tag exists but the position is unknown. Get the diffs for
    # all indicators of the field and keep only the per-indicator list
    # (without the 'c'), combined into a single list.
    diff_result = record_field_diff(rec1[fnum], rec2[fnum], fnum, match_subfields)
    diff = _combine_diffs(diff_result[1])

    # Find the rec1 field after which the insertion should take place.
    last_rec1_index = -1
    for entry in diff:
        if entry[1] == findex2:
            break
        if entry[0] is None:
            continue
        last_rec1_index = entry[0]

    # Finally add the field one position after it.
    record_add_fields(rec1, fnum, fields_to_add, last_rec1_index + 1)
Beispiel #20
0
def merge_field_group(rec1,
                      rec2,
                      fnum,
                      ind1='',
                      ind2='',
                      merge_conflicting_fields=False):
    """Merges fields of a specific tag from 'rec2' into 'rec1'.

    By default only non-conflicting fields are merged: where the diff pairs
    a field of rec1 with one of rec2, the field of rec1 is kept.  With
    merge_conflicting_fields set, the field of rec2 is taken and the paired
    field of rec1 is kept after it when their subfields differ.
    When both indicators are empty the merge covers all indicators.

    @param rec1: First record (a record dictionary structure)
    @param rec2: Second record (a record dictionary structure)
    @param fnum: a 3 characters long string indicating field tag number
    @param ind1: a 1 character long string
    @param ind2: a 1 character long string
    @param merge_conflicting_fields: whether to merge conflicting fields or not
    @raise Exception: when the (ind1, ind2) pair is absent from the diff result
    """
    ### Check if merging goes for all indicators and set a boolean
    merging_all_indicators = not ind1 and not ind2

    ### check if there is no field in rec2 to be merged in rec1
    if not record_has_field(rec2, fnum):
        return

    ### get fields of rec2
    if merging_all_indicators:
        fields2 = record_get_field_instances(rec2, fnum, '%', '%')
    else:
        fields2 = record_get_field_instances(rec2, fnum, ind1, ind2)
    if not fields2:
        return

    ### check if field in rec1 doesn't even exist
    if not record_has_field(rec1, fnum):
        record_add_fields(rec1, fnum, fields2)
        return

    ### compare the fields, get diffs for given indicators
    alldiffs = record_field_diff(rec1[fnum], rec2[fnum], fnum, match_subfields,
                                 ind1, ind2)

    ### check if fields are the same
    if alldiffs is None:
        return  #nothing to merge

    ### find the diffing for the fields of the given indicators
    #keep only the list of diffs by indicators (without the 'c')
    diffs_by_indicators = alldiffs[1]

    if merging_all_indicators:
        #combine the diffs for each indicator to one list
        diff = _combine_diffs(diffs_by_indicators)
    else:  #diffing for one indicator
        for indicator_diff in diffs_by_indicators:
            #look for indicator pair in diff result
            if indicator_diff[0] == (ind1, ind2):
                #keep only the list of diffs (without the indicator tuple)
                diff = indicator_diff[1]
                break
        else:
            raise Exception("Indicators not in diff result.")

    ### proceed to merging fields in a new field list
    fields1, fields2 = rec1[fnum], rec2[fnum]
    new_fields = []
    if not merge_conflicting_fields:  #merge non-conflicting fields
        for m in diff:  #for every match of fields in the diff
            if m[0] is not None:  #if rec1 has a field in the diff, keep it
                new_fields.append(deepcopy(fields1[m[0]]))
            else:  #else take the field from rec2
                new_fields.append(deepcopy(fields2[m[1]]))
    else:  #merge all fields
        for m in diff:  #for every match of fields in the diff
            if m[1] is not None:  #if rec2 has a field, add it
                new_fields.append(deepcopy(fields2[m[1]]))
                if m[0] is not None and fields1[m[0]][0] != fields2[m[1]][0]:
                    #if the fields are not the same then add the field of rec1
                    new_fields.append(deepcopy(fields1[m[0]]))
            else:
                new_fields.append(deepcopy(fields1[m[0]]))

    ### delete existing fields
    record_delete_field(rec1, fnum, ind1, ind2)
    ## find where the new_fields should be inserted in rec1 (insert_index)
    if merging_all_indicators:
        insert_index = 0
    else:
        ind_pair = (ind1, ind2)
        first_last_dict = _first_and_last_index_for_each_indicator(
            rec1.get(fnum, []))
        #find the indicator pair which is just before the one which will be inserted
        ind_pair_before = None
        for pair in sorted(first_last_dict.keys()):
            if pair > ind_pair:
                break
            ind_pair_before = pair
        if ind_pair_before is None:  #if no smaller indicator pair exists
            insert_index = 0  #insertion will take place at the beginning
        else:  #else insert after the last field index of the previous indicator pair
            insert_index = first_last_dict[ind_pair_before][1] + 1

    ### add the new (merged) fields in correct 'in_field_index' position
    record_add_fields(rec1, fnum, new_fields, insert_index)
    return
Beispiel #21
0
def match_records(records, qrystrs=None, perform_request_search_mode="eee",
                  operator="a", verbose=1, server_url=CFG_SITE_URL, modify=0):
    """ Match passed records with existing records on a local or remote Invenio
    installation. Returns which records are new (no match), which are matched,
    which are ambiguous and which are fuzzy-matched. A formatted result of each
    records matching are appended to each record tuple:
    (record, status_code, list_of_errors, result)

    Each record is tried against every querystring in turn, so with several
    querystrings a record may be reported more than once.

    @param records: records to analyze
    @type records: list of records

    @param qrystrs: Querystrings; None or an empty list means one default
                    querystring. The passed list is not modified.
    @type qrystrs: list of object

    @param server_url: which server to search on. Local installation by default
    @type server_url: str

    @param perform_request_search_mode: run the query in this mode
    @type perform_request_search_mode: string

    @param operator: "o" "a"
    @type operator: str

    @param verbose: be loud
    @type verbose: int

    @param modify: output modified records of matches
    @type modify: int

    @rtype: list of lists
    @return: an array of arrays of records, like this [newrecs,matchedrecs,
                                                       ambiguousrecs,fuzzyrecs]
    """

    server = InvenioConnector(server_url)

    newrecs = []
    matchedrecs = []
    ambiguousrecs = []
    fuzzyrecs = []

    # No querystring given: run once with the default one. Rebinding (rather
    # than appending) keeps the caller's list untouched.
    if not qrystrs:
        qrystrs = [""]

    for record_index, rec in enumerate(records):
        if verbose > 1:
            sys.stderr.write("\n Processing record: #%d .." % (record_index + 1))

        for qrystr in qrystrs:
            querystring = Querystring()
            querystring.default()

            if qrystr != "":
                querystring.from_qrystr(qrystr,
                                        perform_request_search_mode,
                                        operator)
            else:
                # NOTE(review): default() was already called above; kept
                # because its idempotence cannot be confirmed from here.
                querystring.default()

            querystring.search_engine_encode()

            ### get field values for record instance
            inst = []

            ### get appropriate fields from database
            for field in querystring.field:
                tags = get_field_tags(field)
                if len(tags) > 0:
                    # Fetch value from input record of first tag only
                    # FIXME: Extracting more then first tag, evaluating each
                    field = tags[0]
                ### use expanded tags
                tag = field[0:3]
                ind1 = field[3:4]
                ind2 = field[4:5]
                code = field[5:6]

                # wildcard indicators mean "any"; a wildcard code means "a"
                if ind1 in ("_", "%"):
                    ind1 = ""
                if ind2 in ("_", "%"):
                    ind2 = ""
                if code in ("_", "%"):
                    code = "a"

                if field != "001":
                    finsts = record_get_field_instances(rec[0], tag, ind1, ind2)
                    inst.append(get_subfield(finsts, code))
                else:
                    # the controlfield has neither indicators nor code
                    inst.append(record_get_field_values(rec[0], field, ind1="",
                                                        ind2="", code=""))

            ### format acquired field values
            for i in range(len(inst)):
                for field_format in querystring.format[i]:
                    inst[i] = bibconvert.FormatField(inst[i], field_format)

            ### perform the search, unless there is no value for the first field
            if inst[0] == "":
                continue

            p1 = inst[0]
            f1 = querystring.field[0]
            m1 = querystring.mode[0]
            op1 = querystring.operator[0]

            p2 = inst[1]
            f2 = querystring.field[1]
            m2 = querystring.mode[1]
            op2 = querystring.operator[1]

            p3 = inst[2]
            f3 = querystring.field[2]
            m3 = querystring.mode[2]

            #1st run the basic perform_req_search
            recID_list = server.search(
                p1=p1, f1=f1, m1=m1, op1=op1,
                p2=p2, f2=f2, m2=m2, op2=op2,
                p3=p3, f3=f3, m3=m3, of='id')

            if verbose > 8:
                sys.stderr.write("\nperform_request_search with values" +
                                 " p1=" + str(p1) + " f1=" + str(f1) + " m1=" + str(m1) + " op1=" + str(op1) +
                                 " p2=" + str(p2) + " f2=" + str(f2) + " m2=" + str(m2) + " op2=" + str(op2) +
                                 " p3=" + str(p3) + " f3=" + str(f3) + " m3=" + str(m3) +
                                 " result=" + str(recID_list) + "\n")

            hit_count = len(recID_list)
            if hit_count > 1:  #ambig match
                ambiguousrecs.append(rec + (match_result_output(
                    recID_list, server_url, querystring, "ambiguous-matched"), ))
                if verbose > 8:
                    sys.stderr.write("ambiguous\n")
            elif hit_count == 1:  #match
                if modify:
                    _set_matched_recid(rec[0], recID_list[0])
                matchedrecs.append(rec + (match_result_output(
                    recID_list, server_url, querystring, "exact-matched"), ))
                if verbose > 8:
                    sys.stderr.write("match\n")
            else:  #no match..
                #try fuzzy matching: check if all the words appear in the
                #field of interest
                intersected = _fuzzy_candidates(
                    server, ((p1, f1), (p2, f2), (p3, f3)), verbose)

                if intersected:
                    #this was a fuzzy match
                    if modify:
                        _set_matched_recid(rec[0], intersected[0])
                    fuzzyrecs.append(rec + (match_result_output(
                        intersected, server_url, querystring, "fuzzy-matched"), ))
                    if verbose > 8:
                        sys.stderr.write("fuzzy\n")
                else:
                    #no match
                    newrecs.append(rec + (match_result_output(
                        recID_list, server_url, querystring), ))
                    if verbose > 8:
                        sys.stderr.write("new\n")
    #return results
    return [newrecs, matchedrecs, ambiguousrecs, fuzzyrecs]


def _set_matched_recid(record, recid):
    """Store recid in the 001 controlfield of record, replacing an existing one."""
    if record_has_field(record, '001'):
        record_modify_controlfield(record, '001',
                                   controlfield_value=str(recid),
                                   field_position_global=1)
    else:
        record_add_field(record, '001', controlfield_value=str(recid))


def _fuzzy_candidates(server, patterns_and_fields, verbose):
    """Intersect the hits of every main word of each pattern in its field.

    Returns None when no word was searched at all, otherwise the list of
    ids common to all the single-word searches.
    """
    # Split all patterns into words before running any search.
    word_groups = [(main_words_list(pattern), field)
                   for (pattern, field) in patterns_and_fields]
    intersected = None
    for words, field in word_groups:
        for word in words:
            word = "'" + word + "'"
            ilist = server.search(p=word, f=field, of="id")
            if verbose > 8:
                sys.stderr.write("fuzzy perform_request_search with values" +
                                 " p=" + str(word) + " f=" + str(field) + " res " + str(ilist) + "\n")
            if intersected is None:
                intersected = ilist
            intersected = list(set(ilist) & set(intersected))
    return intersected
Beispiel #22
0
def merge_field_group(rec1, rec2, fnum, ind1='', ind2='', merge_conflicting_fields=False):
    """Merge the fields of tag 'fnum' from 'rec2' into 'rec1'.

    'rec1' is modified in place; 'rec2' is only read (fields taken from it
    are deep-copied before being stored in 'rec1').

    When both 'ind1' and 'ind2' are empty, the fields of every indicator
    combination of the tag are merged; otherwise only the fields carrying
    exactly the indicator pair (ind1, ind2).

    @param rec1: First record (a record dictionary structure), merge target
    @param rec2: Second record (a record dictionary structure), merge source
    @param fnum: a 3 characters long string indicating field tag number
    @param ind1: a 1 character long string
    @param ind2: a 1 character long string
    @param merge_conflicting_fields: if false, for every entry of the diff
        keep the field of rec1 when there is one and take the field of rec2
        otherwise; if true, take the field of rec2 whenever there is one and
        additionally keep the matched field of rec1 when its subfields differ
    @return: None
    @raise Exception: if a specific indicator pair was requested but the
        diff result contains no entry for it
    """
    ### Check if merging goes for all indicators and set a boolean
    merging_all_indicators = not ind1 and not ind2

    ### check if there is no field in rec2 to be merged in rec1
    if not record_has_field(rec2, fnum):
        return

    ### get fields of rec2 ('%' is used as the wildcard for both indicators)
    if merging_all_indicators:
        fields2 = record_get_field_instances(rec2, fnum, '%', '%')
    else:
        fields2 = record_get_field_instances(rec2, fnum, ind1, ind2)
    if not fields2:
        return

    ### check if field in rec1 doesn't even exist
    if not record_has_field(rec1, fnum):
        record_add_fields(rec1, fnum, fields2)
        return

    ### compare the fields, get diffs for given indicators
    alldiffs = record_field_diff(rec1[fnum], rec2[fnum], fnum, match_subfields, ind1, ind2)

    ### check if fields are the same
    if alldiffs is None:
        return #nothing to merge

    ### find the diffing for the fields of the given indicators
    #keep only the list of diffs by indicators (without the 'c')
    diffs_by_indicator = alldiffs[1]

    if merging_all_indicators:
        #combine the diffs for each indicator to one list
        diff = _combine_diffs(diffs_by_indicator)
    else: #diffing for one indicator
        for indicator_diff in diffs_by_indicator: #look for indicator pair in diff result
            if indicator_diff[0] == (ind1, ind2):
                #keep only the list of diffs (without the indicator tuple)
                diff = indicator_diff[1]
                break
        else:
            # call form of raise: valid on both Python 2 and Python 3
            raise Exception("Indicators not in diff result.")

    ### proceed to merging fields in a new field list
    # From here on the complete field lists of the tag are used: the entries
    # of 'diff' are used as indexes into these lists.
    fields1, fields2 = rec1[fnum], rec2[fnum]
    new_fields = []
    if not merge_conflicting_fields: #merge non-conflicting fields
        for m in diff: #for every match of fields in the diff
            if m[0] is not None: #if rec1 has a field in the diff, keep it
                new_fields.append(deepcopy(fields1[m[0]]))
            else: #else take the field from rec2
                new_fields.append(deepcopy(fields2[m[1]]))
    else: #merge all fields
        for m in diff: #for every match of fields in the diff
            if m[1] is not None: #if rec2 has a field, add it
                new_fields.append(deepcopy(fields2[m[1]]))
                if m[0] is not None and fields1[m[0]][0] != fields2[m[1]][0]:
                    #if the fields are not the same then add the field of rec1
                    new_fields.append(deepcopy(fields1[m[0]]))
            else:
                new_fields.append(deepcopy(fields1[m[0]]))

    ### delete existing fields
    record_delete_field(rec1, fnum, ind1, ind2)
    ## find where the new_fields should be inserted in rec1 (insert_index)
    if merging_all_indicators:
        insert_index = 0
    else:
        ind_pair = (ind1, ind2)
        first_last_dict = _first_and_last_index_for_each_indicator(rec1.get(fnum, []))
        #find the indicator pair which is just before the one which will be inserted
        ind_pair_before = None
        # sorted() instead of keys().sort(): dict views have no sort() on Python 3
        for pair in sorted(first_last_dict.keys()):
            if pair > ind_pair:
                break
            ind_pair_before = pair
        if ind_pair_before is None: #if no smaller indicator pair exists
            insert_index = 0 #insertion will take place at the beginning
        else:  #else insert after the last field index of the previous indicator pair
            insert_index = first_last_dict[ind_pair_before][1] + 1

    ### add the new (merged) fields in correct 'in_field_index' position
    record_add_fields(rec1, fnum, new_fields, insert_index)
    return
Beispiel #23
0
def _set_matched_recid(record, recid):
    """Write 'recid' into the 001 controlfield of 'record' (in place).

    An existing 001 field is overwritten (the field at global position 1 is
    modified); otherwise a new 001 controlfield is added.
    """
    if record_has_field(record, '001'):
        record_modify_controlfield(record, '001',
                                   controlfield_value=str(recid),
                                   field_position_global=1)
    else:
        record_add_field(record, '001', controlfield_value=str(recid))


def _fuzzy_match_recids(server, patterns_and_fields, verbose):
    """Search every main word of every pattern on its own and intersect the hits.

    @param server: connector object providing search(p=..., f=..., of="id")
    @param patterns_and_fields: sequence of (pattern, field) pairs
    @param verbose: verbosity level; above 8 each search is traced on stderr
    @return: list of record IDs returned by every single-word search, or
        None when no pattern produced any word (i.e. no search was run)
    """
    # Split all patterns first, before any search is issued.
    word_lists = [(main_words_list(pattern), field)
                  for pattern, field in patterns_and_fields]

    intersected = None
    for words, field in word_lists:
        for word in words:
            quoted_word = "'" + word + "'"
            ilist = server.search(p=quoted_word, f=field, of="id")
            if verbose > 8:
                sys.stderr.write("fuzzy perform_request_search with values" +
                                 " p=" + str(quoted_word) + " f=" + str(field) +
                                 " res " + str(ilist) + "\n")
            if intersected is None:
                intersected = ilist
            intersected = list(set(ilist) & set(intersected))
    return intersected


def match_records(records, qrystrs=None, perform_request_search_mode="eee", \
                  operator="a", verbose=1, server_url=CFG_SITE_URL, modify=0):
    """ Match passed records with existing records on a local or remote Invenio
    installation. Returns which records are new (no match), which are matched,
    which are ambiguous and which are fuzzy-matched. A formatted result of each
    records matching are appended to each record tuple:
    (record, status_code, list_of_errors, result)

    Every record is evaluated once per query string, so with several query
    strings the same record can be reported more than once. If the first
    search value extracted from a record is the empty string, the record is
    not reported in any list for that query string.

    @param records: records to analyze
    @type records: list of records

    @param qrystrs: Querystrings. None or an empty list means: use the default
                    query string. The passed list is not modified.
    @type qrystrs: list of object

    @param server_url: which server to search on. Local installation by default
    @type server_url: str

    @param perform_request_search_mode: run the query in this mode
    @type perform_request_search_mode: string

    @param operator: "o" "a"
    @type operator: str

    @param verbose: be loud
    @type verbose: int

    @param modify: output modified records of matches; when true, the 001
                   field of exact- and fuzzy-matched records is set (in place)
                   to the first matched record ID
    @type modify: int

    @rtype: list of lists
    @return an array of arrays of records, like this [newrecs,matchedrecs,
                                                      ambiguousrecs,fuzzyrecs]
    """

    server = InvenioConnector(server_url)

    newrecs = []
    matchedrecs = []
    ambiguousrecs = []
    fuzzyrecs = []

    # Fall back to a single empty query string (= default query). Rebinding a
    # local list here avoids appending to the list object owned by the caller.
    if not qrystrs:
        qrystrs = [""]

    record_counter = 0
    for rec in records:
        record_counter += 1
        if verbose > 1:
            sys.stderr.write("\n Processing record: #%d .." % record_counter)

        for qrystr in qrystrs:
            querystring = Querystring()
            querystring.default()

            if qrystr != "":
                querystring.from_qrystr(qrystr,
                                        perform_request_search_mode,
                                        operator)
            else:
                # NOTE(review): default() was already called above; the second
                # call is kept because Querystring is not visible here and the
                # original call sequence is preserved.
                querystring.default()

            querystring.search_engine_encode()

            ### get field values for record instance
            inst = []

            ### get appropriate fields from database
            for field in querystring.field:
                tags = get_field_tags(field)
                if tags:
                    # Fetch value from input record of first tag only
                    # FIXME: Extracting more then first tag, evaluating each
                    field = tags[0]
                ### use expanded tags
                tag = field[0:3]
                ind1 = field[3:4]
                ind2 = field[4:5]
                code = field[5:6]

                # "_" and "%" are placeholders: empty indicator / subfield "a"
                if ind1 in ("_", "%"):
                    ind1 = ""
                if ind2 in ("_", "%"):
                    ind2 = ""
                if code in ("_", "%"):
                    code = "a"

                if field != "001":
                    finsts = record_get_field_instances(rec[0], tag, ind1, ind2)
                    inst.append(get_subfield(finsts, code))
                else:
                    # controlfield 001: no indicators and no subfield code
                    inst.append(record_get_field_values(rec[0], field, ind1="",
                                                        ind2="", code=""))

            ### format acquired field values
            for idx, value in enumerate(inst):
                for fmt in querystring.format[idx]:
                    value = bibconvert.FormatField(value, fmt)
                inst[idx] = value

            ### perform the search
            if inst[0] == "":
                # nothing to search for with this query string
                continue

            p1 = inst[0]
            f1 = querystring.field[0]
            m1 = querystring.mode[0]
            op1 = querystring.operator[0]

            p2 = inst[1]
            f2 = querystring.field[1]
            m2 = querystring.mode[1]
            op2 = querystring.operator[1]

            p3 = inst[2]
            f3 = querystring.field[2]
            m3 = querystring.mode[2]

            #1st run the basic perform_req_search
            recID_list = server.search(
                p1=p1, f1=f1, m1=m1, op1=op1,
                p2=p2, f2=f2, m2=m2, op2=op2,
                p3=p3, f3=f3, m3=m3, of='id')

            if verbose > 8:
                sys.stderr.write("\nperform_request_search with values" +
                 " p1=" + str(p1) + " f1=" + str(f1) + " m1=" + str(m1) + " op1=" + str(op1) +
                 " p2=" + str(p2) + " f2=" + str(f2) + " m2=" + str(m2) + " op2=" + str(op2) +
                 " p3=" + str(p3) + " f3=" + str(f3) + " m3=" + str(m3) +
                 " result=" + str(recID_list) + "\n")

            if len(recID_list) > 1: #ambig match
                ambiguousrecs.append(rec + (match_result_output(recID_list,
                                            server_url, querystring, "ambiguous-matched"), ))
                if verbose > 8:
                    sys.stderr.write("ambiguous\n")
            elif len(recID_list) == 1: #match
                if modify:
                    _set_matched_recid(rec[0], recID_list[0])
                matchedrecs.append(rec + (match_result_output(recID_list,
                                          server_url, querystring, "exact-matched"), ))
                if verbose > 8:
                    sys.stderr.write("match\n")
            else: #no match..
                #try fuzzy matching: check if all the words appear in the
                #field of interest
                intersected = _fuzzy_match_recids(
                    server, ((p1, f1), (p2, f2), (p3, f3)), verbose)

                if intersected:
                    #this was a fuzzy match
                    if modify:
                        _set_matched_recid(rec[0], intersected[0])
                    fuzzyrecs.append(rec + (match_result_output(intersected,
                                            server_url, querystring, "fuzzy-matched"), ))
                    if verbose > 8:
                        sys.stderr.write("fuzzy\n")
                else:
                    #no match
                    newrecs.append(rec + (match_result_output(recID_list,
                                          server_url, querystring), ))
                    if verbose > 8:
                        sys.stderr.write("new\n")
    #return results
    return [newrecs, matchedrecs, ambiguousrecs, fuzzyrecs]