def test_limw_default(self):
    """bibconvert - LIMW(,)

    When no limiting character is given, the value is expected to come
    back unchanged, with or without an explicit direction argument.
    """
    source = "ab cd xx 12 34"
    for spec in ("LIMW(,)", "LIMW(,R)"):
        self.assertEqual(source, bibconvert.FormatField(source, spec))
def test_limw_left_regex(self):
    """bibconvert - LIMW(c,L) with regular expression

    Exercises the ``L`` direction with a ``//regex//`` delimiter: the part
    of the value to the left of the first match is dropped, mirroring the
    literal-delimiter expectation for ``LIMW( ,L)``. A pattern that does
    not match is expected to leave the value untouched.
    """
    test_input = "ab cd xx 12 34"
    # Raw strings keep the backslash of ``\s`` literal without relying on
    # Python passing unknown escape sequences through unchanged.
    self.assertEqual(
        " cd xx 12 34",
        bibconvert.FormatField(test_input, r"LIMW(//\s//,L)"))
    # None of '!', '_' or '-' occurs in the input, so nothing is cut.
    self.assertEqual(
        test_input,
        bibconvert.FormatField(test_input, r"LIMW(//[!_-]//,L)"))
def test_words_right(self):
    """bibconvert - WORDS(n,R)

    Each case is (expected output, input value, format specification).
    """
    cases = (
        ("ab cd", "ab cd xx 12 34", "WORDS(2,R)"),
        ("Sep", "Sep 1999", "WORDS(1,R)"),
    )
    for expected, value, spec in cases:
        self.assertEqual(expected, bibconvert.FormatField(value, spec))
def test_words_left(self):
    """bibconvert - WORDS(n,L)

    Each case is (expected output, input value, format specification).
    """
    cases = (
        ("12 34", "ab cd xx 12 34", "WORDS(2,L)"),
        ("1999", "Sep 1999", "WORDS(1,L)"),
    )
    for expected, value, spec in cases:
        self.assertEqual(expected, bibconvert.FormatField(value, spec))
def test_limw_right(self):
    """bibconvert - LIMW(c,R)

    Each case is (expected output, input value, format specification);
    the expected output ends with the limiting character itself.
    """
    cases = (
        ("ab ", "ab cd xx 12 34", "LIMW( ,R)"),
        ("sep_", "sep_1999", "LIMW(_,R)"),
    )
    for expected, value, spec in cases:
        self.assertEqual(expected, bibconvert.FormatField(value, spec))
def test_lim_left(self):
    """bibconvert - LIM(n,L)

    Both inputs are formatted with the same specification; the expected
    outputs are the last four characters of each input.
    """
    spec = "LIM(4,L)"
    expectations = (
        ("ab cd xx 12 34", "2 34"),
        ("sep_1999", "1999"),
    )
    for value, expected in expectations:
        self.assertEqual(expected, bibconvert.FormatField(value, spec))
def test_ff_regex(self):
    """bibconvert - formatting functions with regular expression

    Table-driven: each row is (expected output, input value, format
    specification), where the specification carries a ``//regex//``
    argument. Rows are checked in order and the first mismatch fails
    the test.
    """
    cases = (
        ("Hello world!", "Hellx wyrld!", "REP(//[xy]//,o)"),
        ("Hello world!", "Hello world!", "REP(//[abc]//,o)"),
        ("Hello world!", "Hello world! @", "EXP(//[@_]//,1)"),
        ("Hello world!", "Hello world! abc", "EXP(//[oz]+//,0)"),
        ("Hello world!", "Hello world!", "EXP(//[abc]+//,1)"),
        ("lala", "Hello world!", "IF(//^Hello .*!$//,lala,lolo)"),
        ("lolo", "Hello world!", "IF(//^Hello .*x$//,lala,lolo)"),
    )
    for expected, value, spec in cases:
        self.assertEqual(expected, bibconvert.FormatField(value, spec))
def create_query(self, record, qrystr="[title]"):
    """
    Main method that parses and generates a search query from
    given query-string structure and record data. Returns the
    resulting query-string and completeness determination as a tuple.

    Side effects: sets self.pattern, self.fields and self.query, and
    calls self._clean_query() when self.clean is true.

    @param record: bibrecord to retrieve field-values from
    @type record: dict

    @param qrystr: proper query string template. (i.e. title:[245__a])
                   defaults to: [title]
    @type qrystr: str

    @return: (query-string, complete flag); the flag is False when at
             least one referenced field yielded no value from the record
    @rtype: tuple
    """
    if qrystr == "":
        qrystr = "[title]"
    if "||" in qrystr or "[" not in qrystr:
        # Assume old style query-strings
        qrystr = self._convert_qrystr(qrystr)

    # FIXME: Convert to lower case, we do this to account for fuzzy_parser
    # which treats everything lower-case, and may cause KeyError when
    # retrieving data from the self.fields dict.
    # Also BibConvert formats are currently case sensitive.
    self.pattern = qrystr.lower()
    self.fields = {}
    fieldtags_found = []

    # Find all potential references to record tag values and
    # add to fields-dict as a list of values using fieldname as key
    for field_reference in re_querystring.findall(qrystr):
        # First we see if there is any special formats for this
        # field_reference. This is done before transforming to lower case,
        # as BibConvert formats are case-sensitive
        fieldname = self._extract_formats(field_reference)
        self.pattern = self.pattern.replace(
            "[%s]" % (field_reference.lower(), ),
            "[%s]" % (fieldname, ))

        # Find proper MARC tag(s) for the fieldname; fall back to treating
        # the fieldname itself as a tag when no mapping is returned
        tag_list = get_field_tags_from_fieldname(fieldname)
        if not tag_list:
            tag_list = [fieldname]

        for field in tag_list:
            # Check if it is really a reference to a tag to not confuse
            # with e.g. regex syntax
            if re_valid_tag.match(field) is None:
                continue
            tag = field[0:3]
            ind1 = field[3:4]
            ind2 = field[4:5]
            code = field[5:6]
            # "_" and "%" indicators are passed on as empty strings
            if ind1 in ("_", "%"):
                ind1 = ""
            if ind2 in ("_", "%"):
                ind2 = ""
            value_list = record_get_field_values(
                record, tag, ind1, ind2, code)
            for value in value_list:
                if value.strip() != "":
                    # Apply formats if applicable
                    for aformat in self.formats.get(fieldname, []):
                        value = bibconvert.FormatField(value, aformat)
                    self.fields.setdefault(fieldname, []).append(
                        (fieldname, value))
            # Add fieldname to found tags, so we can check completeness later
            fieldtags_found.append(fieldname)

    # Is the query deemed complete? i.e. did we find data for all
    # field-name references
    complete = not [n for n in fieldtags_found if n not in self.fields]

    # Now determine the Cartesian product over all found values,
    # then iterate over each combination to generate proper query.
    # Duplicates are dropped while keeping first-seen order, so the joined
    # query does not depend on the iteration order of a set.
    seen_queries = set()
    all_queries = []
    for query in cproduct(self.fields.values()):
        new_query = self.pattern
        for fieldname, value in query:
            new_query = new_query.replace("[%s]" % (fieldname, ), value)
        if new_query not in seen_queries:
            seen_queries.add(new_query)
            all_queries.append(new_query)

    # Finally we concatenate all queries into one, delimited by chosen
    # operator
    self.query = self.operator.join(all_queries)

    if not complete:
        # Clean away field-name references not found
        for fieldtag in fieldtags_found:
            self.query = self.query.replace("[%s]" % (fieldtag, ), "")

    # Clean query?
    if self.clean:
        self._clean_query()
    return self.query, complete
def test_gff(self):
    """bibconvert - global formatting functions

    Checks that applying ``DEFP()`` to the sample text yields the same
    text.
    """
    sample = "Hello world!"
    formatted = bibconvert.FormatField(sample, "DEFP()")
    self.assertEqual("Hello world!", formatted)
def test_ff(self):
    """bibconvert - formatting functions

    Table-driven: each row is (expected output, input value, format
    specification). Rows are checked in order and the first mismatch
    fails the test.
    """
    cases = (
        ("Hello world!", "ello world", "ADD(H,!)"),
        ("Hello world!", "Hello world", "ABR(11,!)"),
        ("Hello world!", "xHello world!x", "CUT(x,x)"),
        ("Hello world!", "He11o wor1d!", "REP(1,l)"),
        ("Hello world!", "Hello world!", "SUP(NUM)"),
        ("Hello world!", "Hello world!", "LIM(12,R)"),
        ("Hello world!", "Hello world!", "WORDS(2)"),
        ("Hello world!", "Hello world!", "MINL(5)"),
        ("Hello world!", "Hello world!", "MAXL(12)"),
        ("Hello world!", "Hello world! @", "EXP(@,1)"),
        ("Hello world!", "Hello world!", "IF(Hello world!,ORIG,)"),
        ("", "Hello world!", "NUM()"),
        ("Hello world!", "Hello world! ", "SHAPE()"),
        ("HELLO WORLD!", "Hello world!", "UP()"),
        ("hello world!", "Hello world!", "DOWN()"),
        ("Hello World!", "Hello world!", "CAP()"),
    )
    for expected, value, spec in cases:
        self.assertEqual(expected, bibconvert.FormatField(value, spec))
def test_words_exceed_wordcount(self):
    """bibconvert - WORDS(2,R) when less then 2 words in value

    A single-word value holds fewer words than requested, so it is
    expected to be returned as is.
    """
    single_word = "ab"
    formatted = bibconvert.FormatField(single_word, "WORDS(2,R)")
    self.assertEqual(single_word, formatted)
def test_words_default(self):
    """bibconvert - WORDS(,)

    With both arguments left empty the value is expected to come back
    unchanged.
    """
    source = "ab cd xx 12 34"
    formatted = bibconvert.FormatField(source, "WORDS(,)")
    self.assertEqual(source, formatted)
def test_limw_left(self):
    """bibconvert - LIMW(c,L)

    The expected output starts at the first space of the input; everything
    to the left of it is gone.
    """
    source = "ab cd xx 12 34"
    formatted = bibconvert.FormatField(source, "LIMW( ,L)")
    self.assertEqual(" cd xx 12 34", formatted)
def test_regexp(self):
    """bibconvert - regular expressions

    Checks that ``RE(...)`` with a pattern covering the whole sample text
    returns that text.
    """
    sample = "Hello world!"
    formatted = bibconvert.FormatField(sample, "RE([A-Z][a-z].*!)")
    self.assertEqual("Hello world!", formatted)
def _set_matched_recid(record, recid):
    """
    Write recid (as a string) into the 001 controlfield of record.

    When the record already has a 001 field, the controlfield at global
    position 1 is modified; otherwise a new 001 field is added.
    """
    if record_has_field(record, '001'):
        record_modify_controlfield(record, '001',
                                   controlfield_value=str(recid),
                                   field_position_global=1)
    else:
        record_add_field(record, '001', controlfield_value=str(recid))


def match_records(records, qrystrs=None, perform_request_search_mode="eee",
                  operator="a", verbose=1, server_url=CFG_SITE_URL, modify=0):
    """
    Match passed records with existing records on a local or remote Invenio
    installation. Returns which records are new (no match), which are
    matched, which are ambiguous and which are fuzzy-matched. A formatted
    result of each records matching are appended to each record tuple:
    (record, status_code, list_of_errors, result)

    Messages are written to sys.stderr depending on verbose. When modify
    is true, the record object (rec[0]) of exact and fuzzy matches is
    changed in place to carry the matched record ID in field 001.

    @param records: records to analyze
    @type records: list of records

    @param qrystrs: Querystrings; None or an empty list means a single
                    default query
    @type qrystrs: list of object

    @param server_url: which server to search on. Local installation by
                       default
    @type server_url: str

    @param perform_request_search_mode: run the query in this mode
    @type perform_request_search_mode: string

    @param operator: "o" "a"
    @type operator: str

    @param verbose: be loud
    @type verbose: int

    @param modify: output modified records of matches
    @type modify: int

    @rtype: list of lists
    @return an array of arrays of records, like this
            [newrecs, matchedrecs, ambiguousrecs, fuzzyrecs]
    """
    server = InvenioConnector(server_url)

    newrecs = []
    matchedrecs = []
    ambiguousrecs = []
    fuzzyrecs = []

    # Normalise the query strings once, up front. Rebinding the local name
    # (rather than appending) leaves a caller-supplied empty list untouched.
    if not qrystrs:
        qrystrs = [""]

    for position, rec in enumerate(records):
        if verbose > 1:
            sys.stderr.write("\n Processing record: #%d .." % (position + 1))

        for qrystr in qrystrs:
            querystring = Querystring()
            querystring.default()
            if qrystr != "":
                querystring.from_qrystr(qrystr,
                                        perform_request_search_mode,
                                        operator)
            else:
                querystring.default()
            querystring.search_engine_encode()

            ### get field values for record instance
            inst = []

            ### get appropriate fields from database
            for field in querystring.field:
                tags = get_field_tags(field)
                if tags:
                    # Fetch value from input record of first tag only
                    # FIXME: Extracting more then first tag, evaluating each
                    field = tags[0]

                ### use expanded tags
                tag = field[0:3]
                ind1 = field[3:4]
                ind2 = field[4:5]
                code = field[5:6]

                if ind1 in ("_", "%"):
                    ind1 = ""
                if ind2 in ("_", "%"):
                    ind2 = ""
                if code in ("_", "%"):
                    code = "a"

                if field != "001":
                    finsts = record_get_field_instances(rec[0], tag,
                                                        ind1, ind2)
                    inst.append(get_subfield(finsts, code))
                else:
                    # Controlfield 001: take the field values directly
                    inst.append(record_get_field_values(rec[0], field,
                                                        ind1="", ind2="",
                                                        code=""))

            ### format acquired field values
            for index, value in enumerate(inst):
                for field_format in querystring.format[index]:
                    value = bibconvert.FormatField(value, field_format)
                inst[index] = value

            ### perform the search
            if inst[0] == "":
                # NOTE(review): a record whose first field value is empty
                # is not added to any of the four result lists -- confirm
                # this is intended.
                continue

            p1 = inst[0]
            f1 = querystring.field[0]
            m1 = querystring.mode[0]
            op1 = querystring.operator[0]

            p2 = inst[1]
            f2 = querystring.field[1]
            m2 = querystring.mode[1]
            op2 = querystring.operator[1]

            p3 = inst[2]
            f3 = querystring.field[2]
            m3 = querystring.mode[2]

            #1st run the basic perform_req_search
            recID_list = server.search(
                p1=p1, f1=f1, m1=m1, op1=op1,
                p2=p2, f2=f2, m2=m2, op2=op2,
                p3=p3, f3=f3, m3=m3, of='id')

            if verbose > 8:
                sys.stderr.write(
                    "\nperform_request_search with values"
                    + " p1=" + str(p1) + " f1=" + str(f1)
                    + " m1=" + str(m1) + " op1=" + str(op1)
                    + " p2=" + str(p2) + " f2=" + str(f2)
                    + " m2=" + str(m2) + " op2=" + str(op2)
                    + " p3=" + str(p3) + " f3=" + str(f3)
                    + " m3=" + str(m3)
                    + " result=" + str(recID_list) + "\n")

            # NOTE(review): classification happens once per query string,
            # so with several query strings the same record can be appended
            # to the result lists more than once.
            match_count = len(recID_list)
            if match_count > 1:
                #ambig match
                ambiguousrecs.append(rec + (match_result_output(
                    recID_list, server_url, querystring,
                    "ambiguous-matched"), ))
                if verbose > 8:
                    sys.stderr.write("ambiguous\n")
            elif match_count == 1:
                #match
                if modify:
                    _set_matched_recid(rec[0], recID_list[0])
                matchedrecs.append(rec + (match_result_output(
                    recID_list, server_url, querystring,
                    "exact-matched"), ))
                if verbose > 8:
                    sys.stderr.write("match\n")
            else:
                #no match.. try fuzzy matching: check if all the words
                #appear in the field of interest
                intersected = None
                words_per_field = (
                    (main_words_list(p1), f1),
                    (main_words_list(p2), f2),
                    (main_words_list(p3), f3),
                )
                for words, fieldname in words_per_field:
                    for word in words:
                        word = "'" + word + "'"
                        ilist = server.search(p=word, f=fieldname, of="id")
                        if verbose > 8:
                            sys.stderr.write(
                                "fuzzy perform_request_search with values"
                                + " p=" + str(word) + " f=" + str(fieldname)
                                + " res " + str(ilist) + "\n")
                        # Keep only IDs returned for every word so far
                        if intersected is None:
                            intersected = ilist
                        intersected = list(set(ilist) & set(intersected))

                if intersected:
                    #this was a fuzzy match
                    if modify:
                        _set_matched_recid(rec[0], intersected[0])
                    fuzzyrecs.append(rec + (match_result_output(
                        intersected, server_url, querystring,
                        "fuzzy-matched"), ))
                    if verbose > 8:
                        sys.stderr.write("fuzzy\n")
                else:
                    #no match
                    newrecs.append(rec + (match_result_output(
                        recID_list, server_url, querystring), ))
                    if verbose > 8:
                        sys.stderr.write("new\n")

    #return results
    return [newrecs, matchedrecs, ambiguousrecs, fuzzyrecs]