def mangle_value(kb, value):
    """Simplify a title and map it to its standard form when one is known.

    The title is upper-cased, every match of the module-level
    ``re_punctuation`` pattern is replaced by a single space, every match
    of ``re_group_captured_multiple_space`` is replaced by a single space,
    and surrounding whitespace is stripped.

    @param kb: indexable object whose item 1 is the mapping from simplified
        titles to standard titles.
    @param value: the title to simplify.
    @return: ``kb[1][simplified]`` when the simplified title is a key of
        ``kb[1]``; otherwise the simplified title itself.
    """
    simplified = re_group_captured_multiple_space.sub(
        u' ', re_punctuation.sub(u' ', value.upper())).strip()
    # Fall back to the simplified (not the original) title when unknown.
    return kb[1].get(simplified, simplified)
def normalize_journal_name(value=None):
    """Normalize a journal name via knowledgebase lookup.

    The lookup key is built from ``value`` by upper-casing it, replacing
    matches of ``re_punctuation`` and then of
    ``re_group_captured_multiple_space`` with a single space, and stripping
    surrounding whitespace.  The key is looked up in item 1 of the
    'journals' entry returned by ``get_kbs()``.

    @param value: the journal name to normalize, or None.
    @return: '' when ``value`` is None; the mapped title when the key is
        known; otherwise ``value`` exactly as it was passed in.
    """
    if value is None:
        return ''

    lookup_key = re_group_captured_multiple_space.sub(
        u' ', re_punctuation.sub(u' ', value.upper())).strip()
    # Unknown names are returned untouched, not in their simplified form.
    return get_kbs()['journals'][1].get(lookup_key, value)
def build_journals_kb(knowledgebase): """Given the path to a knowledge base file, read in the contents of that file into a dictionary of search->replace word phrases. The search phrases are compiled into a regex pattern object. The knowledge base file should consist only of lines that take the following format: seek-term --- replace-term (i.e. a seek phrase on the left hand side, a replace phrase on the right hand side, with the two phrases being separated by 3 hyphens.) E.g.: ASTRONOMY AND ASTROPHYSICS ---Astron. Astrophys. The left-hand side term is a non-standard version of the title, whereas the right-hand side term is the standard version. If the KB file cannot be read from, or an unexpected line is encountered in the KB, an error message is output to standard error and execution is halted with an error-code 0. @param fpath: (string) the path to the knowledge base file. @return: (tuple) containing a list and a dictionary. The list contains compiled regex patterns used as search terms and will be used to force searching order to match that of the knowledge base. The dictionary contains the search->replace terms. The keys of the dictionary are the compiled regex word phrases used for searching in the reference lines; The values in the dictionary are the replace terms for matches. 
""" # Initialise vars: # dictionary of search and replace phrases from KB: kb = {} standardised_titles = {} seek_phrases = [] # A dictionary of "replacement terms" (RHS) to be inserted into KB as # "seek terms" later, if they were not already explicitly added # by the KB: repl_terms = {} write_message('Processing journals kb', verbose=3) for seek_phrase, repl in knowledgebase: # We match on a simplified line, thus dots are replaced # with spaces seek_phrase = seek_phrase.replace('.', ' ').decode('utf-8').upper() # good KB line # Add the 'replacement term' into the dictionary of # replacement terms: repl_terms[repl] = None # add the phrase from the KB if the 'seek' phrase is longer # compile the seek phrase into a pattern: seek_ptn = re.compile(ur'(?<!\w)(%s)\W' % re.escape(seek_phrase), re.UNICODE) kb[seek_phrase] = seek_ptn standardised_titles[seek_phrase] = repl seek_phrases.append(seek_phrase) # Now, for every 'replacement term' found in the KB, if it is # not already in the KB as a "search term", add it: for repl_term in repl_terms.keys(): raw_repl_phrase = repl_term.upper() raw_repl_phrase = re_punctuation.sub(u' ', raw_repl_phrase) raw_repl_phrase = \ re_group_captured_multiple_space.sub(u' ', raw_repl_phrase) raw_repl_phrase = raw_repl_phrase.strip() if raw_repl_phrase not in kb: # The replace-phrase was not in the KB as a seek phrase # It should be added. pattern = ur'(?<!\/)\b(%s)[^A-Z0-9]' % re.escape(raw_repl_phrase) seek_ptn = re.compile(pattern, re.U) kb[raw_repl_phrase] = seek_ptn standardised_titles[raw_repl_phrase] = repl_term seek_phrases.append(raw_repl_phrase) # Sort the titles by string length (long - short) seek_phrases.sort(_cmp_bystrlen_reverse) write_message('Processed journals kb', verbose=3) # return the raw knowledge base: return kb, standardised_titles, seek_phrases
def build_journals_kb(knowledgebase): """Given the path to a knowledge base file, read in the contents of that file into a dictionary of search->replace word phrases. The search phrases are compiled into a regex pattern object. The knowledge base file should consist only of lines that take the following format: seek-term --- replace-term (i.e. a seek phrase on the left hand side, a replace phrase on the right hand side, with the two phrases being separated by 3 hyphens.) E.g.: ASTRONOMY AND ASTROPHYSICS ---Astron. Astrophys. The left-hand side term is a non-standard version of the title, whereas the right-hand side term is the standard version. If the KB file cannot be read from, or an unexpected line is encountered in the KB, an error message is output to standard error and execution is halted with an error-code 0. @param fpath: (string) the path to the knowledge base file. @return: (tuple) containing a list and a dictionary. The list contains compiled regex patterns used as search terms and will be used to force searching order to match that of the knowledge base. The dictionary contains the search->replace terms. The keys of the dictionary are the compiled regex word phrases used for searching in the reference lines; The values in the dictionary are the replace terms for matches. 
""" # Initialise vars: # dictionary of search and replace phrases from KB: kb = {} standardised_titles = {} seek_phrases = [] # A dictionary of "replacement terms" (RHS) to be inserted into KB as # "seek terms" later, if they were not already explicitly added # by the KB: repl_terms = {} write_message('Processing journals kb', verbose=3) for seek_phrase, repl in knowledgebase: # We match on a simplified line, thus dots are replaced # with spaces seek_phrase = seek_phrase.replace('.', ' ').upper() # good KB line # Add the 'replacement term' into the dictionary of # replacement terms: repl_terms[repl] = None # add the phrase from the KB if the 'seek' phrase is longer # compile the seek phrase into a pattern: seek_ptn = re.compile(ur'(?<!\w)(%s)\W' % re.escape(seek_phrase), re.UNICODE) kb[seek_phrase] = seek_ptn standardised_titles[seek_phrase] = repl seek_phrases.append(seek_phrase) # Now, for every 'replacement term' found in the KB, if it is # not already in the KB as a "search term", add it: for repl_term in repl_terms.keys(): raw_repl_phrase = repl_term.upper() raw_repl_phrase = re_punctuation.sub(u' ', raw_repl_phrase) raw_repl_phrase = \ re_group_captured_multiple_space.sub(u' ', raw_repl_phrase) raw_repl_phrase = raw_repl_phrase.strip() if raw_repl_phrase not in kb: # The replace-phrase was not in the KB as a seek phrase # It should be added. pattern = ur'(?<!\/)\b(%s)[^A-Z0-9]' % re.escape(raw_repl_phrase) seek_ptn = re.compile(pattern, re.U) kb[raw_repl_phrase] = seek_ptn standardised_titles[raw_repl_phrase] = repl_term seek_phrases.append(raw_repl_phrase) # Sort the titles by string length (long - short) seek_phrases.sort(_cmp_bystrlen_reverse) write_message('Processed journals kb', verbose=3) # return the raw knowledge base: return kb, standardised_titles, seek_phrases