Python sub Examples, invenio.docextract_text.re_group_captured_multiple_space.sub Python Examples

Example #1

0

Show file

def mangle_value(kb, value):
    """Normalise ``value`` and translate it through the mapping in ``kb[1]``.

    The value is upper-cased, every match of ``re_punctuation`` and of
    ``re_group_captured_multiple_space`` is replaced by a single space,
    and surrounding whitespace is stripped.  When the result is a key of
    ``kb[1]`` the mapped entry is returned; otherwise the normalised
    value itself is returned.
    """
    cleaned = value.upper()
    cleaned = re_punctuation.sub(u' ', cleaned)
    cleaned = re_group_captured_multiple_space.sub(u' ', cleaned).strip()

    titles = kb[1]
    if cleaned not in titles:
        # Unknown title: hand back the normalised form unchanged.
        return cleaned
    return titles[cleaned]

Example #2

0

Show file

File: docextract_convert_journals.py Project: aw-bib/tind-invenio

def mangle_value(kb, value):
    """Return the ``kb[1]`` entry for the normalised form of ``value``.

    Normalisation upper-cases the value, substitutes a single space for
    each match of ``re_punctuation`` and then of
    ``re_group_captured_multiple_space``, and strips the ends.  If the
    normalised text is not a key of ``kb[1]``, the normalised text is
    returned as is.
    """
    upper_cased = value.upper()
    without_punctuation = re_punctuation.sub(u' ', upper_cased)
    key = re_group_captured_multiple_space.sub(u' ', without_punctuation)
    key = key.strip()

    titles_by_key = kb[1]
    return titles_by_key[key] if key in titles_by_key else key

Example #3

0

Show file

def normalize_journal_name(value=None):
    """Look up a journal name in the 'journals' knowledge base.

    The lookup key is ``value`` upper-cased, with matches of
    ``re_punctuation`` and ``re_group_captured_multiple_space`` replaced
    by a single space, and stripped.  The second item of
    ``get_kbs()['journals']`` is used as the key -> title mapping.

    Returns an empty string when ``value`` is None, the mapped title when
    the key is known, and the original (un-normalised) ``value`` otherwise.
    """
    if value is None:
        return ''

    lookup_key = re_group_captured_multiple_space.sub(
        u' ', re_punctuation.sub(u' ', value.upper())).strip()

    journals_kb = get_kbs()['journals']
    titles = journals_kb[1]
    # Fall back to the caller's text, not the normalised key.
    return titles.get(lookup_key, value)

Example #4

0

Show file

def build_journals_kb(knowledgebase):
    """Given the path to a knowledge base file, read in the contents
       of that file into a dictionary of search->replace word phrases.
       The search phrases are compiled into a regex pattern object.
       The knowledge base file should consist only of lines that take
       the following format:
         seek-term       ---   replace-term
       (i.e. a seek phrase on the left hand side, a replace phrase on
       the right hand side, with the two phrases being separated by 3
       hyphens.) E.g.:
         ASTRONOMY AND ASTROPHYSICS              ---Astron. Astrophys.

       The left-hand side term is a non-standard version of the title,
       whereas the right-hand side term is the standard version.
       If the KB file cannot be read from, or an unexpected line is
       encountered in the KB, an error
       message is output to standard error and execution is halted with
       an error-code 0.

       @param fpath: (string) the path to the knowledge base file.
       @return: (tuple) containing a list and a dictionary. The list
        contains compiled regex patterns used as search terms and will
        be used to force searching order to match that of the knowledge
        base.
        The dictionary contains the search->replace terms.  The keys of
        the dictionary are the compiled regex word phrases used for
        searching in the reference lines; The values in the dictionary are
        the replace terms for matches.
    """
    # Initialise vars:
    # dictionary of search and replace phrases from KB:
    kb = {}
    standardised_titles = {}
    seek_phrases = []
    # A dictionary of "replacement terms" (RHS) to be inserted into KB as
    # "seek terms" later, if they were not already explicitly added
    # by the KB:
    repl_terms = {}

    write_message('Processing journals kb', verbose=3)
    for seek_phrase, repl in knowledgebase:
        # We match on a simplified line, thus dots are replaced
        # with spaces
        seek_phrase = seek_phrase.replace('.', ' ').decode('utf-8').upper()

        # good KB line
        # Add the 'replacement term' into the dictionary of
        # replacement terms:
        repl_terms[repl] = None

        # add the phrase from the KB if the 'seek' phrase is longer
        # compile the seek phrase into a pattern:
        seek_ptn = re.compile(ur'(?<!\w)(%s)\W' % re.escape(seek_phrase),
                              re.UNICODE)

        kb[seek_phrase] = seek_ptn
        standardised_titles[seek_phrase] = repl
        seek_phrases.append(seek_phrase)

    # Now, for every 'replacement term' found in the KB, if it is
    # not already in the KB as a "search term", add it:
    for repl_term in repl_terms.keys():
        raw_repl_phrase = repl_term.upper()
        raw_repl_phrase = re_punctuation.sub(u' ', raw_repl_phrase)
        raw_repl_phrase = \
             re_group_captured_multiple_space.sub(u' ', raw_repl_phrase)
        raw_repl_phrase = raw_repl_phrase.strip()
        if raw_repl_phrase not in kb:
            # The replace-phrase was not in the KB as a seek phrase
            # It should be added.
            pattern = ur'(?<!\/)\b(%s)[^A-Z0-9]' % re.escape(raw_repl_phrase)
            seek_ptn = re.compile(pattern, re.U)
            kb[raw_repl_phrase] = seek_ptn
            standardised_titles[raw_repl_phrase] = repl_term
            seek_phrases.append(raw_repl_phrase)

    # Sort the titles by string length (long - short)
    seek_phrases.sort(_cmp_bystrlen_reverse)

    write_message('Processed journals kb', verbose=3)

    # return the raw knowledge base:
    return kb, standardised_titles, seek_phrases

Example #5

0

Show file

File: refextract_kbs.py Project: BessemAamira/invenio

def build_journals_kb(knowledgebase):
    """Given the path to a knowledge base file, read in the contents
       of that file into a dictionary of search->replace word phrases.
       The search phrases are compiled into a regex pattern object.
       The knowledge base file should consist only of lines that take
       the following format:
         seek-term       ---   replace-term
       (i.e. a seek phrase on the left hand side, a replace phrase on
       the right hand side, with the two phrases being separated by 3
       hyphens.) E.g.:
         ASTRONOMY AND ASTROPHYSICS              ---Astron. Astrophys.

       The left-hand side term is a non-standard version of the title,
       whereas the right-hand side term is the standard version.
       If the KB file cannot be read from, or an unexpected line is
       encountered in the KB, an error
       message is output to standard error and execution is halted with
       an error-code 0.

       @param fpath: (string) the path to the knowledge base file.
       @return: (tuple) containing a list and a dictionary. The list
        contains compiled regex patterns used as search terms and will
        be used to force searching order to match that of the knowledge
        base.
        The dictionary contains the search->replace terms.  The keys of
        the dictionary are the compiled regex word phrases used for
        searching in the reference lines; The values in the dictionary are
        the replace terms for matches.
    """
    # Initialise vars:
    # dictionary of search and replace phrases from KB:
    kb = {}
    standardised_titles = {}
    seek_phrases = []
    # A dictionary of "replacement terms" (RHS) to be inserted into KB as
    # "seek terms" later, if they were not already explicitly added
    # by the KB:
    repl_terms = {}

    write_message('Processing journals kb', verbose=3)
    for seek_phrase, repl in knowledgebase:
        # We match on a simplified line, thus dots are replaced
        # with spaces
        seek_phrase = seek_phrase.replace('.', ' ').upper()

        # good KB line
        # Add the 'replacement term' into the dictionary of
        # replacement terms:
        repl_terms[repl] = None

        # add the phrase from the KB if the 'seek' phrase is longer
        # compile the seek phrase into a pattern:
        seek_ptn = re.compile(ur'(?<!\w)(%s)\W' % re.escape(seek_phrase),
                              re.UNICODE)

        kb[seek_phrase] = seek_ptn
        standardised_titles[seek_phrase] = repl
        seek_phrases.append(seek_phrase)

    # Now, for every 'replacement term' found in the KB, if it is
    # not already in the KB as a "search term", add it:
    for repl_term in repl_terms.keys():
        raw_repl_phrase = repl_term.upper()
        raw_repl_phrase = re_punctuation.sub(u' ', raw_repl_phrase)
        raw_repl_phrase = \
             re_group_captured_multiple_space.sub(u' ', raw_repl_phrase)
        raw_repl_phrase = raw_repl_phrase.strip()
        if raw_repl_phrase not in kb:
            # The replace-phrase was not in the KB as a seek phrase
            # It should be added.
            pattern = ur'(?<!\/)\b(%s)[^A-Z0-9]' % re.escape(raw_repl_phrase)
            seek_ptn = re.compile(pattern, re.U)
            kb[raw_repl_phrase] = seek_ptn
            standardised_titles[raw_repl_phrase] = repl_term
            seek_phrases.append(raw_repl_phrase)

    # Sort the titles by string length (long - short)
    seek_phrases.sort(_cmp_bystrlen_reverse)

    write_message('Processed journals kb', verbose=3)

    # return the raw knowledge base:
    return kb, standardised_titles, seek_phrases