Example #1
0
def wash_and_repair_reference_line(line):
    """Wash a reference line of undesirable characters (such as poorly-encoded
       letters, etc), and repair any errors (such as broken URLs) if possible.

       The steps run in a fixed order: URL repair, character replacement,
       relocation of a comma typed inside a closing double quote, a second
       character-replacement pass, and finally collapsing of repeated spaces.
       @param line: (string) the reference line to be washed/repaired.
       @return: (string) the washed reference line.
    """
    # Repair URLs in line:
    line = repair_broken_urls(line)
    # Replace various undesirable characters with their alternatives:
    line = replace_undesirable_characters(line)
    # Replace "<title>," with "<title>", (comma typed inside the closing
    # quote is a common typing mistake).
    # The literals use the u'' prefix with an escaped backslash instead of
    # ur'': the ur'' prefix is a syntax error on Python 3, while this
    # spelling yields the same pattern/replacement text on Python 2 and 3.
    line = re.sub(u'"([^"]+),"', u'"\\g<1>",', line)
    # NOTE(review): this second pass looks redundant with the one above;
    # kept to preserve behaviour - confirm before removing.
    line = replace_undesirable_characters(line)
    # Remove instances of multiple spaces from line, replacing with a
    # single space:
    line = re_multiple_space.sub(u' ', line)
    return line
Example #2
0
def wash_and_repair_reference_line(line):
    """Wash a reference line of undesirable characters (such as poorly-encoded
       letters, etc), and repair any errors (such as broken URLs) if possible.

       Washing is a pipeline of five passes over the line, each feeding the
       next: broken-URL repair, undesirable-character replacement, moving a
       comma out of a closing double quote, undesirable-character
       replacement again, and multiple-space removal.
       @param line: (string) the reference line to be washed/repaired.
       @return: (string) the washed reference line.
    """
    # Repair URLs in line:
    line = repair_broken_urls(line)
    # Replace various undesirable characters with their alternatives:
    line = replace_undesirable_characters(line)
    # Common typing mistake: "<title>," should read "<title>",
    # Written with u'' literals (backslash escaped by hand) rather than
    # ur'', which Python 3 rejects at parse time; the resulting strings are
    # identical to the former raw literals.
    line = re.sub(u'"([^"]+),"', u'"\\g<1>",', line)
    # NOTE(review): repeated call kept as in the original sequence; whether
    # it is still required after the comma fix is not visible from here.
    line = replace_undesirable_characters(line)
    # Remove instances of multiple spaces from line, replacing with a
    # single space:
    line = re_multiple_space.sub(u" ", line)
    return line
Example #3
0
def normalize_fulltext(fulltext):
    """Return a 'cleaned' version of the output provided by pdftotext.

    The text is wrapped in a leading and a trailing space, passed through
    the undesirable-character and greek-character replacements, and then
    through every (regex, replacement) pair given by get_washing_regex().
    """
    # Keywords are recognized by their surrounding spaces, so pad the text
    # to let the first and last words of the document match as well.
    padded = " " + fulltext + " "

    # Weird unicode characters are replaced first, then greek characters
    # are replaced by their names.
    cleaned = _replace_greek_characters(
        replace_undesirable_characters(padded))

    # Run each washing substitution in turn over the text.
    for pattern, substitute in get_washing_regex():
        cleaned = pattern.sub(substitute, cleaned)

    return cleaned
def normalize_fulltext(fulltext):
    """Return a 'cleaned' version of the output provided by pdftotext.

    Cleaning pads the text with one space on each side, replaces
    undesirable and greek characters, and applies the washing regular
    expressions in the order get_washing_regex() supplies them.
    """
    # A space on both ends is needed because keywords are recognized by
    # the spaces around them; without it the first and last words of the
    # document could not match.
    text = " " + fulltext + " "

    # Character-level clean-up: weird unicode characters, then greek
    # characters (replaced by their name).
    text = replace_undesirable_characters(text)
    text = _replace_greek_characters(text)

    # Regex-level clean-up: each entry is a (compiled regex, replacement)
    # pair applied to the result of the previous one.
    substitutions = get_washing_regex()
    for matcher, new_text in substitutions:
        text = matcher.sub(new_text, text)

    return text