Example #1
File: htrc.py Project: inpho/vsm
def rm_lb_hyphens(plain_root, logger, ignore=[".json", ".log", ".err"]):
    """
    Looks for a hyphen followed by whitespace or a line break.

    Reconstructs the word and checks whether the result exists in either
    WordNet or the OS's default spellchecker dictionary. If so, replaces
    the fragments with the reconstructed word.
    
    :param plain_root: The name of the directory containing plain-text 
        files.
    :type plain_root: string
    
    :param logger: Logger that handles logging for the given directory.
    :type logger: Logger
    
    :param ignore: List of file extensions to ignore in the directory.
    :type ignore: list of strings, optional

    :returns: None
    """

    try:
        d = enchant.Dict("en_US")
    except enchant.errors.DictNotFoundError:
        # Fall back to WordNet-only checks when no en_US dictionary is
        # installed.
        d = None

    def recon(match_obj):
        rc_word = match_obj.group(1) + match_obj.group(2)

        if wn.synsets(rc_word) or (d and d.check(rc_word)):
            logger.info("\nbook: %s\nreconstructed word:\n%s\n", plain_root, rc_word)
            return rc_word

        logger.info(
            "\nbook: %s\nignored expression:\nleft: %s\nright: %s\n", plain_root, match_obj.group(1), match_obj.group(2)
        )

        return match_obj.group(0)

    def inner(s):
        # A hyphen followed by whitespace (including a line break) between
        # two word fragments.
        lb_hyphenated = re.compile(r"(\w+)-\s+(\w+)")
        return lb_hyphenated.sub(recon, s)

    page_files = os.listdir(plain_root)
    page_files = filter_by_suffix(page_files, ignore)

    for page_file in page_files:
        filename = os.path.join(plain_root, page_file)

        # "r+" opens the file for in-place rewriting; truncate() below
        # discards any leftover bytes when the cleaned page is shorter.
        with open(filename, "r+") as f:
            page = f.read()
            page = inner(page)
            f.seek(0)
            f.write(page)
            f.truncate()
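
A minimal, self-contained sketch of the reconstruction step above, using a toy word set in place of the WordNet and enchant lookups (both, along with re, os, and filter_by_suffix, are assumed to be imported at module level in htrc.py):

import re

# Hypothetical stand-in for the WordNet/spellchecker dictionary checks.
KNOWN_WORDS = {"reconstruction", "hyphen"}

def recon(match):
    word = match.group(1) + match.group(2)
    # Keep the joined form only if the stand-in dictionary recognizes it;
    # otherwise leave the original hyphenated fragments untouched.
    return word if word in KNOWN_WORDS else match.group(0)

text = "recon- struction of a hy- phen but not a non- word"
print(re.sub(r"(\w+)-\s+(\w+)", recon, text))
# -> reconstruction of a hyphen but not a non- word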
Example #2
File: htrc.py Project: inpho/vsm
def url_metadata(corpus, ctx_type, coll_dir):
    """
    Returns a list of urls whose order matches the existing metadata.
    The list can be added to a Corpus object with the add_metadata
    function in vsm.ext.corpusbuilders.util.

    :param corpus: Corpus to add url metadata to. Urls match the existing
        metadata of `corpus`.
    :type corpus: Corpus

    :param ctx_type: A type of tokenization.
    :type ctx_type: string

    :param coll_dir: Path for the collection directory. Either htrc 86 plain
        or htrc 1315 plain directory.
    :type coll_dir: string

    :returns: List of urls to be added to `corpus`.

    :See Also: :meth:`add_metadata`
    """

    import json
    from vsm.viewer import doc_label_name
    import re

    urls = []
    corp_md = corpus.view_metadata("book")
    book_labels = corp_md[doc_label_name("book")]

    for book_label in book_labels:
        coll_path = os.path.join(coll_dir, book_label)
        booklist = os.listdir(coll_path)
        book = filter_by_suffix(booklist, ignore=[".txt", ".pickle"])

        book_path = os.path.join(coll_path, book[0])
        with open(book_path, "r") as f:
            d = json.load(f)
            url = ""
            li = sorted(d["items"], key=lambda k: int(k["lastUpdate"]))
            url = li[-1]["itemURL"]

            if ctx_type == "book":
                urls.append(unidecode(url))
            else:  # urls for pages
                page_md = corpus.view_metadata("page")
                files = [a for a in page_md["file"] if a.startswith(book_label)]
                # The last run of digits in each filename is the page number.
                nums = [re.findall(r"[1-9][0-9]*", a)[-1] for a in files]
                for i in nums:
                    s = url + "?urlappend=%3Bseq={0}".format(i)
                    urls.append(unidecode(s))
    return urls
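
A minimal sketch of the per-page URL construction above, with a hypothetical record standing in for a book's JSON metadata file:

# Hypothetical metadata record; real files come from the HTRC collection.
record = {
    "items": [
        {"lastUpdate": "20100101", "itemURL": "http://example.org/old"},
        {"lastUpdate": "20120101", "itemURL": "http://example.org/new"},
    ]
}

# Pick the most recently updated item, as url_metadata does.
li = sorted(record["items"], key=lambda k: int(k["lastUpdate"]))
url = li[-1]["itemURL"]

# Page urls append an escaped ';seq=<page number>' to the book url.
for seq in ["1", "2", "15"]:
    print(url + "?urlappend=%3Bseq={0}".format(seq))
# -> http://example.org/new?urlappend=%3Bseq=1  (and so on)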
Example #3
File: htrc.py Project: inpho/vsm
def proc_htrc_coll(coll_dir, ignore=[".json", ".log", ".err"]):
    """
    Given a collection, cleans up plain page files for books in the collection.

    :param coll_dir: The path of the collection directory.
    :type coll_dir: string
    
    :param ignore: List of file extensions to ignore in the directory.
    :type ignore: list of strings, optional

    :returns: None

    :See Also: :meth:`proc_htrc_book`
    """
    books = os.listdir(coll_dir)
    books = filter_by_suffix(books, ignore)

    for book in books:
        # For debugging
        # if book == 'uc2.ark+=13960=t1zc80k1p':
        # if book == 'uc2.ark+=13960=t8tb11c8g':
        # if book == 'uc2.ark+=13960=t74t6gz6r':
        proc_htrc_book(book, coll_dir, ignore=ignore)
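
Hypothetical usage, assuming a local collection directory laid out as the function expects (one subdirectory per book); proc_htrc_book is defined elsewhere in htrc.py:

# "/data/htrc-86-plain" is an assumed path, not part of the project.
proc_htrc_coll("/data/htrc-86-plain")

# Further extensions can be skipped beyond the defaults.
proc_htrc_coll("/data/htrc-86-plain", ignore=[".json", ".log", ".err", ".bak"])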
Example #4
File: htrc.py Project: inpho/vsm
def rm_pg_headers(plain_root, logger, bound=1, ignore=[".json", ".log", ".err"]):
    """
    Tries to detect repeated page headers (e.g., chapter titles). If
    found, removes them.

    The routine takes the first non-empty line of each page, strips it
    of numbers and punctuation, and computes frequencies of the reduced
    strings. If the frequency of a reduced string exceeds `bound`, the
    corresponding first lines are considered headers.
    
    :param plain_root: The name of the directory containing plain-text 
        files.
    :type plain_root: string
    
    :param logger: Logger that handles logging for the given directory.
    :type logger: Logger
    
    :param bound: Frequency threshold for a reduced string. If the string
        appears more than `bound` times, the corresponding first lines are
        considered headers. Default is 1.
    :type bound: int, optional

    :param ignore: List of file extensions to ignore in the directory.
    :type ignore: list of strings, optional

    :returns: None
    """
    page_files = os.listdir(plain_root)
    page_files = filter_by_suffix(page_files, ignore)

    # Get first non-empty lines
    first_lines = []
    fl = re.compile(r"^\s*([^\n]*)\n")

    for page_file in page_files:
        page_file = os.path.join(plain_root, page_file)

        with open(page_file, "r") as f:
            page = f.read()

        first_line = fl.match(page)
        if first_line is None:
            first_lines.append("")
        else:
            first_lines.append(first_line.group(0))

    # Remove capitalization, roman numerals up to xxxix, punctuation, and
    # arabic numerals from the first lines
    for i in range(len(first_lines)):
        line = first_lines[i]
        line = line.lower()

        # An overzealous arabic numeral detector (OCR errors include
        # `i` for `1` for example)
        line = re.sub(r"\b\S*\d+\S*\b", "", line)

        # Roman numerals i to xxxix
        line = re.sub(r"\b(x{0,3})(ix|iv|v?i{0,3})\b", "", line)

        # Collapse line to letters only
        line = re.sub(r"[^a-z]", "", line)
        first_lines[i] = (first_lines[i], line)

    freqs = dict()
    for line, reduced in first_lines:
        if reduced in freqs:
            freqs[reduced] += 1
        else:
            freqs[reduced] = 1

    for i, page_file in enumerate(page_files):
        filename = os.path.join(plain_root, page_file)
        line, reduced = first_lines[i]

        if freqs[reduced] > bound:
            with open(filename, "r") as f:
                page = f.read()
            if page:
                logger.info(
                    u"\nbook: %s\nfile: %s\nremoved header:\n%s\n",
                    unidecode(plain_root),
                    unidecode(page_file),
                    unidecode(line),
                )
            page = fl.sub("", page)

            with open(filename, "w") as f:
                f.write(page)
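
A minimal sketch of the header-reduction step above: two first lines that differ only in page number and roman numeral collapse to the same reduced key, so their shared frequency can exceed `bound` and mark both as headers:

import re

def reduce_line(line):
    line = line.lower()
    line = re.sub(r"\b\S*\d+\S*\b", "", line)                 # arabic numerals
    line = re.sub(r"\b(x{0,3})(ix|iv|v?i{0,3})\b", "", line)  # i to xxxix
    return re.sub(r"[^a-z]", "", line)                        # letters only

print(reduce_line("CHAPTER IV.  42"))  # -> chapter
print(reduce_line("Chapter ix"))       # -> chapter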