Example #1
def text(cur, corpora=[], regions=[]):
    book_ids = tuple(corpora_to_book_ids(cur, corpora))
    if len(book_ids) == 0:
        raise UserError("No books to search", "error")
    rclass = rclass_id_lookup(cur)
    rclass_ids = tuple(rclass[name] for name in regions)

    if len(book_ids) > 1:
        raise UserError("Multiple books not supported", "error")

    for book_id in book_ids:
        yield ('header', {
            'content': get_book(cur, book_id, content=True)['content']
        })

    cur.execute(
        """
        SELECT (SELECT name FROM rclass WHERE rclass_id = r.rclass_id) rclass_name
             , r.crange
             , r.rvalue
          FROM region r
         WHERE r.book_id IN %(book_ids)s
           AND r.rclass_id IN %(rclass_ids)s
    """, dict(
            book_ids=book_ids,
            rclass_ids=rclass_ids,
        ))

    for rclass_name, crange, rvalue in cur:
        yield [rclass_name, crange.lower, crange.upper, rvalue]
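
Below is a minimal usage sketch, assuming a psycopg2 connection to the corpus database; the DSN, corpus name and region name are placeholder assumptions, not values taken from these examples:

import psycopg2

conn = psycopg2.connect("dbname=corpus")  # hypothetical DSN
with conn.cursor() as cur:
    # First item is the ('header', ...) tuple, then one
    # [rclass_name, lower, upper, rvalue] list per region row
    for item in text(cur, corpora=['alice'], regions=['chapter.title']):
        print(item)
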
Example #2
import psycopg2.extras


def put_book(cur, book):
    """
    Import a book object:

    * name: The shortname of the book
    * content: The full book string, as per instructions in the corpora repository
    * An entry for each rclass, e.g. "chapter.text" (see /schema/10-rclass.sql):
      a list of tuples containing...
      - off_start: Character offset for the start of this region
      - off_end: Character offset for the end of this region, non-inclusive
      - rvalue: Optional rvalue, e.g. the chapter number (see /schema/10-rclass.sql)

    The book contents / regions will be imported into the database, and any
    "chapter.text" region will be tokenised.
    """
    rclass = rclass_id_lookup(cur)

    # Insert book / update content, get ID for other updates
    cur.execute(
        """
        SELECT book_id, token_tbl, region_tbl FROM book_import_init(%(name)s, %(content)s)
    """, dict(
            name=book['name'],
            content=book['content'],
        ))
    (book_id, token_tbl, region_tbl) = cur.fetchone()

    # Replace regions with new values
    for rclass_name in book.keys():
        if '.' not in rclass_name:
            continue  # Not an rclass
        rclass_id = rclass[rclass_name]
        psycopg2.extras.execute_values(
            cur, """
            INSERT INTO """ + region_tbl +
            """ (book_id, crange, rclass_id, rvalue) VALUES %s
        """, ((
                book_id,
                psycopg2.extras.NumericRange(off_start, off_end),
                rclass_id,
                rvalues[0] if len(rvalues) > 0 else None,
            ) for off_start, off_end, *rvalues in book[rclass_name]))

    # Tokenise each chapter text region and add it to the database
    psycopg2.extras.execute_values(
        cur, """
        INSERT INTO """ + token_tbl + """ (book_id, crange, ttype) VALUES %s
    """, ((
            book_id,
            psycopg2.extras.NumericRange(off_start, off_end),
            ttype,
        ) for ttype, off_start, off_end in types_from_string(book['content'],
                                                             offset=0)))

    # Finalise token import, let DB update metadata, indexes
    cur.execute(
        """
        SELECT * FROM book_import_finalise(%(book_id)s)
    """, dict(book_id=book_id, ))
Example #3
def corpora(cur):
    """
    Return a list of dicts containing:
    - id: corpus short name
    - title: corpus title
    - children: [{id: book id, title: book title, author: book author}, ...]
    """
    rclass = rclass_id_lookup(cur)

    cur.execute("""
        SELECT c.name c_name
             , c.title c_title
             , (SELECT b.name FROM book b WHERE b.book_id = cb.book_id) b_name
             , MAX(CASE WHEN bm.rclass_id = %(rclass_title)s THEN bm.content ELSE NULL END) AS title
             , MAX(CASE WHEN bm.rclass_id = %(rclass_author)s THEN bm.content ELSE NULL END) AS author
          FROM corpus c, corpus_book cb, book_metadata bm
         WHERE c.corpus_id = cb.corpus_id
           AND cb.book_id = bm.book_id
           AND bm.rclass_id IN (%(rclass_title)s, %(rclass_author)s)
      GROUP BY c.corpus_id, cb.book_id
      ORDER BY c.ordering, c.title, b_name
    """, dict(
        rclass_title=rclass['metadata.title'],
        rclass_author=rclass['metadata.author'],
    ))

    out = []
    author_book_count = {}
    for (c_id, c_title, b_id, b_title, b_author) in cur:
        if len(out) == 0 or out[-1]['id'] != 'corpus:%s' % c_id:
            out.append(dict(id='corpus:%s' % c_id, title=c_title, children=[]))
        out[-1]['children'].append(dict(id=b_id, title=b_title, author=b_author))

        # Add to the book count for this author
        if b_author not in author_book_count:
            author_book_count[b_author] = 0
        author_book_count[b_author] += 1

    out.append(dict(id=None, title='All books by author', children=[]))
    for author in sorted(author_book_count.keys()):
        out[-1]['children'].append(dict(
            id='author:%s' % author,
            title=author,
            author='%d books' % author_book_count[author],  # NB: Just doing this to get it into brackets, ew.
        ))

    return dict(corpora=out, aliases=OLD_ALIASES)
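
A sketch of walking the returned tree; the keys follow the docstring, and the trailing 'All books by author' group is appended by the function itself:

import psycopg2

conn = psycopg2.connect("dbname=corpus")  # hypothetical DSN
with conn.cursor() as cur:
    tree = corpora(cur)
for corpus in tree['corpora']:
    print(corpus['id'], '-', corpus['title'])
    for book in corpus['children']:
        print('   ', book['id'], book['title'], book['author'])
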
Example #4
def corpora_headlines(cur):
    """
    Return a list of dicts containing:
    - id: corpus short name
    - title: corpus title
    - book_count: Number of books in corpus
    - word_count: Number of words in corpus
    """
    rclass = rclass_id_lookup(cur)

    cur.execute("""
        SELECT c.name
             , c.title
             , COUNT(*) book_count
             , SUM((
                 SELECT SUM(word_count)
                   FROM book_word_count bwc
                  WHERE bwc.book_id = cb.book_id
                    AND rclass_id = %(ch_text)s
               ))::INT word_count
          FROM corpus c, corpus_book cb
         WHERE c.corpus_id = cb.corpus_id
      GROUP BY c.corpus_id
      ORDER BY c.ordering, c.title
    """, dict(
        ch_text=rclass['chapter.text'],
    ))

    out = []
    for (c_id, c_title, book_count, word_count) in cur:
        out.append(dict(
            id='corpus:%s' % c_id,
            title=c_title,
            book_count=book_count,
            word_count=word_count,
        ))

    return dict(data=out)
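
A sketch printing one headline row per corpus, using the dict keys documented above; the connection details are again placeholders:

import psycopg2

conn = psycopg2.connect("dbname=corpus")  # hypothetical DSN
with conn.cursor() as cur:
    for row in corpora_headlines(cur)['data']:
        print('%(id)s (%(title)s): %(book_count)d books, %(word_count)d words' % row)
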
Example #5
def subset(cur,
           corpora=['dickens'],
           subset=['all'],
           contextsize=['0'],
           metadata=[]):
    """
    Main entry function for subset search

    - corpora: List of corpora / book names
    - subset: Subset(s) to search for.
    - contextsize: Size of context window, defaults to none.
    - metadata: Array of extra metadata to provide with the result, some of:
      - 'book_titles' (return dict of book IDs to titles at end of result)
    """
    book_ids = corpora_to_book_ids(cur, corpora)
    if len(book_ids) == 0:
        raise UserError("No books to search", "error")
    contextsize = int(contextsize[0])
    metadata = set(metadata)
    book_cur = cur.connection.cursor()
    book = None
    api_subset = api_subset_lookup(cur)
    rclass_ids = tuple(api_subset[s] for s in subset)
    rclass = rclass_id_lookup(cur)

    query = """
        SELECT r.book_id
             , c.full_tokens full_tokens
             , c.is_node is_node
             , r.crange node_crange
             , c.part_of part_of
          FROM region r
          JOIN LATERAL (
              SELECT ARRAY_AGG(t_surrounding.crange ORDER BY t_surrounding.book_id, t_surrounding.ordering) full_tokens
                   , ARRAY_AGG(t_surrounding.crange <@ r.crange ORDER BY t_surrounding.book_id, t_surrounding.ordering) is_node
                   , (ARRAY_AGG(t_surrounding.part_of ORDER BY t_surrounding.book_id, t_surrounding.ordering))[1] part_of
                FROM token t_surrounding
               WHERE t_surrounding.book_id = r.book_id
                 AND t_surrounding.crange <@ range_expand(r.crange, %(contextsize)s)
               ) c ON TRUE
         WHERE r.book_id IN %(book_id)s
           AND r.rclass_id IN %(rclass_ids)s
    """
    params = dict(
        book_id=tuple(book_ids),
        contextsize=contextsize * 10,  # TODO: Bodge word -> char
        rclass_ids=rclass_ids,
    )
    cur.execute(query, params)

    try:
        for book_id, full_tokens, is_node, node_crange, part_of in cur:
            node_tokens = [
                crange for crange, include in zip(full_tokens, is_node)
                if include
            ]
            if len(node_tokens) == 0:
                continue  # Ignore empty suspensions
            if not book or book['id'] != book_id:
                book = get_book(book_cur, book_id, content=True)
            yield to_conc(
                book['content'], full_tokens, node_tokens, contextsize
            ) + [
                [book['name'], node_crange.lower, node_crange.upper],
                [
                    int(part_of.get(str(rclass['chapter.text']), -1)),
                    int(part_of.get(str(rclass['chapter.paragraph']), -1)),
                    int(part_of.get(str(rclass['chapter.sentence']), -1)),
                ],
            ]
    finally:
        # Close the secondary cursor even if the generator is abandoned
        # mid-iteration, matching the concordance example below
        book_cur.close()

    footer = get_book_metadata(cur, book_ids, metadata)
    if footer:
        yield ('footer', footer)
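
A sketch of consuming the generator; the subset name 'quote' is only a guess at what api_subset_lookup might accept, and the ('footer', ...) tuple appears only when metadata items were requested:

import psycopg2

conn = psycopg2.connect("dbname=corpus")  # hypothetical DSN
with conn.cursor() as cur:
    for line in subset(cur, corpora=['dickens'], subset=['quote'],
                       contextsize=['5'], metadata=['book_titles']):
        if isinstance(line, tuple) and line[0] == 'footer':
            print('footer:', line[1])
        else:
            print(line)  # concordance line plus [book, crange] position info
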
Example #6
def concordance(cur,
                corpora=['dickens'],
                subset=['all'],
                q=[],
                contextsize=['0'],
                metadata=[]):
    """
    Main entry function for concordance search

    - corpora: List of corpora / book names
    - subset: Subset to search within, or 'all'
    - q: Quer(ies) to search for; results will match one of the given expressions
    - contextsize: Size of context window, defaults to none.
    - metadata: Array of extra metadata to provide with the result, some of:
      - 'book_titles' (return dict of book IDs to titles at end of result)
    """
    book_ids = tuple(corpora_to_book_ids(cur, corpora))
    if len(book_ids) == 0:
        raise UserError("No books to search", "error")
    api_subset = api_subset_lookup(cur)
    rclass = rclass_id_lookup(cur)
    rclass_ids = tuple(api_subset[s] for s in subset)
    if len(rclass_ids) != 1:
        raise UserError("You must supply exactly one subset", "error")
    like_sets = [parse_query(s) for s in q]
    if len(like_sets) == 0:
        raise UserError("You must supply at least one search term", "error")
    contextsize = int(contextsize[0])
    metadata = set(metadata)
    book = None

    book_cur = cur.connection.cursor()
    try:
        for likes in like_sets:
            # Choose an "anchor". We search for this first to narrow the possible
            # outputs as much as possible, then consider the types around each.
            anchor_offset = find_anchor_offset(*likes)

            query = ""
            params = dict()
            query += """
                 SELECT t.book_id
                      , c.node_start - 1 node_start -- NB: Postgres is 1-indexed
                      , c.cranges full_tokens
                      , t.part_of
                   FROM token t
                   JOIN LATERAL ( -- i.e. for each valid anchor token, get all tokens around it, including context
                       SELECT ARRAY_POSITION(ARRAY_AGG(t_surrounding.ordering = t.ordering ORDER BY book_id, ordering), TRUE) - %(anchor_offset)s node_start
                            , ARRAY_AGG(CASE WHEN t_surrounding.ordering < (t.ordering - %(anchor_offset)s) THEN t_surrounding.ttype -- i.e. part of the context, so rclass irrelevant
                                             WHEN t_surrounding.ordering > (t.ordering - %(anchor_offset)s + %(total_likes)s - 1) THEN t_surrounding.ttype -- i.e. part of the context, so rclass irrelevant
                                             WHEN t_surrounding.part_of ? %(part_of)s THEN t_surrounding.ttype
                                             ELSE NULL -- part of the node, but not in the right rclass, NULL should fail any node checks later on
                                              END ORDER BY book_id, ordering) ttypes
                            , ARRAY_AGG(t_surrounding.crange ORDER BY book_id, ordering) cranges
                         FROM token t_surrounding
                        WHERE t_surrounding.book_id = t.book_id
                          AND t_surrounding.ordering BETWEEN t.ordering - %(anchor_offset)s - %(contextsize)s
                                             AND t.ordering - %(anchor_offset)s + (%(total_likes)s - 1) + %(contextsize)s
                   ) c ON TRUE
                 WHERE t.book_id IN %(book_ids)s
                   AND t.part_of ? %(part_of)s
            """
            params['anchor_offset'] = anchor_offset
            params['anchor_like'] = likes[anchor_offset]
            params['book_ids'] = book_ids
            params['contextsize'] = contextsize
            params['total_likes'] = len(likes)
            params['part_of'] = str(rclass_ids[0])

            for i, l in enumerate(likes):
                if i == anchor_offset:
                    # We should check the main token table for the anchor node, so
                    # postgres searches for this first
                    query += "AND t.ttype LIKE %(like_" + str(i) + ")s\n"
                else:
                    query += "AND c.ttypes[c.node_start + " + str(
                        i) + "] LIKE %(like_" + str(i) + ")s\n"
                params["like_" + str(i)] = l

            cur.execute(query, params)
            for book_id, node_start, full_tokens, part_of in cur:
                # Extract portion of tokens that are the node
                node_tokens = full_tokens[node_start:node_start + len(likes)]
                if not book or book['id'] != book_id:
                    book = get_book(book_cur, book_id, content=True)
                yield to_conc(
                    book['content'], full_tokens, node_tokens, contextsize
                ) + [
                    [book['name'], node_tokens[0].lower, node_tokens[-1].upper],
                    [
                        int(part_of.get(str(rclass['chapter.text']), -1)),
                        int(part_of.get(str(rclass['chapter.paragraph']), -1)),
                        int(part_of.get(str(rclass['chapter.sentence']), -1)),
                    ],
                ]
    finally:
        book_cur.close()

    footer = get_book_metadata(cur, book_ids, metadata)
    if footer:
        yield ('footer', footer)
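
A sketch of a single-expression search; the exact query syntax is defined by parse_query (not shown in these examples), so the plain token 'hands' is an assumption:

import psycopg2

conn = psycopg2.connect("dbname=corpus")  # hypothetical DSN
with conn.cursor() as cur:
    results = concordance(cur, corpora=['dickens'], subset=['all'],
                          q=['hands'],  # assumed query; syntax is parse_query's
                          contextsize=['3'], metadata=['book_titles'])
    for line in results:
        if isinstance(line, tuple) and line[0] == 'footer':
            print('footer:', line[1])
        else:
            print(line)
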
Example #7
def get_book_metadata(cur, book_ids, metadata):
    """
    Generate dict of metadata that should go in footer of both concordance and subsets

    - book_ids: Array of book IDs to include
    - metadata: Metadata items to include, a set containing some of...
      - 'book_titles': The title / author of each book
      - 'chapter_start': The start character for all chapters, and end of book
      - 'word_count_(subset)': Count of words within (subset)
    """
    def p_params(*args):
        return ("?, " * sum(len(x) for x in args)).rstrip(', ')

    rclass = rclass_id_lookup(cur)

    out = {}
    for k in metadata:
        out[k] = {}

        if k == 'book_titles':
            cur.execute(
                """
                SELECT b.name
                     , bm.rclass_id
                     , bm.content
                  FROM book b, book_metadata bm
                 WHERE b.book_id = bm.book_id
                   AND b.book_id IN %s
                   AND bm.rclass_id IN %s
            """, (
                    tuple(book_ids),
                    (rclass['metadata.title'], rclass['metadata.author']),
                ))
            for (book_name, rclass_id, content) in cur:
                if book_name not in out[k]:
                    out[k][book_name] = [None, None]
                idx = 0 if rclass_id == rclass['metadata.title'] else 1
                out[k][book_name][idx] = content

        elif k == 'chapter_start':
            cur.execute(
                """
                SELECT b.name
                     , r.rvalue as chapter_num
                     , r.crange crange
                  FROM book b, region r
                 WHERE b.book_id = r.book_id
                   AND r.rclass_id = %s
                   AND b.book_id IN %s
            """, (
                    rclass['chapter.text'],
                    tuple(book_ids),
                ))
            for (book_name, chapter_num, crange) in cur:
                if book_name not in out[k]:
                    out[k][book_name] = dict()
                out[k][book_name][chapter_num] = crange.lower
                out[k][book_name]['_end'] = max(
                    out[k][book_name].get('_end', 0), crange.upper)

        elif k == 'word_count_chapter':
            cur.execute(
                """
                SELECT b.name
                     , bwc.rvalue as chapter_num
                     , bwc.word_count
                  FROM book b, book_word_count bwc
                 WHERE b.book_id = bwc.book_id
                   AND bwc.rclass_id = %s
                   AND b.book_id IN %s
              ORDER BY bwc.book_id, bwc.rvalue
            """, (
                    rclass['chapter.text'],
                    tuple(book_ids),
                ))
            for (book_name, chapter_num, word_total) in cur:
                if book_name not in out[k]:
                    out[k][book_name] = dict(_end=0)
                out[k][book_name][chapter_num] = out[k][book_name]['_end']
                out[k][book_name]['_end'] += int(word_total)

        elif k.startswith('word_count_'):
            api_subset = api_subset_lookup(cur)
            cur.execute(
                """
                SELECT b.name
                     , SUM(bwc.word_count) AS word_count
                  FROM book b, book_word_count bwc
                 WHERE b.book_id = bwc.book_id
                   AND bwc.rclass_id = %s
                   AND b.book_id IN %s
              GROUP BY b.book_id
            """, (
                    api_subset[k.replace('word_count_', '')],
                    tuple(book_ids),
                ))
            for (book_name, word_count) in cur:
                out[k][book_name] = int(word_count)

        else:
            raise ValueError("Unknown metadata item %s" % k)

    return out
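
A sketch tying this to corpora_to_book_ids from the earlier examples; both metadata item names come from the docstring above:

import psycopg2

conn = psycopg2.connect("dbname=corpus")  # hypothetical DSN
with conn.cursor() as cur:
    book_ids = corpora_to_book_ids(cur, ['dickens'])
    meta = get_book_metadata(cur, book_ids, {'book_titles', 'chapter_start'})
print(meta['book_titles'])    # {book_name: [title, author], ...}
print(meta['chapter_start'])  # {book_name: {chapter_num: start, '_end': ...}, ...}
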