Esempio n. 1
0
 def test_nonode(self):
     """An empty node-token list is not supported (yet) and must raise"""
     full_tokens = [R_A, R_MAN, R_WALKED, R_INTO, R_A2, R_BAR]
     no_node_tokens = []
     with self.assertRaises(NotImplementedError):
         to_conc(FULL_TEXT, full_tokens, no_node_tokens, 2)
Esempio n. 2
0
 def test_emptycontextsize(self):
     """With a context size of 0 only the node line comes back, no windows"""
     actual = to_conc(
         FULL_TEXT,
         [R_MAN, R_WALKED, R_INTO],
         [R_MAN, R_WALKED, R_INTO],
         0,
     )
     # A single line: the node strings followed by the list of word indices
     expected = [
         ['man', ' ', 'walked', ' ', 'into', [0, 2, 4]],
     ]
     self.assertEqual(actual, expected)
Esempio n. 3
0
 def test_windowsplit(self):
     """Left / right windows are split off at the nearest space"""
     # Node in the middle of the token list, context size 2
     mid_actual = to_conc(
         FULL_TEXT,
         [R_A, R_MAN, R_WALKED, R_INTO, R_A2, R_BAR],
         [R_MAN, R_WALKED, R_INTO],
         2,
     )
     mid_expected = [
         ['A', ' ', [0]],
         ['man', ' ', 'walked', ' ', 'into', [0, 2, 4]],
         [' ', 'a', ' ', 'bar', [1, 3]],
     ]
     self.assertEqual(mid_actual, mid_expected)

     # Node surrounded by punctuation: the punctuation stays with the node
     # line, the adjoining spaces go to the windows
     quote_actual = to_conc(
         FULL_TEXT,
         [R_A2, R_BAR, R_OUCH, R_HE, R_SAID],
         [R_OUCH],
         2,
     )
     quote_expected = [
         ['a', ' ', 'bar', '.', ' ', [0, 2]],
         ['"', 'Ouch!', '",', [1]],
         [' ', 'he', ' ', 'said', [1, 3]],
     ]
     self.assertEqual(quote_actual, quote_expected)
Esempio n. 4
0
def subset(cur,
           corpora=['dickens'],
           subset=['all'],
           contextsize=['0'],
           metadata=[]):
    """
    Main entry function for subset search

    This is a generator: nothing is queried until it is first iterated.

    - cur: Database cursor used for the main query; a second cursor is
      opened on the same connection to fetch book contents.
    - corpora: List of corpora / book names
    - subset: Subset(s) to search for.
    - contextsize: Size of context window, defaults to none. Given as a
      list whose first item is converted with int().
    - metadata, Array of extra metadata to provide with result, some of
      - 'book_titles' (return dict of book IDs to titles at end of result)

    Yields one list per matching region: the output of to_conc() followed
    by ``[book name, node start, node end]`` and
    ``[chapter.text, chapter.paragraph, chapter.sentence]`` (-1 for any
    of those missing from the row's part_of mapping). If
    get_book_metadata() returns anything truthy, a final
    ``('footer', footer)`` tuple is yielded.

    Raises UserError if the corpora resolve to no books.
    """
    book_ids = corpora_to_book_ids(cur, corpora)
    if len(book_ids) == 0:
        raise UserError("No books to search", "error")
    contextsize = int(contextsize[0])
    metadata = set(metadata)
    api_subset = api_subset_lookup(cur)
    rclass_ids = tuple(api_subset[s] for s in subset)
    rclass = rclass_id_lookup(cur)

    query = """
        SELECT r.book_id
             , c.full_tokens full_tokens
             , c.is_node is_node
             , r.crange node_crange
             , c.part_of part_of
          FROM region r
          JOIN LATERAL (
              SELECT ARRAY_AGG(t_surrounding.crange ORDER BY t_surrounding.book_id, t_surrounding.ordering) full_tokens
                   , ARRAY_AGG(t_surrounding.crange <@ r.crange ORDER BY t_surrounding.book_id, t_surrounding.ordering) is_node
                   , (ARRAY_AGG(t_surrounding.part_of ORDER BY t_surrounding.book_id, t_surrounding.ordering))[1] part_of
                FROM token t_surrounding
               WHERE t_surrounding.book_id = r.book_id
                 AND t_surrounding.crange <@ range_expand(r.crange, %(contextsize)s)
               ) c ON TRUE
          WHERE r.book_id IN %(book_id)s
           AND r.rclass_id IN %(rclass_ids)s
    """
    params = dict(
        book_id=tuple(book_ids),
        contextsize=contextsize * 10,  # TODO: Bodge word -> char
        rclass_ids=rclass_ids,
    )
    cur.execute(query, params)

    # Most recently fetched book, re-used while consecutive rows share a book
    book = None
    book_cur = cur.connection.cursor()
    try:
        for book_id, full_tokens, is_node, node_crange, part_of in cur:
            # ARRAY_AGG over zero rows is NULL, which arrives here as None;
            # treat that the same as "no tokens" instead of failing in zip()
            node_tokens = [
                crange
                for crange, include in zip(full_tokens or (), is_node or ())
                if include
            ]
            if not node_tokens:
                continue  # Ignore empty suspensions
            if not book or book['id'] != book_id:
                book = get_book(book_cur, book_id, content=True)
            yield to_conc(
                book['content'], full_tokens, node_tokens, contextsize) + [
                    [book['name'], node_crange.lower, node_crange.upper],
                    [
                        int(part_of.get(str(rclass['chapter.text']), -1)),
                        int(part_of.get(str(rclass['chapter.paragraph']), -1)),
                        int(part_of.get(str(rclass['chapter.sentence']), -1)),
                    ]
                ]
    finally:
        # Runs on errors and when the consumer closes the generator early,
        # so the secondary cursor is never left open
        book_cur.close()

    footer = get_book_metadata(cur, book_ids, metadata)
    if footer:
        yield ('footer', footer)