def test_insert_static_data_4(self):
    """
    Check insert_static_data() when "coquery_query_token" is selected.

    For a query made of three plain items, the resulting data frame must
    contain one "coquery_query_token_N" column per item, and every row of
    such a column must hold the corresponding query item string.
    """
    query_string = "item1 item2 item3"
    token_query = TokenQuery(query_string, self.session)
    token_query._query_id = 999
    options.cfg.selected_features = ["coquery_query_token"]

    result = token_query.insert_static_data(self.df)

    expected_columns = [
        "coq_word_label_1",
        "coq_word_label_2",
        "coq_word_label_3",
        "coquery_invisible_corpus_id",
        "coquery_dummy",
        "coquery_invisible_query_id",
        "coquery_query_token_1",
        "coquery_query_token_2",
        "coquery_query_token_3",
    ]
    # Column order is not part of the contract here, hence the sorting.
    self.assertListEqual(sorted(result.columns.tolist()),
                         sorted(expected_columns))

    # Each token column repeats its query item once per row.
    for column, item in (
            ("coquery_query_token_1", "item1"),
            ("coquery_query_token_2", "item2"),
            ("coquery_query_token_3", "item3")):
        self.assertListEqual(getattr(result, column).tolist(),
                             [item] * len(result))
def test_insert_static_data_2(self):
    """
    Check insert_static_data() for a query with quantified items.

    The returned data frame is expected to have exactly the listed columns,
    in exactly this order.
    """
    # NOTE(review): a second method with this same name is defined right
    # after this one; if both live in the same class only the later one is
    # kept -- confirm and deduplicate.
    token_query = TokenQuery("item1{0,3} item2{1,3} item3{2,3}",
                             self.session)
    token_query._query_id = 999

    result = token_query.insert_static_data(self.df)

    expected_columns = [
        "coq_word_label_1",
        "coq_word_label_2",
        "coq_word_label_3",
        "coquery_invisible_corpus_id",
        "coquery_dummy",
        "coquery_invisible_query_id",
    ]
    self.assertListEqual(result.columns.tolist(), expected_columns)
def test_insert_static_data_2(self):
    """
    Verify the column layout produced by insert_static_data().

    A query with three quantified items is run against self.df; the
    columns of the result must equal the expected list, order included.
    """
    # NOTE(review): a method with this same name is also defined directly
    # before this one; if both belong to one class, this later definition
    # replaces the earlier one -- confirm and deduplicate.
    quantified = "item1{0,3} item2{1,3} item3{2,3}"
    query = TokenQuery(quantified, self.session)
    query._query_id = 999

    actual_columns = query.insert_static_data(self.df).columns.tolist()

    self.assertListEqual(
        actual_columns,
        [
            "coq_word_label_1",
            "coq_word_label_2",
            "coq_word_label_3",
            "coquery_invisible_corpus_id",
            "coquery_dummy",
            "coquery_invisible_query_id",
        ])
def test_quantified_required_columns(self):
    """
    Check get_required_columns() for a query with an optional item.

    The query "happy to{0,1} [n*]" is expected to expand into two
    sub-queries. For each of them, the required columns are requested for
    the "word_label" feature and for the linked "word_data" feature, and
    compared to the expected SQL column expressions.
    """
    ext_feature = "{}.word_data".format(self.link.get_hash())
    features = ["word_label", ext_feature]
    query = TokenQuery("happy to{0,1} [n*]", self.Session)

    # assertEqual reports the actual length on failure, unlike
    # assertTrue(len(...) == 2).
    self.assertEqual(len(query.query_list), 2)
    without_to, with_to = query.query_list

    # The return value was never inspected by this test; the call itself
    # is kept in case it has side effects on the resource -- TODO confirm
    # and drop it if get_corpus_joins() is a pure function.
    self.resource.get_corpus_joins(without_to)

    # Token positions; in the first sub-query position 2 is expected to be
    # NULL:
    #     1    2    3
    # happy {to} [n*]
    columns = self.resource.get_required_columns(without_to, features)
    self.assertListEqual(
        columns,
        ["COQ_WORD_1.Word AS coq_word_label_1",
         "NULL AS coq_word_label_2",
         "COQ_WORD_3.Word AS coq_word_label_3",
         "EXTCORP_LEXICON_1.ExtData AS db_extcorp_coq_word_data_1",
         "NULL AS db_extcorp_coq_word_data_2",
         "EXTCORP_LEXICON_3.ExtData AS db_extcorp_coq_word_data_3",
         "COQ_CORPUS_1.ID AS coquery_invisible_corpus_id",
         "COQ_CORPUS_1.FileId AS coquery_invisible_origin_id"])

    columns = self.resource.get_required_columns(with_to, features)
    self.assertListEqual(
        columns,
        ["COQ_WORD_1.Word AS coq_word_label_1",
         "COQ_WORD_2.Word AS coq_word_label_2",
         "COQ_WORD_3.Word AS coq_word_label_3",
         "EXTCORP_LEXICON_1.ExtData AS db_extcorp_coq_word_data_1",
         "EXTCORP_LEXICON_2.ExtData AS db_extcorp_coq_word_data_2",
         "EXTCORP_LEXICON_3.ExtData AS db_extcorp_coq_word_data_3",
         "COQ_CORPUS_1.ID AS coquery_invisible_corpus_id",
         "COQ_CORPUS_1.FileId AS coquery_invisible_origin_id"])
def test_get_required_columns_4(self):
    """
    Check the required columns for the "lemma_label" feature.

    Uses the single-item query "*" and compares the list returned by
    get_required_columns() with the expected SQL column expressions.
    """
    token_query = TokenQuery("*", self.Session)
    first_query = token_query.query_list[0]

    columns = self.resource.get_required_columns(first_query,
                                                 ["lemma_label"])

    expected = [
        "COQ_LEMMA_1.Lemma AS coq_lemma_label_1",
        "COQ_CORPUS_1.ID AS coquery_invisible_corpus_id",
        "COQ_CORPUS_1.FileId AS coquery_invisible_origin_id",
    ]
    self.assertListEqual(columns, expected)
def test_get_required_columns_1(self):
    """
    Check the required columns for the "word_label" feature.

    Uses the single-item query "*" and compares the list returned by
    get_required_columns() with the expected SQL column expressions.
    """
    token_query = TokenQuery("*", self.Session)
    first_query = token_query.query_list[0]

    columns = self.resource.get_required_columns(first_query,
                                                 ["word_label"])

    expected = [
        "COQ_WORD_1.Word AS coq_word_label_1",
        "COQ_CORPUS_1.ID AS coquery_invisible_corpus_id",
        "COQ_CORPUS_1.FileId AS coquery_invisible_origin_id",
    ]
    self.assertListEqual(columns, expected)
def test_where_conditions_2(self):
    """
    Check the condition list for a query item containing an apostrophe.

    For the query "*'ll", the single expected condition is a LIKE clause
    in which the apostrophe appears doubled.
    """
    token_query = TokenQuery("*'ll", self.Session)
    first_query = token_query.query_list[0]

    joins = self.resource.get_corpus_joins(first_query)
    conditions = self.resource.get_condition_list(first_query,
                                                  joins,
                                                  ["word_label"])

    self.assertListEqual(conditions,
                         ["(COQ_WORD_1.Word LIKE '%''ll')"])
def test_insert_static_data_3(self):
    """
    Check insert_static_data() when "coquery_query_string" is selected.

    The resulting data frame must contain a "coquery_query_string" column
    in addition to the other expected columns, and every row of that
    column must hold the complete query string.
    """
    query_string = "item1 item2 item3"
    token_query = TokenQuery(query_string, self.session)
    token_query._query_id = 999
    options.cfg.selected_features = ["word_label", "coquery_query_string"]

    result = token_query.insert_static_data(self.df)

    expected_columns = [
        "coq_word_label_1",
        "coq_word_label_2",
        "coq_word_label_3",
        "coquery_invisible_corpus_id",
        "coquery_dummy",
        "coquery_invisible_query_id",
        "coquery_query_string",
    ]
    # Column order is not checked, only the set of column names.
    self.assertListEqual(sorted(result.columns.tolist()),
                         sorted(expected_columns))

    stored_strings = result.coquery_query_string.tolist()
    self.assertListEqual(stored_strings, [query_string] * len(result))
def test_linked_required_columns(self):
    """
    Check the required columns for a feature addressed through a link.

    The feature name is built from the hash of self.link followed by
    ".word_data"; the single-item query "*" is used.
    """
    token_query = TokenQuery("*", self.Session)
    linked_feature = "{}.word_data".format(self.link.get_hash())

    columns = self.resource.get_required_columns(
        token_query.query_list[0], [linked_feature])

    expected = [
        "EXTCORP_LEXICON_1.ExtData AS db_extcorp_coq_word_data_1",
        "COQ_CORPUS_1.ID AS coquery_invisible_corpus_id",
        "COQ_CORPUS_1.FileId AS coquery_invisible_origin_id",
    ]
    self.assertListEqual(columns, expected)
def test_get_required_columns_NULL_1(self):
    """
    Check the required columns for the query "_NULL *" (issue #256).

    The first word label is expected to be selected as NULL, and the
    corpus columns are expected to refer to the second token position.
    """
    # tests issue #256
    token_query = TokenQuery("_NULL *", self.Session)
    first_query = token_query.query_list[0]

    columns = self.resource.get_required_columns(first_query,
                                                 ["word_label"])

    expected = [
        "NULL AS coq_word_label_1",
        "COQ_WORD_2.Word AS coq_word_label_2",
        "COQ_CORPUS_2.ID AS coquery_invisible_corpus_id",
        "COQ_CORPUS_2.FileId AS coquery_invisible_origin_id",
    ]
    self.assertListEqual(columns, expected)
def test_corpus_joins_optimized_order_1(self):
    """
    Check the join order for a three-item query of mixed complexity.

    For "* *ier [n*]", the joins are expected to start from the second
    token position, followed by the third and then the first.
    """
    token_query = TokenQuery("* *ier [n*]", self.Session)
    joins = self.resource.get_corpus_joins(token_query.query_list[0])

    # Show the complete diff if the comparison below fails.
    self.maxDiff = None

    expected = [
        "FROM Corpus AS COQ_CORPUS_2",
        "INNER JOIN Corpus AS COQ_CORPUS_3 ON COQ_CORPUS_3.ID = COQ_CORPUS_2.ID + 1",
        "INNER JOIN Corpus AS COQ_CORPUS_1 ON COQ_CORPUS_1.ID = COQ_CORPUS_2.ID - 1",
    ]
    self.assertListEqual(joins, expected)
def test_get_required_columns_3(self):
    """
    Check the required columns for several features on a two-item query.

    The features are requested in the order "source_label", "word_label",
    "word_pos"; the expected list has the word labels first, then the
    POS columns, then the source label, then the corpus columns.
    """
    token_query = TokenQuery("* *", self.Session)
    requested = ["source_label", "word_label", "word_pos"]

    columns = self.resource.get_required_columns(
        token_query.query_list[0], requested)

    expected = [
        "COQ_WORD_1.Word AS coq_word_label_1",
        "COQ_WORD_2.Word AS coq_word_label_2",
        "COQ_WORD_1.POS AS coq_word_pos_1",
        "COQ_WORD_2.POS AS coq_word_pos_2",
        "COQ_SOURCE_1.Title AS coq_source_label_1",
        "COQ_CORPUS_1.ID AS coquery_invisible_corpus_id",
        "COQ_CORPUS_1.FileId AS coquery_invisible_origin_id",
    ]
    self.assertListEqual(columns, expected)
def test_get_token_numbering_2(self):
    """
    Check get_token_numbering() for a query with quantified items.

    Each of the three items may span up to three positions, so the nine
    positions 0..8 are expected to be labelled "<item>.<repetition>".
    """
    token_query = TokenQuery("item1{0,3} item2{1,3} item3{2,3}",
                             self.session)

    expected_labels = [
        "1.1", "1.2", "1.3",
        "2.1", "2.2", "2.3",
        "3.1", "3.2", "3.3",
    ]
    for position, label in enumerate(expected_labels):
        self.assertEqual(token_query.get_token_numbering(position), label)
def test_where_conditions_quantified(self):
    """
    Check the condition list for the first expansion of a quantified query.

    Only the positions 1, 3 and 8 are expected to contribute a condition
    for the first entry of the query list.
    """
    query_string = "more * than [dt]{0,1} [jj]{0,3} [nn*]{1,2}"
    # Token positions in the first entry of the query list:
    #    1 2    3      4                5 6 7           8 9
    # more * than {NONE} {NONE NONE NONE} {[nn*] NONE}
    token_query = TokenQuery(query_string, self.Session)
    first_query = token_query.query_list[0]

    joins = self.resource.get_corpus_joins(first_query)
    conditions = self.resource.get_condition_list(first_query,
                                                  joins,
                                                  ["word_label"])

    expected = [
        "(COQ_WORD_1.Word = 'more')",
        "(COQ_WORD_3.Word = 'than')",
        "(COQ_WORD_8.POS LIKE 'nn%')",
    ]
    self.assertListEqual(conditions, expected)
def test_query_string_apostrophe(self):
    """
    Check the full SQL string for a query item containing an apostrophe.

    Both the generated string and the target string are passed through
    self.simple() before they are compared.
    """
    token_query = TokenQuery("*'ll", self.Session)
    generated_sql = self.resource.get_query_string(
        token_query.query_list[0], ["word_label"])

    # Adjacent literals are joined into a single string by the parser.
    expected_sql = (
        " SELECT COQ_WORD_1.Word AS coq_word_label_1,"
        " COQ_CORPUS_1.ID AS coquery_invisible_corpus_id,"
        " COQ_CORPUS_1.FileId AS coquery_invisible_origin_id"
        " FROM Corpus AS COQ_CORPUS_1"
        " INNER JOIN Lexicon AS COQ_WORD_1"
        " ON COQ_WORD_1.WordId = COQ_CORPUS_1.WordId"
        " WHERE (COQ_WORD_1.Word LIKE '%''ll')"
    )

    self.assertEqual(self.simple(generated_sql),
                     self.simple(expected_sql))
def test_quantified_query_string_1(self):
    """
    Check the corpus joins for both expansions of "* b*{1,2} *".

    The query is expected to expand into two sub-queries. In the first
    one, position 3 does not appear in the joins and the last item is
    joined at offset +1; in the second one, position 3 is joined at
    offset +1 and the last item at offset +2.
    """
    query = TokenQuery("* b*{1,2} *", self.Session)

    # assertEqual reports the actual length on failure, unlike
    # assertTrue(len(...) == 2).
    self.assertEqual(len(query.query_list), 2)

    joins = self.resource.get_corpus_joins(query.query_list[0])
    self.assertListEqual(
        joins,
        ["FROM Corpus AS COQ_CORPUS_2",
         "INNER JOIN Corpus AS COQ_CORPUS_1 ON COQ_CORPUS_1.ID = COQ_CORPUS_2.ID - 1",
         "INNER JOIN Corpus AS COQ_CORPUS_4 ON COQ_CORPUS_4.ID = COQ_CORPUS_2.ID + 1"])

    joins = self.resource.get_corpus_joins(query.query_list[1])
    self.assertListEqual(
        joins,
        ["FROM Corpus AS COQ_CORPUS_2",
         "INNER JOIN Corpus AS COQ_CORPUS_3 ON COQ_CORPUS_3.ID = COQ_CORPUS_2.ID + 1",
         "INNER JOIN Corpus AS COQ_CORPUS_1 ON COQ_CORPUS_1.ID = COQ_CORPUS_2.ID - 1",
         "INNER JOIN Corpus AS COQ_CORPUS_4 ON COQ_CORPUS_4.ID = COQ_CORPUS_2.ID + 2"])
def test_query_string_ortho_or_with_pos(self):
    """
    Check the SQL string for alternative word forms with a POS restriction.

    For "a*|b*.[n*]", the WHERE clause is expected to combine the two
    word alternatives with OR and to add the POS condition with AND.
    Both strings are passed through self.simple() before the comparison.
    """
    token_query = TokenQuery("a*|b*.[n*]", self.Session)
    generated_sql = self.resource.get_query_string(
        token_query.query_list[0], ["word_label"])

    # Adjacent literals are joined into a single string by the parser.
    expected_sql = (
        " SELECT COQ_WORD_1.Word AS coq_word_label_1,"
        " COQ_CORPUS_1.ID AS coquery_invisible_corpus_id,"
        " COQ_CORPUS_1.FileId AS coquery_invisible_origin_id"
        " FROM Corpus AS COQ_CORPUS_1"
        " INNER JOIN Lexicon AS COQ_WORD_1"
        " ON COQ_WORD_1.WordId = COQ_CORPUS_1.WordId"
        " WHERE (COQ_WORD_1.Word LIKE 'a%' OR COQ_WORD_1.Word LIKE 'b%')"
        " AND (COQ_WORD_1.POS LIKE 'n%')"
    )

    self.assertEqual(self.simple(generated_sql),
                     self.simple(expected_sql))
def test_query_string_NULL_1(self):
    """
    Check the full SQL string for the query "_NULL *" (issue #256).

    The first word label is expected to be selected as NULL, all joins
    are expected to refer to the second token position, and no WHERE
    clause is expected. Both strings are passed through self.simple()
    before the comparison.
    """
    # tests issue #256
    token_query = TokenQuery("_NULL *", self.Session)
    generated_sql = self.resource.get_query_string(
        token_query.query_list[0], ["word_label", "source_label"])

    # Adjacent literals are joined into a single string by the parser.
    expected_sql = (
        " SELECT NULL AS coq_word_label_1,"
        " COQ_WORD_2.Word AS coq_word_label_2,"
        " COQ_SOURCE_2.Title AS coq_source_label_1,"
        " COQ_CORPUS_2.ID AS coquery_invisible_corpus_id,"
        " COQ_CORPUS_2.FileId AS coquery_invisible_origin_id"
        " FROM Corpus AS COQ_CORPUS_2"
        " INNER JOIN Files AS COQ_SOURCE_2"
        " ON COQ_SOURCE_2.FileId = COQ_CORPUS_2.FileId"
        " INNER JOIN Lexicon AS COQ_WORD_2"
        " ON COQ_WORD_2.WordId = COQ_CORPUS_2.WordId"
    )

    self.assertEqual(self.simple(generated_sql),
                     self.simple(expected_sql))
def test_get_required_columns_quantified(self):
    """
    Check get_required_columns() for a query with several quantifiers.

    The query is expected to expand into 16 sub-queries. For the first
    one, the "word_label" columns of the positions 4-7 and 9 are expected
    to be selected as NULL.
    """
    s = "more * than [dt]{0,1} [jj]{0,3} [nn*]{1,2}"
    query = TokenQuery(s, self.Session)

    # assertEqual reports the actual length on failure, unlike
    # assertTrue(len(...) == 16).
    self.assertEqual(len(query.query_list), 16)
    first_query = query.query_list[0]

    # The return value was never inspected by this test; the call itself
    # is kept in case it has side effects on the resource -- TODO confirm
    # and drop it if get_corpus_joins() is a pure function.
    self.resource.get_corpus_joins(first_query)

    # Token positions in the first entry of the query list:
    #    1 2    3      4                5 6 7           8 9
    # more * than {NONE} {NONE NONE NONE} {[nn*] NONE}
    columns = self.resource.get_required_columns(first_query,
                                                 ["word_label"])
    self.assertListEqual(
        columns,
        ["COQ_WORD_1.Word AS coq_word_label_1",
         "COQ_WORD_2.Word AS coq_word_label_2",
         "COQ_WORD_3.Word AS coq_word_label_3",
         "NULL AS coq_word_label_4",
         "NULL AS coq_word_label_5",
         "NULL AS coq_word_label_6",
         "NULL AS coq_word_label_7",
         "COQ_WORD_8.Word AS coq_word_label_8",
         "NULL AS coq_word_label_9",
         "COQ_CORPUS_1.ID AS coquery_invisible_corpus_id",
         "COQ_CORPUS_1.FileId AS coquery_invisible_origin_id"])
def test_query_string_two_items(self):
    """
    Check the full SQL string for the two-item query "a* b*".

    The expected statement joins a second corpus position at offset +1
    and has one LIKE condition per item. Both strings are passed through
    self.simple() before the comparison.
    """
    token_query = TokenQuery("a* b*", self.Session)
    generated_sql = self.resource.get_query_string(
        token_query.query_list[0], ["word_label"])

    # Adjacent literals are joined into a single string by the parser.
    expected_sql = (
        " SELECT COQ_WORD_1.Word AS coq_word_label_1,"
        " COQ_WORD_2.Word AS coq_word_label_2,"
        " COQ_CORPUS_1.ID AS coquery_invisible_corpus_id,"
        " COQ_CORPUS_1.FileId AS coquery_invisible_origin_id"
        " FROM Corpus AS COQ_CORPUS_1"
        " INNER JOIN Corpus AS COQ_CORPUS_2"
        " ON COQ_CORPUS_2.ID = COQ_CORPUS_1.ID + 1"
        " INNER JOIN Lexicon AS COQ_WORD_1"
        " ON COQ_WORD_1.WordId = COQ_CORPUS_1.WordId"
        " INNER JOIN Lexicon AS COQ_WORD_2"
        " ON COQ_WORD_2.WordId = COQ_CORPUS_2.WordId"
        " WHERE (COQ_WORD_1.Word LIKE 'a%')"
        " AND (COQ_WORD_2.Word LIKE 'b%')"
    )

    self.assertEqual(self.simple(generated_sql),
                     self.simple(expected_sql))
def test_max_tokens_2(self):
    """
    Check get_max_tokens() for a query with quantified items.

    Each of the three items has an upper bound of three repetitions, and
    the expected maximum is 9.
    """
    quantified = "item1{0,3} item2{1,3} item3{2,3}"
    token_query = TokenQuery(quantified, self.session)

    max_tokens = token_query.get_max_tokens()

    self.assertEqual(max_tokens, 9)
def test_corpus_joins_one_item(self):
    """
    Check the corpus joins for the single-item query "*".

    Only the FROM clause for the first corpus position is expected.
    """
    token_query = TokenQuery("*", self.Session)

    joins = self.resource.get_corpus_joins(token_query.query_list[0])

    self.assertListEqual(joins, ["FROM Corpus AS COQ_CORPUS_1"])
def test_get_token_numbering_1(self):
    """
    Check get_token_numbering() for a query without quantifiers.

    The positions 0, 1 and 2 are expected to be labelled "1", "2" and "3".
    """
    token_query = TokenQuery("item1 item2 item3", self.session)

    for position, label in enumerate(["1", "2", "3"]):
        self.assertEqual(token_query.get_token_numbering(position), label)
def test_lemmatized_corpus_joins_1(self):
    """
    Check the corpus joins for the single-item query "#abc.[n*]".

    Only the FROM clause for the first corpus position is expected.
    """
    query_string = "#abc.[n*]"
    token_query = TokenQuery(query_string, self.Session)

    joins = self.resource.get_corpus_joins(token_query.query_list[0])

    self.assertListEqual(joins, ["FROM Corpus AS COQ_CORPUS_1"])
def test_max_tokens_1(self):
    """
    Check get_max_tokens() for a query of three plain items.

    The expected maximum is 3.
    """
    token_query = TokenQuery("item1 item2 item3", self.session)

    max_tokens = token_query.get_max_tokens()

    self.assertEqual(max_tokens, 3)
def test_corpus_joins_three_items(self):
    """
    Check the corpus joins for the three-item query "* * *".

    The joins are expected to start from the first corpus position, with
    the second and third positions joined at the offsets +1 and +2.
    """
    token_query = TokenQuery("* * *", self.Session)

    joins = self.resource.get_corpus_joins(token_query.query_list[0])

    expected = [
        "FROM Corpus AS COQ_CORPUS_1",
        "INNER JOIN Corpus AS COQ_CORPUS_2 ON COQ_CORPUS_2.ID = COQ_CORPUS_1.ID + 1",
        "INNER JOIN Corpus AS COQ_CORPUS_3 ON COQ_CORPUS_3.ID = COQ_CORPUS_1.ID + 2",
    ]
    self.assertListEqual(joins, expected)