Ejemplo n.º 1
0
    def test_core(self):
        data_type = namedtuple('data_type', ['doc_id', 'field1', 'field2'])
        mock_file = StringFile('''
123\tsome field\tanother field
123\t  repeated  entry \tshouldn't filter

456\tanother query\tsomething
'''.lstrip())
        expected_results = [
            data_type('123', 'some field', 'another field'),
            data_type('123', '  repeated  entry ', 'shouldn\'t filter'),
            data_type('456', 'another query', 'something'),
        ]

        queries = TsvQueries(mock_file, data_type)
        self.assertEqual(queries.queries_path(), 'MOCK')
        self.assertEqual(list(queries.queries_iter()), expected_results)

        docs = TsvDocs(mock_file, data_type)
        self.assertEqual(docs.docs_path(), 'MOCK')
        self.assertEqual(list(docs.docs_iter()), expected_results)

        docpairs = TsvDocPairs(mock_file, data_type)
        self.assertEqual(docpairs.docpairs_path(), 'MOCK')
        self.assertEqual(list(docpairs.docpairs_iter()), expected_results)
Ejemplo n.º 2
0
    def test_flex_columns(self):
        class data_type(NamedTuple):
            doc_id: str
            field1: str
            field2: Tuple[str, ...]

        mock_file = StringFile('''
123\tsome field\tanother field
123\ttoo few fields

456\tanother query\tsomething
456\tanother query\tsomething\ttoo many fields\teven more
'''.strip())

        expected_results = [
            data_type('123', 'some field', ('another field', )),
            data_type('123', 'too few fields', ()),
            data_type('456', 'another query', ('something', )),
            data_type('456', 'another query',
                      ('something', 'too many fields', 'even more')),
        ]

        queries = TsvQueries(mock_file, data_type)
        self.assertEqual(queries.queries_path(), 'MOCK')
        self.assertEqual(list(queries.queries_iter()), expected_results)

        docs = TsvDocs(mock_file, data_type)
        self.assertEqual(docs.docs_path(), 'MOCK')
        self.assertEqual(list(docs.docs_iter()), expected_results)

        docpairs = TsvDocPairs(mock_file, data_type)
        self.assertEqual(docpairs.docpairs_path(), 'MOCK')
        self.assertEqual(list(docpairs.docpairs_iter()), expected_results)
Ejemplo n.º 3
0
    def test_docs(self):
        mock_file = StringFile('''
<DOC>
<DOCNO>  D100A   </DOCNO>
<PARENT> Something </PARENT>
<HT> Some text  </HT>

<HEADLINE>
<AU>   Header Text </AU>
Daily Report 

</HEADLINE>

<TEXT>
Main body text
on multiple lines

with <F P=102> some markup
</F> here. Also, some invalid <T> markup &amp;. 
</TEXT>

</DOC>

<DOC>
<DOCNO>  101   </DOCNO>

<TEXT>
More body text
</TEXT>

</DOC>
'''.lstrip())
        expected_results = [
            TrecDoc(
                doc_id='D100A',
                text=
                '\n\n   Header Text \nDaily Report \n\n\n\nMain body text\non multiple lines\n\nwith  some markup\n here. Also, some invalid  markup &. \n\n',
                marked_up_doc=
                '<HEADLINE>\n<AU>   Header Text </AU>\nDaily Report \n\n</HEADLINE>\n<TEXT>\nMain body text\non multiple lines\n\nwith <F P=102> some markup\n</F> here. Also, some invalid <T> markup &amp;. \n</TEXT>\n'
            ),
            TrecDoc(doc_id='101',
                    text='\n\nMore body text\n\n',
                    marked_up_doc='<TEXT>\nMore body text\n</TEXT>\n'),
        ]

        docs = TrecDocs(mock_file)
        self.assertEqual(docs.docs_path(), 'MOCK')
        self.assertEqual(list(docs.docs_iter()), expected_results)
Ejemplo n.º 4
0
    def test_qrels_bad_line(self):
        mock_file = StringFile('''
Q0 0 D1 3
Q0 1 D2   2

Q0 0\tD3 3
Q0 1 D2 1
BAD LINE
Q1 0 D2 1
'''.lstrip())

        QREL_DEFS = {}

        qrels = TrecQrels(mock_file, QREL_DEFS)
        with self.assertRaises(RuntimeError):
            list(qrels.qrels_iter())
Ejemplo n.º 5
0
    def test_queries(self):
        mock_file = StringFile('''
<top>

<num> Number: Q100A 
<title>    Topic: Some title

<desc>  Description:  
Descriptive text
split on multiple lines

<narr> Narrative: 
Further elaboration of the query intent
split on multiple lines

</top>

<top>

<num> 102 
<title> Query 2

<desc>
Q2 description

<narr> Narrative: 
Q2 narrative

</top>
'''.lstrip())
        expected_results = [
            TrecQuery(
                'Q100A', 'Some title',
                "Descriptive text\nsplit on multiple lines",
                'Further elaboration of the query intent\nsplit on multiple lines'
            ),
            TrecQuery('102', 'Query 2', "Q2 description", 'Q2 narrative'),
        ]

        queries = TrecQueries(mock_file)
        self.assertEqual(queries.queries_path(), 'MOCK')
        self.assertEqual(list(queries.queries_iter()), expected_results)
Ejemplo n.º 6
0
    def test_too_few_columns(self):
        data_type = namedtuple('data_type', ['doc_id', 'field1', 'field2'])
        mock_file = StringFile('''
123\tsome field\tanother field
123\ttoo few fields

456\tanother query\tsomething
'''.strip())

        queries = TsvQueries(mock_file, data_type)
        with self.assertRaises(RuntimeError):
            list(queries.queries_iter())

        docs = TsvDocs(mock_file, data_type)
        with self.assertRaises(RuntimeError):
            list(docs.docs_iter())

        docpairs = TsvDocPairs(mock_file, data_type)
        with self.assertRaises(RuntimeError):
            list(docpairs.docpairs_iter())
Ejemplo n.º 7
0
    def test_qrels(self):
        mock_file = StringFile('''
Q0 0 D1 3
Q0 1 D2   2

Q0 0\tD3 3
Q0 1 D2 1
Q1 0 D2 1
'''.lstrip())
        QREL_DEFS = {}
        expected_results = [
            TrecQrel('Q0', 'D1', 3, '0'),
            TrecQrel('Q0', 'D2', 2, '1'),
            TrecQrel('Q0', 'D3', 3, '0'),
            TrecQrel('Q0', 'D2', 1, '1'),
            TrecQrel('Q1', 'D2', 1, '0'),
        ]

        qrels = TrecQrels(mock_file, QREL_DEFS)
        self.assertEqual(qrels.qrels_path(), 'MOCK')
        self.assertEqual(qrels.qrels_defs(), QREL_DEFS)
        self.assertEqual(list(qrels.qrels_iter()), expected_results)