class NTCIRConverterTest(unittest.TestCase):
    """Tests for NTCIRConverter against the bundled NTCIR sample data."""

    # Root of the NTCIR fixture tree (data/ntcir).
    source_dir: Path = data_dir.joinpath('ntcir')

    def __init__(self, *args, **kwargs):
        # Fixtures are loaded once per test instance: the sample collection
        # file and the first query of each of the three sample query files.
        super(NTCIRConverterTest, self).__init__(*args, **kwargs)
        self.converter: NTCIRConverter = NTCIRConverter()
        self.test_file: Path = NTCIRConverterTest.source_dir.joinpath(
            'orig/collection/sample.txt')
        self.docs: List[ColDocument] = list(
            self.converter.to_document(self.test_file))
        self.queries: List[QueryDocument] = [
            self.converter.to_query_dump(
                NTCIRConverterTest.source_dir.joinpath(
                    f'orig/query/100{i}.xml'))[0]
            for i in range(1, 4)
        ]

    def test_get_title(self):
        """Title is extracted from the sample collection document."""
        assert self.docs[
            0].title.value == 'Process for making improved corrosion preventive zinc cyanamide'

    def test_get_docid(self):
        """Document ID is extracted from the sample collection document."""
        assert self.docs[0].docid.value == '199305176894'

    def test_get_tags(self):
        """IPC tags are extracted from the sample collection document."""
        self.assertListEqual(self.docs[0].tags.value, ['C01C'])

    def test_get_text(self):
        """Body text starts with the expected words."""
        assert self.docs[0].text.value.split(
        )[:3] == 'The invention will'.split()

    def test_get_paras(self):
        """Query paragraphs are split at the expected boundaries."""
        assert self.queries[0].paras[0].split(
        )[:3] == 'DETAILED DESCRIPTION OF'.split()
        assert self.queries[0].paras[1].split(
        )[:3] == 'On the contrary,'.split()
def dump_query(self) -> None:
    """Convert every query file of the dataset and dump them as one JSON file.

    Gathers the converter output for all files from
    ``self.dataset.iter_query_files()``, wraps the flat list in a
    ``QueryDataset`` and writes its dict form to
    ``data/<name>/query/dump.json``.
    """
    # Flatten with a nested comprehension instead of sum(list_of_lists, []),
    # which copies the accumulator on every step (quadratic in #files).
    qlist: List[QueryDocument] = [
        query
        for fpath in self.dataset.iter_query_files()
        for query in self.dataset.converter.to_query_dump(fpath)
    ]
    dic: Dict = QueryDataset(name=self.name, queries=qlist).to_dict()
    fpath: Path = data_dir.joinpath(f'{self.name}/query/dump.json')
    with open(fpath, 'w') as fout:
        json.dump(dic, fout)
def to_xml_root(self, docid: str) -> ET.Element:
    """Parse and return the CLEF collection XML root for a document.

    Parameters
    ----------
    docid
        EP... *with* '-' e.g. EP-0050001-A2
    """
    # The collection is sharded by successive slices of the numeric part
    # of the docid: one digit, then three 2-character directory levels.
    shard1 = docid[3]
    shard2 = docid[4:6]
    shard3 = docid[6:8]
    shard4 = docid[8:10]
    fpath: Path = data_dir.joinpath(
        f'clef/orig/collection/00000{shard1}/{shard2}/{shard3}/{shard4}/{docid}.xml'
    )
    return ET.parse(str(fpath.resolve())).getroot()
def __init__(self, *args, **kwargs):
    """Load the two sample AAN papers as both documents and queries."""
    super(AANConverterTest, self).__init__(*args, **kwargs)
    self.converter: AANConverter = AANConverter()
    self.source_dir: Path = data_dir.joinpath('aan/orig/collection')
    # Same fixture files feed both the document and the query fixtures.
    sample_paths = [
        self.source_dir.joinpath(f'{docid}.txt')
        for docid in ['D07-1026', 'D07-1016']
    ]
    self.docs: List[ColDocument] = list(
        flatten([self.converter.to_document(path) for path in sample_paths]))
    self.queries: List[QueryDocument] = list(
        flatten([self.converter.to_query_dump(path) for path in sample_paths]))
def iter_orig_files(self) -> Generator[Path, None, None]:
    """Yield every original AAN collection file (data/aan/orig/collection/*.txt)."""
    # Plain strings: the f-prefixes carried no placeholders (ruff F541).
    return data_dir.joinpath('aan/orig/collection').glob('*.txt')
dataset: str) -> bool: body: Dict = { 'query': { 'match': { 'docid': docid } } } res = es.search(index=dataset, body=body) return len(res['hits']['hits']) > 0 if __name__ == '__main__': args = parser.parse_args() dataset: str = args.dataset[0] is_map: bool = args.mapping with open(data_dir.joinpath(f'{dataset}/en.qrel')) as fin: lines: List[str] = fin.read().splitlines() if is_map: with open(data_dir.joinpath(f'{dataset}/name_mapping.json')) as fin: mapping: Dict[str, str] = json.load(fin) with open(data_dir.joinpath(f'{dataset}/en.valid.qrel'), 'w') as fout: for line in lines: items: List[str] = line.split() docid: str = items[2].replace('-', '') if check_existence(docid, dataset): query_docid: str = mapping[items[0]].replace('-', '') if is_map else items[0] fout.write(f'{query_docid} {items[1]} {docid} {items[3]}\n')
from dataclasses import dataclass import logging from pathlib import Path import sys from typing import Dict, List, Tuple import xml.etree.ElementTree as ET import nltk from docsim.elas import models from docsim.converters.base import (Converter) from docsim.models import ColDocument, ColParagraph, QueryDocument from docsim.settings import data_dir logger = logging.getLogger(__file__) with open(data_dir.joinpath('aan/orig/citations.txt'), 'r') as fin: title_dic: Dict[str, str] = { line[0]: line[1] for line in [l.split('\t') for l in fin.read().splitlines()] } @dataclass class AANConverter(Converter): def _get_paragraph_list(self, root: ET.Element) -> List[str]: pass def _get_info(self, fpath: Path) -> Tuple[str, List[str], str, str]: docid: str = fpath.stem tags: List[str] = [ docid.split('-')[0],
def dump_path_from_name(cls, name: str) -> Path:
    """Return the path of the FAISS index dump for the given index name."""
    return data_dir.joinpath(name + '.faiss')
def iter_query_files(self) -> Generator[Path, None, None]:
    """Yield every NTCIR query file under data/ntcir/orig/query (recursive)."""
    # Plain strings: the f-prefixes carried no placeholders (ruff F541).
    return data_dir.joinpath('ntcir/orig/query').glob('**/*')
def mapping_fpath(self) -> Path:
    """Return the path of the NTCIR query-name mapping JSON file."""
    # Plain string: the f-prefix carried no placeholder (ruff F541).
    return data_dir.joinpath('ntcir/name_mapping.json')
def iter_orig_files(self) -> Generator[Path, None, None]:
    """Yield every original CLEF collection XML file (recursive)."""
    # Plain strings: the f-prefixes carried no placeholders (ruff F541).
    return data_dir.joinpath('clef/orig/collection').glob('**/*.xml')
def _get_dump_path(cls, name: str) -> Path:
    """Return the query dump JSON path for the dataset called *name*."""
    # Equivalent to joining the single string f'{name}/query/dump.json'.
    return data_dir.joinpath(name, 'query', 'dump.json')