def test_readme_04(self):
    """
    Test: code snippet found in README.rst

    Executes two workbooks in turn and, after each run, checks the gold
    annotation stored under train item '1' / gold item '1.1'.
    """
    # first workbook: 'Himalayas' is expected to be annotated as PRODUCT
    excelcy = ExcelCy.execute(file_path=self.get_test_data_path(fs_path='test_data_05.xlsx'))
    gold = excelcy.storage.train.items.get('1').items.get('1.1')
    # separate asserts so a failure shows which field did not match
    assert gold.subtext == 'Himalayas'
    assert gold.entity == 'PRODUCT'

    # second workbook (test_data_05a): the same span is expected to be
    # annotated as FAC this time
    excelcy = ExcelCy.execute(file_path=self.get_test_data_path(fs_path='test_data_05a.xlsx'))
    gold = excelcy.storage.train.items.get('1').items.get('1.1')
    assert gold.subtext == 'Himalayas'
    assert gold.entity == 'FAC'
def train_excelcy(save=False):
    """
    Execute ExcelCy on 'train_model.xlsx' and print the entities found in book 1.

    After ``add_stopwords`` is applied to the nlp object, the workbook under
    ``constants.MODEL_DATA_DIR`` is executed. When ``save`` is truthy,
    ``save_nlp`` is called with ``constants.MODEL_DIR``. The content of book
    number 1 is then processed, and the distinct SHIP texts (with every
    'the ' / 'The ' occurrence removed) and the distinct PERSON texts are
    printed as two sets.
    """
    excelcy = ExcelCy()
    add_stopwords(excelcy.nlp)

    workbook = constants.MODEL_DATA_DIR / 'train_model.xlsx'
    excelcy.execute(str(workbook))
    if save:
        excelcy.save_nlp(str(constants.MODEL_DIR))

    book_text = load_book_by_nr(1).content()
    doc = excelcy.nlp(book_text)

    # collect both entity groups in one pass over the recognised entities
    ships = set()
    persons = set()
    for ent in doc.ents:
        if ent.label_ == 'SHIP':
            ships.add(re.sub('[tT]he ', '', ent.text))
        elif ent.label_ == 'PERSON':
            persons.add(ent.text)

    print(ships)
    print(persons)
def test_readme_02(self):
    """
    Test: code snippet found in README.rst

    After executing test_data_01.xlsx, the first entity found in the sample
    sentence must be labelled ORG.
    """
    data_path = self.get_test_data_path(fs_path='test_data_01.xlsx')
    excelcy = ExcelCy.execute(file_path=data_path)

    doc = excelcy.nlp('Google rebrands its business apps')
    first_ent = doc.ents[0]
    assert first_ent.label_ == 'ORG'
def test_save(self):
    """
    Test: save training

    Executes test_data_01.xlsx, saves to a temporary path and loads the
    saved file back. There are no explicit assertions; the test passes when
    none of the calls raises.
    """
    source_path = self.get_test_data_path(fs_path='test_data_01.xlsx')
    excelcy = ExcelCy.execute(file_path=source_path)

    tmp_path = self.get_test_tmp_path(fs_path='test_data_01.xlsx')
    excelcy.save(file_path=tmp_path)
    excelcy.load(file_path=tmp_path)
def assert_training(self, file_path: str, entity_tests: dict = None):
    """
    Execute the workbook at ``file_path`` and check the predicted entities.

    For every train item in storage, each (text, label) pair found by the
    nlp object in the item's text must be one of that item's gold
    (subtext, entity) pairs.

    ``entity_tests`` is accepted but not used by this implementation.
    """
    excelcy = ExcelCy.execute(file_path=file_path)
    nlp = excelcy.nlp

    for _idx, train in excelcy.storage.train.items.items():
        # gold annotations recorded for this train item
        expected = {(gold.subtext, gold.entity) for _key, gold in train.items.items()}

        # distinct entities actually found in the same text
        doc = nlp(train.text)
        predicted = {(ent.text, ent.label_) for ent in doc.ents}

        for pair in predicted:
            assert pair in expected
def test_save(self):
    """
    Test: save training

    The storage extracted before saving must equal the storage extracted
    after loading the saved file back.
    """
    source_path = self.get_test_data_path(fs_path='test_data_01.xlsx')
    excelcy = ExcelCy.execute(file_path=source_path)

    # write the storage out and snapshot it
    tmp_path = self.get_test_tmp_path(fs_path='test_data_01.xlsx')
    excelcy.save_storage(file_path=tmp_path)
    before = self.extract_storage(storage=excelcy.storage)

    # read the saved file back and snapshot again
    excelcy.load(file_path=tmp_path)
    after = self.extract_storage(storage=excelcy.storage)

    assert before == after
def assert_training(self, file_path: str, entity_tests: dict = None):
    """
    Execute the workbook at ``file_path`` and check the predicted entities.

    For every train item in storage, all of the item's gold
    (subtext, entity) pairs must be among the (text, label) pairs found in
    the item's text. If ``entity_tests`` has an entry for the item's index,
    that entry must also be a subset of the found pairs.
    """
    excelcy = ExcelCy.execute(file_path=file_path)
    nlp = excelcy.nlp
    extra_checks = entity_tests or {}

    for idx, train in excelcy.storage.train.items.items():
        expected = {(gold.subtext, gold.entity) for _key, gold in train.items.items()}
        doc = nlp(train.text)
        predicted = {(ent.text, ent.label_) for ent in doc.ents}

        # every gold annotation from the data must have been found
        assert expected.issubset(predicted)

        # caller-supplied expectations for this item, if any
        test = extra_checks.get(idx, set())
        assert test <= predicted
from excelcy import ExcelCy
from excelcy.storage import Config

# --- Disabled experiment, kept for reference --------------------------------
# test_string = 'Android Pay expands to Canada'
# excelcy = ExcelCy()
# excelcy.storage.config = Config(nlp_base='en_core_web_sm', train_iteration=50, train_drop=0.2)
# doc = excelcy.nlp(test_string)
# # showing no ORG
# print([(ent.label_, ent.text) for ent in doc.ents])
# excelcy.storage.source.add(kind='text', value=test_string)
# excelcy.discover()
# excelcy.storage.prepare.add(kind='phrase', value='Android Pay', entity='PRODUCT')
# excelcy.prepare()
# excelcy.train()
# doc = excelcy.nlp(test_string)
# print([(ent.label_, ent.text) for ent in doc.ents])

# --- Recorded test failures (pasted test-run output) -------------------------
# FAILED tests/test_excelcy.py::ExcelCyTestCase::test_execute - AssertionError: assert ('$1', 'MONEY') in {('$1 million', 'MONEY'), ('Uber', 'ORG')}
# FAILED tests/test_pipe.py::PipeTestCase::test_execute - AssertionError: assert ('$1', 'MONEY') in {('$1 million', 'MONEY'), ('Uber', 'ORG')}
# FAILED tests/test_readme.py::ReadmeTestCase::test_readme_04 - AssertionError: assert ('China' == 'Himalayas'

# the same sentence is analysed twice so the two printed lists can be compared
sentence = 'Android Pay expands to Canada'

# 1) entities from a freshly constructed ExcelCy
excelcy = ExcelCy()
doc = excelcy.nlp(sentence)
print([(ent.label_, ent.text) for ent in doc.ents])

# 2) entities after executing the test_data_03 workbook
excelcy = ExcelCy.execute(file_path='tests/data/test_data_03.xlsx')
doc = excelcy.nlp(sentence)
print([(ent.label_, ent.text) for ent in doc.ents])
def main(argv: list = None):
    """
    Quick CLI execution.

    :param argv: Argument list shaped like ``sys.argv`` (program name first,
        then the command and its arguments). Falls back to ``sys.argv`` when
        omitted or empty.

    The only command handled is ``execute <file_path>``, which runs
    ``ExcelCy.execute`` on the given file. Any other command is ignored.
    If the command or the file path is missing, a usage line is written to
    stderr and the function returns instead of raising ``IndexError``.
    """
    args = argv or sys.argv
    prog = args[0] if args else 'excelcy'
    usage = 'usage: {} execute <file_path>'.format(prog)

    # no command given at all
    if len(args) < 2:
        print(usage, file=sys.stderr)
        return

    if args[1] == 'execute':
        # 'execute' needs the path of the file to process
        if len(args) < 3:
            print(usage, file=sys.stderr)
            return
        excelcy = ExcelCy.execute(file_path=args[2])