Exemple #1
0
    def test_e001(self):
        """ Test: Error code E001

        Loading ``not_exist.xlsx`` must raise ``ValueError`` whose message
        equals ``Errors.E001``.
        """

        # Build the instance outside the raises-block so that only load()
        # can satisfy the expected exception; a ValueError raised by the
        # constructor would otherwise be mistaken for the error under test.
        excelcy = ExcelCy()
        with pytest.raises(ValueError) as excinfo:
            excelcy.load(file_path='not_exist.xlsx')

        assert str(excinfo.value) == Errors.E001
Exemple #2
0
    def test_readme_04(self):
        """ Test: README.rst snippet -- gold entity read from two workbooks """

        def load_gold(fs_path):
            # execute the workbook, then fetch gold item '1.1' of train item '1'
            workbook = self.get_test_data_path(fs_path=fs_path)
            result = ExcelCy.execute(file_path=workbook)
            return result.storage.train.items.get('1').items.get('1.1')

        # first workbook: Himalayas is annotated as PRODUCT
        gold = load_gold('test_data_05.xlsx')
        assert gold.subtext == 'Himalayas'
        assert gold.entity == 'PRODUCT'

        # second workbook: the same gold item is asserted to be FAC
        gold = load_gold('test_data_05a.xlsx')
        assert gold.subtext == 'Himalayas'
        assert gold.entity == 'FAC'
Exemple #3
0
def prepare_excelcy_data():
    """Assemble the ExcelCy storage and save it as ``train_model.xlsx``.

    Configures the storage, registers one text source and five prepare
    workbooks, runs the discover and prepare steps, records the phase list
    and writes the storage under ``constants.MODEL_DATA_DIR``.
    """
    excelcy = ExcelCy()
    add_stopwords(excelcy.nlp)

    excelcy.storage.config = Config(
        nlp_base='en_core_web_lg', train_iteration=20, train_drop=0.2)
    excelcy.storage.base_path = str(constants.MODEL_DATA_DIR)
    excelcy.storage.source.add(
        kind='textract', value='[base_path]/source/training_text.txt')
    excelcy.discover()

    # Prepare workbooks, registered in this exact order with an empty entity.
    prepare_files = (
        '[base_path]/prepare/pers.xlsx',
        '[base_path]/prepare/orgs.xlsx',
        '[base_path]/prepare/locs.xlsx',
        '[base_path]/prepare/ships.xlsx',
        '[base_path]/prepare/misc.xlsx',
    )
    for prepare_file in prepare_files:
        excelcy.storage.prepare.add(
            kind='file', value=prepare_file, entity='')
    excelcy.prepare()

    # Phases recorded in the saved storage.
    for phase_name in ('discover', 'prepare', 'train', 'retest'):
        excelcy.storage.phase.add(phase_name)

    # The prepare step has already been run above, so it is switched off in
    # the configuration that gets saved.
    excelcy.storage.config.prepare_enabled = False
    output_path = constants.MODEL_DATA_DIR / 'train_model.xlsx'
    excelcy.save_storage(str(output_path))
    def test_readme_02(self):
        """ Test: README.rst snippet -- first entity of the sample is ORG """

        workbook = self.get_test_data_path(fs_path='test_data_01.xlsx')
        excelcy = ExcelCy.execute(file_path=workbook)
        # run the resulting pipeline on the sample sentence
        parsed = excelcy.nlp('Google rebrands its business apps')
        first_entity = parsed.ents[0]
        assert first_entity.label_ == 'ORG'
    def test_save(self):
        """ Test: save training, then load the saved file back """

        source_path = self.get_test_data_path(fs_path='test_data_01.xlsx')
        excelcy = ExcelCy.execute(file_path=source_path)
        # write to the tmp location and read the same file again; the test
        # passes as long as neither call raises
        target_path = self.get_test_tmp_path(fs_path='test_data_01.xlsx')
        excelcy.save(file_path=target_path)
        excelcy.load(file_path=target_path)
Exemple #6
0
 def assert_training(self, file_path: str, entity_tests: dict = None):
     """Execute the workbook and check predictions against its gold data.

     For every train item, each (text, label) pair predicted by the pipeline
     must be one of the item's gold (subtext, entity) pairs.
     ``entity_tests`` is accepted but not used by this variant.
     """
     excelcy = ExcelCy.execute(file_path=file_path)
     pipeline = excelcy.nlp
     for _, train in excelcy.storage.train.items.items():
         gold_pairs = {(gold.subtext, gold.entity)
                       for _, gold in train.items.items()}
         parsed = pipeline(train.text)
         predicted_pairs = {(ent.text, ent.label_) for ent in parsed.ents}
         # no prediction may fall outside the gold annotations
         assert predicted_pairs <= gold_pairs
Exemple #7
0
    def test_readme_03(self):
        """ Test: README.rst snippet -- train from a text source and a phrase """

        excelcy = ExcelCy()

        # describe the run: base path, model settings, one source, one phrase
        storage = excelcy.storage
        storage.base_path = self.test_data_path
        storage.config = Config(
            nlp_base='en_core_web_sm', train_iteration=2, train_drop=0.2)
        storage.source.add(kind='textract', value='source/source_01.txt')
        storage.prepare.add(kind='phrase', value='Uber', entity='ORG')

        # run the three steps in order
        excelcy.discover()
        excelcy.prepare()
        excelcy.train()

        parsed = excelcy.nlp('Uber blew through $1 million a week')
        assert parsed.ents[0].label_ == 'ORG'
Exemple #8
0
    def test_save(self):
        """ Test: saved storage reloads to the same extracted data """

        source_path = self.get_test_data_path(fs_path='test_data_01.xlsx')
        excelcy = ExcelCy.execute(file_path=source_path)
        target_path = self.get_test_tmp_path(fs_path='test_data_01.xlsx')
        excelcy.save_storage(file_path=target_path)

        # snapshot the storage, reload it from the saved file, snapshot again
        before = self.extract_storage(storage=excelcy.storage)
        excelcy.load(file_path=target_path)
        after = self.extract_storage(storage=excelcy.storage)

        assert before == after
    def test_matcher(self):
        """ Test: Matcher -- phrase and regex patterns both yield PRODUCT """

        excelcy = ExcelCy()
        excelcy.storage.config = Config(
            nlp_base='en_core_web_sm', train_iteration=2, train_drop=0.2)
        nlp = excelcy.create_nlp()

        # one literal phrase pattern and one regex pattern, same entity label
        phrase_pattern = dict(
            kind='phrase', value='thisisrandom', entity='PRODUCT')
        regex_pattern = dict(
            kind='regex', value='thatis(.+)', entity='PRODUCT')
        matcher = MatcherPipe(nlp=nlp, patterns=[phrase_pattern, regex_pattern])
        nlp.add_pipe(matcher)

        doc = nlp('thisisrandom thatisrandom')
        # checked in order: the second entity is only read if the first passes
        assert doc.ents[0].label_ == 'PRODUCT'
        assert doc.ents[1].label_ == 'PRODUCT'
 def assert_training(self, file_path: str, entity_tests: dict = None):
     """Execute the workbook and check that its gold data is predicted.

     For every train item, each gold (subtext, entity) pair must appear among
     the (text, label) pairs predicted by the pipeline. When ``entity_tests``
     holds an entry for the item's key, that entry must also be contained in
     the predictions.
     """
     excelcy = ExcelCy.execute(file_path=file_path)
     pipeline = excelcy.nlp
     requested = entity_tests or {}
     for idx, train in excelcy.storage.train.items.items():
         gold_pairs = {(gold.subtext, gold.entity)
                       for _, gold in train.items.items()}
         parsed = pipeline(train.text)
         predicted_pairs = {(ent.text, ent.label_) for ent in parsed.ents}
         # every gold annotation must be predicted
         assert gold_pairs <= predicted_pairs
         # caller-supplied expectations for this item, if any
         extra_pairs = requested.get(idx, set())
         assert extra_pairs <= predicted_pairs
Exemple #11
0
def train_excelcy(save=False):
    """Execute the ``train_model.xlsx`` workbook and print found entities.

    Prints the set of SHIP entity texts (with every ``'The '`` / ``'the '``
    occurrence removed) and the set of PERSON entity texts found in the
    content of book number 1.

    :param save: when truthy, also write the pipeline to ``constants.MODEL_DIR``.
    """
    excelcy = ExcelCy()
    add_stopwords(excelcy.nlp)
    # NOTE(review): the return value of execute() is discarded here, while
    # other callers use ``ExcelCy.execute(...)`` as a factory and keep the
    # returned object -- confirm that this instance is the one that ends up
    # trained.
    excelcy.execute(str(constants.MODEL_DATA_DIR / 'train_model.xlsx'))
    if save:
        excelcy.save_nlp(str(constants.MODEL_DIR))

    doc = excelcy.nlp(load_book_by_nr(1).content())

    # Collect both sets in one pass over the recognised entities.
    ships = set()
    persons = set()
    for ent in doc.ents:
        if ent.label_ == 'SHIP':
            ships.add(re.sub('[tT]he ', '', ent.text))
        elif ent.label_ == 'PERSON':
            persons.add(ent.text)

    print(ships)
    print(persons)
Exemple #12
0
from excelcy import ExcelCy
from excelcy.storage import Config

# test_string = 'Android Pay expands to Canada'
# excelcy = ExcelCy()
# excelcy.storage.config = Config(nlp_base='en_core_web_sm', train_iteration=50, train_drop=0.2)
# doc = excelcy.nlp(test_string)
# # showing no ORG
# print([(ent.label_, ent.text) for ent in doc.ents])
# excelcy.storage.source.add(kind='text', value=test_string)
# excelcy.discover()
# excelcy.storage.prepare.add(kind='phrase', value='Android Pay', entity='PRODUCT')
# excelcy.prepare()
# excelcy.train()
# doc = excelcy.nlp(test_string)
# print([(ent.label_, ent.text) for ent in doc.ents])

# FAILED tests/test_excelcy.py::ExcelCyTestCase::test_execute - AssertionError: assert ('$1', 'MONEY') in {('$1 million', 'MONEY'), ('Uber', 'ORG')}
# FAILED tests/test_pipe.py::PipeTestCase::test_execute - AssertionError: assert ('$1', 'MONEY') in {('$1 million', 'MONEY'), ('Uber', 'ORG')}
# FAILED tests/test_readme.py::ReadmeTestCase::test_readme_04 - AssertionError: assert ('China' == 'Himalayas'

# Entities reported for the sample sentence by a fresh ExcelCy instance.
excelcy = ExcelCy()
doc = excelcy.nlp('Android Pay expands to Canada')
print([(found.label_, found.text) for found in doc.ents])

# Same sentence after executing the test_data_03 workbook.
excelcy = ExcelCy.execute(file_path='tests/data/test_data_03.xlsx')
doc = excelcy.nlp('Android Pay expands to Canada')
print([(found.label_, found.text) for found in doc.ents])
Exemple #13
0
def main(argv: list = None):
    """Minimal command-line entry point.

    :param argv: argument list to use; when omitted or empty, ``sys.argv``
        is used instead. Index 1 is read as the command and index 2 as the
        workbook path, so index 0 is never read.
    """
    # quick CLI execution
    args = argv or sys.argv
    # the 'execute' command runs the workbook named by the next argument
    if args[1] == 'execute':
        excelcy = ExcelCy.execute(file_path=args[2])