def _load_xlsx(self, file_path: str):
    """
    Data loader for XLSX, this needs to be converted back to YML structure format

    The workbook returned by ``utils.excel_load`` is read sheet by sheet
    (``phase``, ``source``, ``prepare``, ``train``, ``config``); every sheet is
    iterated as a sequence of dict-like rows. The rebuilt nested mapping is
    handed to ``self.parse``. Rows of the workbook are updated in place
    (``args`` of phase rows, missing ``idx`` of train rows).

    :param file_path: XLSX file path
    """
    wb = utils.excel_load(file_path=file_path)
    data = odict()
    # TODO: add validator, if wrong data input

    def item_key(row, items):
        # An empty idx cell is treated the same whether the key is missing or
        # holds None (the train sheet below uses the same rule): fall back to
        # the number of items collected so far.
        idx = row.get('idx')
        return str(len(items) if idx is None else idx)

    def parse_args(raw):
        # "k1=v1, k2=v2" -> odict(k1='v1', k2='v2'). Fragments without '=' are
        # skipped; only the first '=' separates key from value, so values may
        # themselves contain '='. A None/empty cell yields no arguments.
        args = odict()
        for pair in (raw or '').split(','):
            key, sep, value = pair.partition('=')
            if sep:
                args[key.strip()] = value.strip()
        return args

    # parse phase / source / prepare: same "items keyed by idx" layout,
    # only phase rows carry an args string that needs expanding
    for sheet in ('phase', 'source', 'prepare'):
        items = odict()
        for row in wb.get(sheet, []):
            key = item_key(row, items)
            if sheet == 'phase':
                row['args'] = parse_args(row.get('args'))
            items[key] = row
        data[sheet] = odict()
        data[sheet]['items'] = items

    # parse train
    train_rows = wb.get('train', [])
    train_items = odict()
    data['train'] = odict()
    data['train']['items'] = train_items

    # lets ensure there is idx: rows with text open a new train item ("N"),
    # rows without text are gold entries of the current item ("N.M")
    train_idx, gold_idx, seen_text = 0, 0, False
    for row in train_rows:
        if row.get('text') is not None:
            # Advance for every text row after the first one, so text rows
            # without gold rows in between do not end up sharing one idx.
            if seen_text or gold_idx > 0:
                train_idx += 1
            gold_idx = 0
            seen_text = True
            if row.get('idx') is None:
                row['idx'] = str(train_idx)
        else:
            if row.get('idx') is None:
                row['idx'] = '%s.%s' % (train_idx, gold_idx)
            gold_idx += 1

    for row in train_rows:
        idx = str(row.get('idx'))
        # "N.M" belongs to train item "N"; an idx without a dot is its own key
        train_key = idx
        if '.' in idx:
            train_key, _ = idx.split('.')
        if row.get('text') is not None:
            # add train list
            entry = odict()
            entry['items'] = odict()
            for k in ['idx', 'text']:
                entry[k] = row.get(k)
            train_items[train_key] = entry
        else:
            # the parent text row must already have been added above,
            # otherwise this lookup fails
            entry = train_items[train_key]
            gold = odict()
            for k in ['idx', 'subtext', 'offset', 'entity']:
                gold[k] = row.get(k)
            entry['items'][idx] = gold

    # parse config
    data['config'] = odict()
    for row in wb.get('config', []):
        data['config'][row.get('name')] = row.get('value')

    self.parse(data=data)
def _prepare_init_file(self, prepare: Prepare):
    """
    Initialise the prepare entries listed in an external workbook.

    ``prepare.value`` is resolved with ``self.resolve_path`` and loaded through
    ``utils.excel_load``; every row of its ``prepare`` sheet is turned into a
    ``Prepare`` via ``Prepare.make`` and passed to ``self._prepare_init_base``.

    :param prepare: prepare entry whose ``value`` points at the workbook
    """
    workbook_path = self.resolve_path(prepare.value)
    rows = utils.excel_load(file_path=workbook_path).get('prepare', [])
    for row in rows:
        # build a fresh Prepare per row instead of re-binding the parameter
        self._prepare_init_base(prepare=Prepare.make(items=row))