Example #1
    def load_data(
            cls,
            subset_name: str = 'train',
            task_name: str = 'ner',
            shuffle: bool = True) -> Tuple[List[List[str]], List[List[str]]]:
        """

        """
        corpus_path = get_file(cls.__corpus_name__,
                               cls.__zip_file__name,
                               cache_dir=k.DATA_PATH,
                               untar=True)

        if subset_name not in {'train', 'test', 'valid'}:
            raise ValueError(
                f"subset_name must be one of {{'train', 'test', 'valid'}}, got {subset_name!r}")

        file_path = os.path.join(corpus_path, f'{subset_name}.txt')

        if task_name not in {'pos', 'chunking', 'ner'}:
            raise ValueError(
                f"task_name must be one of {{'pos', 'chunking', 'ner'}}, got {task_name!r}")

        data_index = ['pos', 'chunking', 'ner'].index(task_name) + 1

        x_data, y_data = DataReader.read_conll_format_file(
            file_path, label_index=data_index)
        if shuffle:
            x_data, y_data = utils.unison_shuffled_copies(x_data, y_data)
        logging.debug(
            f"loaded {len(x_data)} samples from {file_path}. Sample:\n"
            f"x[0]: {x_data[0]}\n"
            f"y[0]: {y_data[0]}")
        return x_data, y_data
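
This variant picks the label column by task: in the CoNLL file, column 1 holds POS tags, column 2 chunking tags, and column 3 NER tags, which is what data_index computes above. A minimal usage sketch, assuming the method belongs to a kashgari-style corpus class; the class name CONLL2003ENCorpus is an assumption based on the task set, not something shown in the snippet:

    # Hypothetical class name -- only the method body is shown above.
    from kashgari.corpus import CONLL2003ENCorpus

    # task_name selects the CoNLL column: 1 = pos, 2 = chunking, 3 = ner.
    train_x, train_y = CONLL2003ENCorpus.load_data('train', task_name='ner')
    valid_x, valid_y = CONLL2003ENCorpus.load_data('valid', task_name='pos', shuffle=False)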
Example #2
    def load_data(
            cls,
            subset_name: str = 'train',
            shuffle: bool = True) -> Tuple[List[List[str]], List[List[str]]]:
        """
        Load the dataset in sequence-labeling format, tokenized at character level.

        Args:
            subset_name: one of {train, test, valid}
            shuffle: whether to shuffle the data, defaults to True.

        Returns:
            dataset features and dataset labels
        """
        corpus_path = get_file(cls.__corpus_name__,
                               cls.__zip_file__name,
                               cache_dir=K.DATA_PATH,
                               untar=True)

        if subset_name == 'train':
            file_path = os.path.join(corpus_path, 'example.train')
        elif subset_name == 'test':
            file_path = os.path.join(corpus_path, 'example.test')
        else:
            file_path = os.path.join(corpus_path, 'example.dev')

        x_data, y_data = DataReader.read_conll_format_file(file_path)
        if shuffle:
            x_data, y_data = utils.unison_shuffled_copies(x_data, y_data)
        logger.debug(
            f"loaded {len(x_data)} samples from {file_path}. Sample:\n"
            f"x[0]: {x_data[0]}\n"
            f"y[0]: {y_data[0]}")
        return x_data, y_data
Example #3
    def load_data(
            self,
            subset_name: str = 'train',
            shuffle: bool = True) -> Tuple[List[List[str]], List[List[str]]]:
        """
        Load the dataset in sequence-labeling format, tokenized at character level.

        Args:
            subset_name: one of {train, test, valid}
            shuffle: whether to shuffle the data, defaults to True.

        Returns:
            dataset features and dataset labels
        """

        df = pd.read_csv(self.file_path)
        df = df[:self.sample_count]
        df['y'] = df.apply(self._extract_label, axis=1)
        df['x'] = df['comment_text'].apply(self._text_process)
        df = df[['x', 'y']]
        if subset_name == 'train':
            df = df.loc[self.train_ids]
        elif subset_name == 'valid':
            df = df.loc[self.valid_ids]
        else:
            df = df.loc[self.test_ids]

        xs, ys = list(df['x'].values), list(df['y'].values)
        if shuffle:
            xs, ys = utils.unison_shuffled_copies(xs, ys)
        return xs, ys
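
This loader depends on instance state (file_path, sample_count and the train/valid/test id lists) plus two helpers the snippet does not show. A minimal sketch of what those helpers might look like, assuming a Jigsaw-style toxic-comment CSV; the 'toxic' column, the label names, and whitespace tokenization are all assumptions:

    # Hypothetical helper sketches -- the real implementations are not shown above.
    def _extract_label(self, row) -> str:
        # Called via df.apply(..., axis=1), so it receives one row at a time.
        return 'toxic' if row['toxic'] == 1 else 'ok'

    def _text_process(self, text: str) -> List[str]:
        # Assumes lower-cased whitespace tokenization is enough here.
        return text.lower().split()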
Example #4
    def test_unison_shuffled_copies(self):
        x: np.ndarray = np.random.randint(0, 10, size=(100, 5))
        y: np.ndarray = np.random.randint(0, 10, size=(100, ))

        new_x, new_y = unison_shuffled_copies(x, y)
        assert new_x.shape == x.shape
        assert new_y.shape == y.shape
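
The test pins down the contract of unison_shuffled_copies: both inputs keep their shape and are reordered by one shared permutation, so x[i] and y[i] stay paired. A minimal numpy sketch of such a function, which may differ from the library's actual implementation:

    import numpy as np

    def unison_shuffled_copies(a, b):
        # One permutation applied to both arrays keeps the rows aligned.
        assert len(a) == len(b)
        p = np.random.permutation(len(a))
        return np.asarray(a)[p], np.asarray(b)[p]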
Example #5
    def load_data(cls,
                  subset_name: str = 'train',
                  shuffle: bool = True,
                  cutter: str = 'char') -> Tuple[List[List[str]], List[str]]:
        """
        Load the dataset in sequence-classification format, char-level tokenized by default.

        features: ``[['听', '新', '闻', '。'], ['电', '视', '台', '在', '播', '什', '么'], ...]``

        labels: ``['news', 'epg', ...]``

        Samples::

            train_x, train_y = SMP2018ECDTCorpus.load_data('train')
            test_x, test_y = SMP2018ECDTCorpus.load_data('test')

        Args:
            subset_name: one of {train, test, valid}
            shuffle: whether to shuffle the data, defaults to True.
            cutter: sentence cutter, one of {char, jieba}

        Returns:
            dataset features and dataset labels
        """

        corpus_path = get_file(cls.__corpus_name__,
                               cls.__zip_file__name,
                               cache_dir=k.DATA_PATH,
                               untar=True)

        if cutter not in {'char', 'jieba'}:
            raise ValueError(
                f"cutter must be one of {{'char', 'jieba'}}, got {cutter!r}")

        df_path = os.path.join(corpus_path, f'{subset_name}.csv')
        df = pd.read_csv(df_path)
        if cutter == 'jieba':
            try:
                import jieba
            except ModuleNotFoundError:
                raise ModuleNotFoundError(
                    "please install jieba, `$ pip install jieba`")
            x_data = [list(jieba.cut(item)) for item in df['query'].to_list()]
        elif cutter == 'char':
            x_data = [list(item) for item in df['query'].to_list()]
        y_data = df['label'].to_list()

        if shuffle:
            x_data, y_data = utils.unison_shuffled_copies(x_data, y_data)
        logging.debug(f"loaded {len(x_data)} samples from {df_path}. Sample:\n"
                      f"x[0]: {x_data[0]}\n"
                      f"y[0]: {y_data[0]}")
        return x_data, y_data
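
A short usage sketch showing how the cutter argument changes tokenization; the example outputs are illustrative:

    # Character-level tokens, e.g. ['听', '新', '闻', '。']
    train_x, train_y = SMP2018ECDTCorpus.load_data('train', cutter='char')

    # Word-level tokens via jieba, e.g. ['听', '新闻', '。'] (requires: pip install jieba)
    train_x, train_y = SMP2018ECDTCorpus.load_data('train', cutter='jieba')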
Example #6
    def load_data(
            cls,
            subset_name: str = 'train',
            shuffle: bool = True) -> Tuple[List[List[str]], List[List[str]]]:
        """
        Load the dataset in sequence-labeling format, tokenized at character level.

        features: ``[['海', '钓', '比', '赛', '地', '点', '在', '厦', '门', ...], ...]``

        labels: ``[['O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'I-LOC', ...], ...]``

        Sample::

            train_x, train_y = ChineseDailyNerCorpus.load_data('train')
            test_x, test_y = ChineseDailyNerCorpus.load_data('test')

        Args:
            subset_name: one of {train, test, valid}
            shuffle: whether to shuffle the data, defaults to True.

        Returns:
            dataset features and dataset labels
        """
        corpus_path = get_file(cls.__corpus_name__,
                               cls.__zip_file__name,
                               cache_dir=k.DATA_PATH,
                               untar=True)

        if subset_name == 'train':
            file_path = os.path.join(corpus_path, 'example.train')
        elif subset_name == 'test':
            file_path = os.path.join(corpus_path, 'example.test')
        else:
            file_path = os.path.join(corpus_path, 'example.dev')

        x_data, y_data = DataReader.read_conll_format_file(file_path)
        if shuffle:
            x_data, y_data = utils.unison_shuffled_copies(x_data, y_data)
        logging.debug(
            f"loaded {len(x_data)} samples from {file_path}. Sample:\n"
            f"x[0]: {x_data[0]}\n"
            f"y[0]: {y_data[0]}")
        return x_data, y_data
Example #7
def load_data(subset_name='train', shuffle=True):
    """
    Load the dataset in sequence-labeling format, tokenized at character level.

    Args:
        subset_name: one of {train, test, valid}
        shuffle: whether to shuffle the data, defaults to True.

    Returns:
        dataset features and dataset labels
    """

    if subset_name == 'train':
        file_path = '../../data/ChineseDailyNerCorpus/example.train'
    elif subset_name == 'test':
        file_path = '../../data/ChineseDailyNerCorpus/example.test'
    else:
        file_path = '../../data/ChineseDailyNerCorpus/example.dev'

    x_data, y_data = DataReader.read_conll_format_file(file_path)
    if shuffle:
        x_data, y_data = utils.unison_shuffled_copies(x_data, y_data)

    return x_data, y_data
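
A brief usage sketch for this standalone variant; it assumes the relative data paths hard-coded in the function body exist:

    train_x, train_y = load_data('train')
    valid_x, valid_y = load_data('valid')
    test_x, test_y = load_data('test', shuffle=False)
    print(f"train: {len(train_x)}, valid: {len(valid_x)}, test: {len(test_x)}")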
Example #8
Train a Chinese NER model covering the ORG, LOC, PER and TIME entity types
'''

import kashgari
from kashgari.corpus import DataReader
from kashgari.embeddings import BERTEmbedding
from kashgari.tasks.labeling import BiLSTM_CRF_Model
from kashgari import utils

kashgari.config.use_cudnn_cell = False

train_x, train_y = DataReader.read_conll_format_file('data/data_all/example.train')
valid_x, valid_y = DataReader.read_conll_format_file('data/data_all/example.dev')
test_x, test_y = DataReader.read_conll_format_file('data/data_all/example.test')

train_x, train_y = utils.unison_shuffled_copies(train_x, train_y)
valid_x, valid_y = utils.unison_shuffled_copies(valid_x, valid_y)
test_x, test_y = utils.unison_shuffled_copies(test_x, test_y)

print(f"train data count: {len(train_x)}")
print(f"validate data count: {len(valid_x)}")
print(f"test data count: {len(test_x)}", test_x[0], test_y[0])

bert_embedding = BERTEmbedding('chinese_wwm_ext_L-12_H-768_A-12',
                               task=kashgari.LABELING,
                               sequence_length=100)

model = BiLSTM_CRF_Model(bert_embedding)
model.fit(train_x, train_y, valid_x, valid_y, batch_size=512, epochs=20)

model.save('models/all_ner.h5')
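
After training, the saved model can be reloaded for inference. A minimal sketch, assuming kashgari 1.x's load_model utility and the labeling model's predict API:

    from kashgari.utils import load_model

    # Reload the trained model and tag one char-tokenized sentence.
    loaded_model = load_model('models/all_ner.h5')
    tokens = list('海钓比赛地点在厦门')
    print(loaded_model.predict([tokens]))  # e.g. [['O', ..., 'B-LOC', 'I-LOC']]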