Example #1
def post_process(pred, amr_version):
    pred = os.path.realpath(pred)
    utils_tar_gz = get_amr_utils(amr_version)
    util_dir = get_resource(utils_tar_gz)
    stog_home = get_resource(
        'https://github.com/jcyk/AMR-gs/archive/master.zip')
    with pushd(stog_home):
        run_cmd(
            f'python3 -u -m stog.data.dataset_readers.amr_parsing.postprocess.postprocess '
            f'--amr_path {pred} --util_dir {util_dir} --v 2')
    return pred + '.post'
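
`pushd` above (and in several examples below) is a chdir context manager from the library's io utilities; a minimal equivalent, written here as an assumption about its behavior rather than the actual implementation:

import os
from contextlib import contextmanager

@contextmanager
def pushd(new_dir):
    # Temporarily enter new_dir, restoring the previous working directory on exit.
    prev = os.getcwd()
    os.chdir(new_dir)
    try:
        yield prev
    finally:
        os.chdir(prev)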
Example #2
def convert_jsonlines_to_IOBES(json_file,
                               output_file=None,
                               doc_level_offset=True):
    json_file = get_resource(json_file)
    if not output_file:
        output_file = os.path.splitext(json_file)[0] + '.ner.tsv'
    with open(json_file) as src, open(output_file, 'w',
                                      encoding='utf-8') as out:
        for line in src:
            doc = json.loads(line)
            offset = 0
            for sent, ner in zip(doc['sentences'], doc['ner']):
                tags = ['O'] * len(sent)
                for start, end, label in ner:
                    if doc_level_offset:
                        start -= offset
                        end -= offset
                    if start == end:
                        tags[start] = 'S-' + label
                    else:
                        tags[start] = 'B-' + label
                        for i in range(start + 1, end + 1):
                            tags[i] = 'I-' + label
                        tags[end] = 'E-' + label
                offset += len(sent)
                for token, tag in zip(sent, tags):
                    out.write(f'{token}\t{tag}\n')
                out.write('\n')
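
The tagging loop above implements the IOBES scheme with inclusive span ends. A self-contained toy version of the same logic, useful for checking the tag layout (the function and sample data are illustrative, not part of the library):

def spans_to_iobes(tokens, spans):
    tags = ['O'] * len(tokens)
    for start, end, label in spans:  # end is inclusive, as in the example
        if start == end:
            tags[start] = 'S-' + label
        else:
            tags[start] = 'B-' + label
            for i in range(start + 1, end):
                tags[i] = 'I-' + label
            tags[end] = 'E-' + label
    return tags

print(spans_to_iobes(['Barack', 'Obama', 'visited', 'Paris'],
                     [(0, 1, 'PER'), (3, 3, 'LOC')]))
# ['B-PER', 'E-PER', 'O', 'S-LOC']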
Example #3
 def load_language_model(cls, model_file):
     model_file = get_resource(model_file)
     state = torch.load(model_file)
     model = RNNLanguageModel(state['n_tokens'], state['is_forward_lm'],
                              state['hidden_size'], state['embedding_size'])
     model.load_state_dict(state['state_dict'], strict=False)
     return model
Example #4
def evaluate(gold_file, pred_file):
    """Evaluate using official CoNLL-X evaluation script (Yuval Krymolowski)

    Args:
      gold_file(str): The gold conllx file
      pred_file(str): The pred conllx file

    Returns:
      tuple: A tuple of (UAS, LAS).
    """
    gold_file = get_resource(gold_file)
    fixed_pred_file = tempfile.NamedTemporaryFile().name
    copy_cols(gold_file, pred_file, fixed_pred_file, keep_comments=False)
    if gold_file.endswith('.conllu'):
        fixed_gold_file = tempfile.NamedTemporaryFile().name
        copy_cols(gold_file, gold_file, fixed_gold_file, keep_comments=False)
        gold_file = fixed_gold_file

    exitcode, out, err = get_exitcode_stdout_stderr(
        f'perl {CONLLX_EVAL} -q -b -g {gold_file} -s {fixed_pred_file}')
    if exitcode:
        raise RuntimeError(
            f'eval.pl exited with error code {exitcode} and error message {err} and output {out}.'
        )
    lines = out.split('\n')[-4:]
    las = int(lines[0].split()[3]) / int(lines[0].split()[5])
    uas = int(lines[1].split()[3]) / int(lines[1].split()[5])
    return uas, las
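
The magic indices 3 and 5 pick the raw counts out of eval.pl's summary lines. A worked example, assuming the usual CoNLL-X summary format (the exact wording may differ across eval.pl versions):

# Assumed shape of one summary line printed by eval.pl:
line = 'Labeled   attachment score: 2792 / 3119 * 100 = 89.52 %'
tokens = line.split()
las = int(tokens[3]) / int(tokens[5])  # 2792 / 3119
print(f'{las:.4f}')  # 0.8952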
Example #5
 def load_config(self, save_dir, filename='config.json', **kwargs):
     save_dir = get_resource(save_dir)
     self.config.load_json(os.path.join(save_dir, filename))
     self.config.update(kwargs)  # overwrite config loaded from disk
     for k, v in self.config.items():
         if isinstance(v, dict) and 'classpath' in v:
             self.config[k] = Configurable.from_config(v)
     self.on_config_ready(**self.config)
Example #6
def make_gold_conll(ontonotes_path, language):
    ontonotes_path = os.path.abspath(get_resource(ontonotes_path))
    to_conll = get_resource(
        'https://gist.githubusercontent.com/hankcs/46b9137016c769e4b6137104daf43a92/raw/66369de6c24b5ec47696ae307591f0d72c6f3f02/ontonotes_to_conll.sh'
    )
    to_conll = os.path.abspath(to_conll)
    # shutil.rmtree(os.path.join(ontonotes_path, 'conll-2012'), ignore_errors=True)
    with pushd(ontonotes_path):
        try:
            flash(
                f'Converting [blue]{language}[/blue] to CoNLL format, '
                f'this might take half an hour [blink][yellow]...[/yellow][/blink]'
            )
            run_cmd(f'bash {to_conll} {ontonotes_path} {language}')
            flash('')
        except RuntimeError as e:
            flash(
                f'[red]Failed[/red] to convert {language} of {ontonotes_path} to CoNLL. See the exception for details'
            )
            raise e
Example #7
 def __init__(self, field: str, path: str, trainable=False) -> None:
     super().__init__()
     self.field = field
     path = get_resource(path)
     f = os.path.join(path, 'forward.pt')
     b = os.path.join(path, 'backward.pt')
     self.f: RNNLanguageModel = RNNLanguageModel.load_language_model(f)
     self.b: RNNLanguageModel = RNNLanguageModel.load_language_model(b)
     if not trainable:
         for p in self.parameters():
             p.requires_grad_(False)
Example #8
 def load_data(self, data, generate_idx=False):
     if self.should_load_file(data):
         if isinstance(data, str):
             data = get_resource(data)
         data = list(self.load_file(data))
     if generate_idx:
         for i, each in enumerate(data):
             each[IDX] = i
     # elif isinstance(data, list):
     #     data = self.load_list(data)
     return data
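
The `generate_idx` branch simply stamps each sample with its original position so predictions can be restored to input order later. A toy run, with `IDX` standing in for the library's sentinel key (assumed here to be a plain string constant):

IDX = 'idx'  # stand-in for the library's sentinel constant
data = [{'token': ['a']}, {'token': ['b']}]
for i, each in enumerate(data):
    each[IDX] = i
print(data)  # [{'token': ['a'], 'idx': 0}, {'token': ['b'], 'idx': 1}]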
Example #9
 def __init__(self, filepath: str, src, dst=None, **kwargs) -> None:
     if not dst:
         dst = src + '_fasttext'
     self.filepath = filepath
     flash(f'Loading fasttext model {filepath} [blink][yellow]...[/yellow][/blink]')
     filepath = get_resource(filepath)
     with stdout_redirected(to=os.devnull, stdout=sys.stderr):
         self._model = fasttext.load_model(filepath)
     flash('')
     output_dim = self._model['king'].size
     super().__init__(output_dim, src, dst)
Example #10
def smatch_eval(pred, gold, use_fast=False) -> Union[SmatchScores, F1_]:
    script = get_resource(_FAST_SMATCH_SCRIPT if use_fast else _SMATCH_SCRIPT)
    home = os.path.dirname(script)
    pred = os.path.realpath(pred)
    gold = os.path.realpath(gold)
    with pushd(home):
        flash('Running evaluation script [blink][yellow]...[/yellow][/blink]')
        cmd = f'bash {script} {pred} {gold}'
        text = run_cmd(cmd)
        flash('')
    return format_fast_scores(text) if use_fast else format_official_scores(
        text)
Example #11
 def __init__(self,
              mapper: Union[str, dict],
              src: str,
              dst: str = None) -> None:
     super().__init__(src, dst)
     self.mapper = mapper
     if isinstance(mapper, str):
         mapper = get_resource(mapper)
     if isinstance(mapper, str):
         self._table = load_json(mapper)
     elif isinstance(mapper, dict):
         self._table = mapper
     else:
         raise ValueError(f'Unrecognized mapper type {mapper}')
Example #12
def read_conll(filepath: Union[str, TimingFileIterator],
               underline_to_none=False,
               enhanced_collapse_empty_nodes=False):
    sent = []
    if isinstance(filepath, str):
        filepath: str = get_resource(filepath)
        if filepath.endswith(
                '.conllu') and enhanced_collapse_empty_nodes is None:
            enhanced_collapse_empty_nodes = True
        src = open(filepath, encoding='utf-8')
    else:
        src = filepath
    for idx, line in enumerate(src):
        if line.startswith('#'):
            continue
        line = line.strip()
        cells = line.split('\t')
        if line and cells:
            if enhanced_collapse_empty_nodes and '.' in cells[0]:
                cells[0] = float(cells[0])
                cells[6] = None
            else:
                if '-' in cells[0] or '.' in cells[0]:
                    # sent[-1][1] += cells[1]
                    continue
                cells[0] = int(cells[0])
                if cells[6] != '_':
                    try:
                        cells[6] = int(cells[6])
                    except ValueError:
                        cells[6] = 0
                        logger.exception(
                            f'Wrong CoNLL format {filepath}:{idx + 1}\n{line}')
            if underline_to_none:
                for i, x in enumerate(cells):
                    if x == '_':
                        cells[i] = None
            sent.append(cells)
        else:
            if enhanced_collapse_empty_nodes:
                sent = collapse_enhanced_empty_nodes(sent)
            yield sent
            sent = []

    if sent:
        if enhanced_collapse_empty_nodes:
            sent = collapse_enhanced_empty_nodes(sent)
        yield sent

    src.close()
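
For orientation, here is what the yielded rows look like on a two-token sentence. The sketch below strips read_conll down to its happy path (no comments, multiword tokens, or empty nodes) and feeds it an in-memory file:

import io

# Toy CoNLL-X input: ID FORM LEMMA CPOS POS FEATS HEAD DEPREL PHEAD PDEPREL
src = io.StringIO('1\tHello\t_\t_\tUH\t_\t2\tdiscourse\t_\t_\n'
                  '2\tworld\t_\t_\tNN\t_\t0\troot\t_\t_\n'
                  '\n')
sent = []
for line in src:
    line = line.strip()
    if not line:
        print(sent)
        sent = []
        continue
    cells = line.split('\t')
    cells[0], cells[6] = int(cells[0]), int(cells[6])  # ID and HEAD become ints
    sent.append(cells)
# [[1, 'Hello', '_', '_', 'UH', '_', 2, 'discourse', '_', '_'],
#  [2, 'world', '_', '_', 'NN', '_', 0, 'root', '_', '_']]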
Example #13
def make_ontonotes_language_jsonlines(conll12_ontonotes_path,
                                      output_path=None,
                                      language='english'):
    conll12_ontonotes_path = get_resource(conll12_ontonotes_path)
    if output_path is None:
        output_path = os.path.dirname(conll12_ontonotes_path)
    for split in ['train', 'development', 'test']:
        pattern = f'{conll12_ontonotes_path}/data/{split}/data/{language}/annotations/*/*/*/*gold_conll'
        files = sorted(glob.glob(pattern, recursive=True))
        assert files, f'No gold_conll files found in {pattern}'
        version = os.path.basename(files[0]).split('.')[-1].split('_')[0]
        if version.startswith('v'):
            assert all([version in os.path.basename(f) for f in files])
        else:
            version = 'v5'
        lang_dir = f'{output_path}/{language}'
        if split == 'conll-2012-test':
            split = 'test'
        full_file = f'{lang_dir}/{split}.{language}.{version}_gold_conll'
        os.makedirs(lang_dir, exist_ok=True)
        print(f'Merging {len(files)} files to {full_file}')
        merge_files(files, full_file)
        v5_json_file = full_file.replace(f'.{version}_gold_conll',
                                         f'.{version}.jsonlines')
        print(f'Converting CoNLL file {full_file} to json file {v5_json_file}')
        labels, stats = convert_to_jsonlines(full_file, v5_json_file, language)
        print('Labels:')
        pprint(labels)
        print('Statistics:')
        pprint(stats)
        conll12_json_file = f'{lang_dir}/{split}.{language}.conll12.jsonlines'
        print(
            f'Applying CoNLL 12 official splits on {v5_json_file} to {conll12_json_file}'
        )
        id_file = get_resource(f'http://conll.cemantix.org/2012/download/ids/'
                               f'{language}/coref/{split}.id')
        filter_data(v5_json_file, id_file, conll12_json_file)
Example #14
 def load(self, save_dir: str, devices=None, **kwargs):
     save_dir = get_resource(save_dir)
     # flash('Loading config and vocabs [blink][yellow]...[/yellow][/blink]')
     if devices is None and self.model:
         devices = self.devices
     self.load_config(save_dir, **kwargs)
     self.load_vocabs(save_dir)
     flash('Building model [blink][yellow]...[/yellow][/blink]')
     self.model = self.build_model(**merge_dict(self.config,
                                                training=False,
                                                **kwargs,
                                                overwrite=True,
                                                inplace=True))
     flash('')
     self.load_weights(save_dir, **kwargs)
     self.to(devices)
     self.model.eval()
Example #15
 def load_file(self, filepath):
     filepath = get_resource(filepath)
     # idx = 0
     for words, tags in generate_words_tags_from_tsv(filepath, lower=False):
         # idx += 1
         # if idx % 1000 == 0:
         #     print(f'\rRead instances {idx // 1000}k', end='')
         if self.max_seq_len:
             start = 0
             for short_sents in split_long_sentence_into(
                     words,
                     self.max_seq_len,
                     self.sent_delimiter,
                     char_level=self.char_level,
                     hard_constraint=self.hard_constraint):
                 end = start + len(short_sents)
                 yield {'token': short_sents, 'tag': tags[start:end]}
                 start = end
         else:
             yield {'token': words, 'tag': tags}
Example #16
    def __init__(self,
                 data: str,
                 batch_size,
                 seq_len,
                 tokenizer='char',
                 eos='\n',
                 strip=True,
                 vocab=None,
                 cache=False,
                 transform: Union[Callable, List] = None) -> None:
        self.cache = cache
        self.eos = eos
        self.strip = strip
        super().__init__(transform)
        if isinstance(tokenizer, str):
            available_tokenizers = {
                'char': ToChar('text', 'token'),
                'whitespace': WhitespaceTokenizer('text', 'token')
            }
            assert tokenizer in available_tokenizers, f'{tokenizer} not supported, available options: {available_tokenizers.keys()}'
            self.append_transform(available_tokenizers[tokenizer])

        if vocab is None:
            vocab = Vocab()
            self.training = True
        else:
            self.training = vocab.mutable
        self.append_transform(AppendEOS('token', eos=eos))
        self.append_transform(FieldToIndex('token', vocab))
        self.batch_size = batch_size
        data = get_resource(data)
        self.data = data
        self.num_tokens = None
        self.load_file(data)
        self._fp = None
        if isinstance(seq_len, int):
            self.seq_len = lambda: seq_len
        else:
            self.seq_len = seq_len
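
Note how `seq_len` is normalized to a zero-argument callable, so fixed and dynamic (e.g., randomized BPTT) lengths share one code path downstream. A tiny illustration of the same idea (names here are illustrative):

import random

def normalize_seq_len(seq_len):
    # Wrap a fixed int so callers can always invoke seq_len() uniformly.
    return (lambda: seq_len) if isinstance(seq_len, int) else seq_len

fixed = normalize_seq_len(70)
dynamic = normalize_seq_len(lambda: random.randint(60, 80))
print(fixed(), dynamic())  # e.g. 70 73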
Example #17
def official_conll_05_evaluate(pred_path, gold_path):
    script_root = get_resource(
        'http://www.lsi.upc.edu/~srlconll/srlconll-1.1.tgz')
    lib_path = f'{script_root}/lib'
    if lib_path not in os.environ.get("PERL5LIB", ""):
        os.environ['PERL5LIB'] = f'{lib_path}:{os.environ.get("PERL5LIB", "")}'
    bin_path = f'{script_root}/bin'
    if bin_path not in os.environ.get('PATH', ''):
        os.environ['PATH'] = f'{bin_path}:{os.environ.get("PATH", "")}'
    eval_info_gold_pred = run_cmd(
        f'perl {script_root}/bin/srl-eval.pl {gold_path} {pred_path}')
    eval_info_pred_gold = run_cmd(
        f'perl {script_root}/bin/srl-eval.pl {pred_path} {gold_path}')
    conll_recall = float(
        eval_info_gold_pred.strip().split("\n")[6].strip().split()[5]) / 100
    conll_precision = float(
        eval_info_pred_gold.strip().split("\n")[6].strip().split()[5]) / 100
    if conll_recall + conll_precision > 0:
        conll_f1 = 2 * conll_recall * conll_precision / (conll_recall +
                                                         conll_precision)
    else:
        conll_f1 = 0
    return conll_precision, conll_recall, conll_f1
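
The final score is the standard F1, i.e. the harmonic mean of precision and recall; a quick check with made-up numbers:

p, r = 0.90, 0.80
f1 = 2 * p * r / (p + r) if p + r > 0 else 0
print(f'{f1:.4f}')  # 0.8471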
Example #18
 def __init__(self,
              trn: str = None,
              dev: str = None,
              tst: str = None,
              sampler_builder: SamplerBuilder = None,
              dependencies: str = None,
              scalar_mix: ScalarMixWithDropoutBuilder = None,
              use_raw_hidden_states=False,
              lr=1e-3,
              separate_optimizer=False,
              cls_is_bos=True,
              sep_is_eos=False,
              char2concept_dim=128,
              cnn_filters=((3, 256), ),
              concept_char_dim=32,
              concept_dim=300,
              dropout=0.2,
              embed_dim=512,
              eval_every=20,
              ff_embed_dim=1024,
              graph_layers=2,
              inference_layers=4,
              num_heads=8,
              rel_dim=100,
              snt_layers=4,
              unk_rate=0.33,
              vocab_min_freq=5,
              beam_size=8,
              alpha=0.6,
              max_time_step=100,
              amr_version='2.0',
              **kwargs) -> None:
     super().__init__(**merge_locals_kwargs(locals(), kwargs))
     self.vocabs = VocabDict()
     utils_dir = get_resource(get_amr_utils(amr_version))
     self.sense_restore = NodeRestore(NodeUtilities.from_json(utils_dir))
Example #19
 def on_config_ready(self, **kwargs):
     super().on_config_ready(**kwargs)
     utils_dir = get_resource(get_amr_utils(self.config.amr_version))
     self.sense_restore = NodeRestore(NodeUtilities.from_json(utils_dir))
Example #20
# -*- coding:utf-8 -*-
# Author: hankcs
import tempfile

from elit.components.parsers.conll import read_conll
from elit.utils.io_util import get_resource, get_exitcode_stdout_stderr

CONLLX_EVAL = get_resource(
    'https://github.com/elikip/bist-parser/archive/master.zip' +
    '#bmstparser/src/utils/eval.pl')


def evaluate(gold_file, pred_file):
    """Evaluate using official CoNLL-X evaluation script (Yuval Krymolowski)

    Args:
      gold_file(str): The gold conllx file
      pred_file(str): The pred conllx file

    Returns:
      tuple: A tuple of (UAS, LAS).
    """
    gold_file = get_resource(gold_file)
Example #21
 def load_weights(self, save_dir, filename='model.pt', **kwargs):
     save_dir = get_resource(save_dir)
     filename = os.path.join(save_dir, filename)
     # flash(f'Loading model: {filename} [blink]...[/blink][/yellow]')
     self.model_.load_state_dict(torch.load(filename, map_location='cpu'),
                                 strict=False)
Example #22
    def evaluate(self,
                 tst_data,
                 save_dir=None,
                 logger: logging.Logger = None,
                 batch_size=None,
                 output=False,
                 **kwargs):
        if not self.model:
            raise RuntimeError('Call fit or load before evaluate.')
        if isinstance(tst_data, str):
            tst_data = get_resource(tst_data)
            filename = os.path.basename(tst_data)
        else:
            filename = None
        if output is True:
            output = self.generate_prediction_filename(
                tst_data if isinstance(tst_data, str) else 'test.txt',
                save_dir)
        if logger is None:
            _logger_name = basename_no_ext(filename) if filename else None
            logger = self.build_logger(_logger_name, save_dir)
        if not batch_size:
            batch_size = self.config.get('batch_size', 32)
        data = self.build_dataloader(**merge_dict(self.config,
                                                  data=tst_data,
                                                  batch_size=batch_size,
                                                  shuffle=False,
                                                  device=self.devices[0],
                                                  logger=logger,
                                                  overwrite=True))
        dataset = data
        while dataset and hasattr(dataset, 'dataset'):
            dataset = dataset.dataset
        num_samples = len(dataset) if dataset else None
        if output and isinstance(dataset, TransformDataset):

            def add_idx(samples):
                for idx, sample in enumerate(samples):
                    if sample:
                        sample[IDX] = idx

            add_idx(dataset.data)
            if dataset.cache:
                add_idx(dataset.cache)

        criterion = self.build_criterion(**self.config)
        metric = self.build_metric(**self.config)
        start = time.time()
        outputs = self.evaluate_dataloader(data,
                                           criterion=criterion,
                                           filename=filename,
                                           output=output,
                                           input=tst_data,
                                           save_dir=save_dir,
                                           test=True,
                                           num_samples=num_samples,
                                           **merge_dict(self.config,
                                                        batch_size=batch_size,
                                                        metric=metric,
                                                        logger=logger,
                                                        **kwargs))
        elapsed = time.time() - start
        if logger:
            if num_samples:
                logger.info(
                    f'speed: {num_samples / elapsed:.0f} samples/second')
            else:
                logger.info(f'speed: {len(data) / elapsed:.0f} batches/second')
        return metric, outputs
Example #23
 def transform(self, **kwargs) -> Callable:
     vocab = Vocab()
     vocab.load(os.path.join(get_resource(self.path), 'vocab.json'))
     return TransformList(ContextualStringEmbeddingTransform(self.field),
                          FieldToIndex(f'{self.field}_f_char', vocab),
                          FieldToIndex(f'{self.field}_b_char', vocab))
Example #24
 def load_vocab(self, save_dir, filename='vocab.json'):
     save_dir = get_resource(save_dir)
     vocab = SerializableDict()
     vocab.load_json(os.path.join(save_dir, filename))
     self.vocab.copy_from(vocab)
Example #25
 def load_vocabs(self, save_dir, filename='vocabs.json', vocab_cls=Vocab):
     save_dir = get_resource(save_dir)
     vocabs = SerializableDict()
     vocabs.load_json(os.path.join(save_dir, filename))
     self._load_vocabs(self, vocabs, vocab_cls)
Example #26
def load_from_meta_file(save_dir: str,
                        meta_filename='meta.json',
                        transform_only=False,
                        load_kwargs=None,
                        **kwargs) -> Component:
    """

    Args:
        save_dir:
        meta_filename (str): The meta file of that saved component, which stores the classpath and version.
        transform_only:
        **kwargs:

    Returns:

    """
    identifier = save_dir
    load_path = save_dir
    save_dir = get_resource(save_dir)
    if save_dir.endswith('.json'):
        meta_filename = os.path.basename(save_dir)
        save_dir = os.path.dirname(save_dir)
    metapath = os.path.join(save_dir, meta_filename)
    if not os.path.isfile(metapath):
        metapath = os.path.join(save_dir, 'config.json')
    if not os.path.isfile(metapath):
        tips = ''
        if save_dir.isupper():
            from difflib import SequenceMatcher
            similar_keys = sorted(pretrained.ALL.keys(),
                                  key=lambda k: SequenceMatcher(
                                      None, save_dir, k).ratio(),
                                  reverse=True)[:5]
            tips = f'Check its spelling based on the available keys:\n' + \
                   f'{sorted(pretrained.ALL.keys())}\n' + \
                   f'Tips: it might be one of {similar_keys}'
        raise FileNotFoundError(
            f'The identifier {save_dir} resolves to a nonexistent meta file {metapath}. {tips}'
        )
    meta: dict = load_json(metapath)
    cls = meta.get('classpath', None)
    if not cls:
        cls = meta.get('class_path', None)  # For older version
    assert cls, f'{meta_filename} doesn\'t contain classpath field'
    try:
        obj: Component = object_from_classpath(cls)
        if hasattr(obj, 'load'):
            if transform_only:
                # noinspection PyUnresolvedReferences
                obj.load_transform(save_dir)
            else:
                if load_kwargs is None:
                    load_kwargs = {}
                if os.path.isfile(os.path.join(save_dir, 'config.json')):
                    obj.load(save_dir, **kwargs)
                else:
                    obj.load(metapath, **kwargs)
            obj.config['load_path'] = load_path
        return obj
    except Exception as e:
        eprint(f'Failed to load {identifier}. See traceback below:')
        eprint(f'{"ERROR LOG BEGINS":=^80}')
        traceback.print_exc()
        eprint(f'{"ERROR LOG ENDS":=^80}')
        from pkg_resources import parse_version
        model_version = meta.get("elit_version", "unknown")
        if model_version == '2.0.0':  # Quick fix: the first version used a wrong string
            model_version = '2.0.0-alpha.0'
        model_version = parse_version(model_version)
        installed_version = parse_version(version.__version__)
        try:
            latest_version = get_latest_info_from_pypi()
        except:
            latest_version = None
        if model_version > installed_version:
            eprint(
                f'{identifier} was created with elit-{model_version}, '
                f'while you are running a lower version: {installed_version}. '
            )
        if installed_version != latest_version:
            eprint(f'Please upgrade elit with:\n'
                   f'pip install --upgrade elit\n')
        eprint(
            'If the problem persists, please submit an issue to https://github.com/emorynlp/elit/issues\n'
            'When reporting an issue, make sure to paste the FULL ERROR LOG above.'
        )
        exit(1)
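
A hedged usage sketch: per the code above, the identifier may be a local save directory, a direct path to its meta/config JSON, or (via the isupper() branch) an uppercase key registered in elit.pretrained. The path below is illustrative:

# Hypothetical call; the returned object exposes whatever API its classpath defines.
component = load_from_meta_file('/path/to/saved_component')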
Example #27
# -*- coding:utf-8 -*-
# Author: hankcs

import os
import tempfile
from typing import List

from elit.metrics.parsing.conllx_eval import copy_cols

from elit.common.structure import SerializableDict
from elit.metrics.parsing import iwpt20_xud_eval
from elit.metrics.parsing.iwpt20_xud_eval import load_conllu_file
from elit.utils.io_util import get_resource, get_exitcode_stdout_stderr

UD_TOOLS_ROOT = get_resource(
    'https://github.com/UniversalDependencies/tools/archive/1650bd354bd158c75836cff6650ea35cc9928fc8.zip'
)

ENHANCED_COLLAPSE_EMPTY_NODES = os.path.join(
    UD_TOOLS_ROOT, 'enhanced_collapse_empty_nodes.pl')
CONLLU_QUICK_FIX = os.path.join(UD_TOOLS_ROOT, 'conllu-quick-fix.pl')


def run_perl(script, src, dst=None):
    if not dst:
        dst = tempfile.NamedTemporaryFile().name
    exitcode, out, err = get_exitcode_stdout_stderr(
        f'perl -I{os.path.expanduser("~/.local/lib/perl5")} {script} {src}')
    if exitcode:
        # cpanm -l ~/.local namespace::autoclean
        # cpanm -l ~/.local Moose
Example #28
    def __init__(self):
        """
        :class:`EnglishTokenizer` splits the input text into linguistic tokens.
        """
        super(EnglishTokenizer, self).__init__()

        # _inflection_lexicons
        resource_root = get_resource(ELIT_URL + 'tokenizer.zip')

        self.ABBREVIATION_PERIOD = read_word_set(
            os.path.join(resource_root, 'english_abbreviation_period.txt'))
        self.APOSTROPHE_FRONT = read_word_set(
            os.path.join(resource_root, 'english_apostrophe_front.txt'))
        self.MAP_CONCAT_WORD = read_concat_word_dict(
            os.path.join(resource_root, 'english_concat_words.txt'))
        self.HYPHEN_PREFIX = read_word_set(
            os.path.join(resource_root, 'english_hyphen_prefix.txt'))
        self.HYPHEN_SUFFIX = read_word_set(
            os.path.join(resource_root, 'english_hyphen_suffix.txt'))

        # regular expressions
        self.RE_NETWORK_PROTOCOL = re.compile(
            r'((http|https|ftp|sftp|ssh|ssl|telnet|smtp|pop3|imap|imap4|sip)(://))'
        )
        """
        :abc:
        <3 </3 <\3
        (: ): \\: *: $: (-: (^: (= (;
        :) :( =) B) 8) :-) :^) :3 :D :p :| :(( :---)
        """
        self.RE_EMOTICON = re.compile(
            r'(:\w+:|<[\\/]?3|[()\\|*$][-^]?[:=;]|[:=;B8]([-^]+)?[3DOPp@$*()\\/|]+)(\W|$)'
        )
        """
        [email protected]
        [email protected]
        [email protected]
        jinho:[email protected]
        """
        self.RE_EMAIL = re.compile(
            r'[\w\-.]+(:\S+)?@(([A-Za-z0-9\-]+\.)+[A-Za-z]{2,12}|\d{1,3}(\.\d{1,3}){3})'
        )
        """
        &arrow;
        &#123; &#x123; &#X123;
        """
        self.RE_HTML_ENTITY = re.compile(r'&([A-Za-z]+|#[Xx]?\d+);')
        """
        [1] (1a) {A} <a1> [***] [A.a] [A.1] [1.a] ((---))
        """
        self.RE_LIST_ITEM = re.compile(
            r'(([\[({<]+)(\d+[A-Za-z]?|[A-Za-z]\d*|\W+)(\.(\d+|[A-Za-z]))*([\])\}>])+)'
        )
        """
        don't don’t I'll HE'S
        """
        self.RE_APOSTROPHE = re.compile(
            r'(?i)[a-z](n[\'\u2019]t|[\'\u2019](ll|nt|re|ve|[dmstz]))(\W|$)')
        """
        a.b.c 1-2-3
        """
        self.RE_ABBREVIATION = re.compile(r'[A-Za-z0-9]([.-][A-Za-z0-9])*$')
        """
        10kg 1cm
        """
        self.RE_UNIT = re.compile(
            r'(?i)(\d)([acdfkmnpyz]?[mg]|[ap]\.m|ch|cwt|d|drc|ft|fur|gr|h|in|lb|lea|mi|ms|oz|pg|qtr|yd)$'
        )
        """
        hello.World
        """
        self.RE_FINAL_MARK_IN_BETWEEN = re.compile(
            r'([A-Za-z]{3,})([.?!]+)([A-Za-z]{3,})$')
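
A quick, self-contained check of one of the patterns quoted above (the unit pattern, copied verbatim), showing how the digit and the unit land in separate groups:

import re

RE_UNIT = re.compile(
    r'(?i)(\d)([acdfkmnpyz]?[mg]|[ap]\.m|ch|cwt|d|drc|ft|fur|gr|h|in|lb|lea|mi|ms|oz|pg|qtr|yd)$')
print(RE_UNIT.search('10kg').groups())  # ('0', 'kg')
print(RE_UNIT.search('1cm').groups())   # ('1', 'cm')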
Example #29
def make_ner_tsv_if_necessary(json_file):
    json_file = get_resource(json_file)
    output_file = os.path.splitext(json_file)[0] + '.ner.tsv'
    if not os.path.isfile(output_file):
        convert_jsonlines_to_IOBES(json_file, output_file)
    return output_file