def load_tagger(model_path):
    """Create a ``Tagger`` and open a CRFSUITE binary model file in it.

    :param str model_path: path to the binary model file.
    :return: the tagger instance after ``open(model_path)`` was called on it.

    """
    crf_tagger = Tagger()
    crf_tagger.open(model_path)
    return crf_tagger
Example #2
0
    def _load_tagger(self):
        """Open the stored CRF model for this task and keep it on ``self.tagger``.

        The model file name is derived from ``self.task_obj.unique_id`` and is
        looked up under ``MODELS_DIR/<task_type>/``. ``self.model_name`` is
        set as a side effect.

        :return: the opened tagger (also stored on ``self.tagger``).
        :raises Exception: whatever ``Tagger()`` / ``Tagger.open`` raised; the
            failure is logged to the error logger before being re-raised.
        """
        # In pycrfsuite, you have to save the model first, then load it as a tagger
        self.model_name = 'model_{}'.format(self.task_obj.unique_id)
        file_path = os.path.join(MODELS_DIR, self.task_type, self.model_name)
        try:
            tagger = Tagger()
            tagger.open(file_path)
        except Exception as e:
            print(e)
            logging.getLogger(ERROR_LOGGER).error('Failed to load crf model from the filesystem.', exc_info=True, extra={
                'model_name': self.model_name,
                'file_path':  file_path})
            # Propagate the real failure. Falling through would reference the
            # unbound local `tagger` and mask the cause with UnboundLocalError.
            raise

        self.tagger = tagger
        return self.tagger
Example #3
0
def gen(corpus=test, model='m.model', indir=INDIR, outdir=''):
    """Tag every document of `corpus` with a CRF model and write one XML file per document.

    For each document an XML tree rooted at TASK_ROOT is built with two
    children: a TAGS_ROOT element holding one element per non-'None'
    predicted label, and a TOKENS_ROOT element holding the sentences with
    their tokens. The tree is serialised and written after HEADER.

    NOTE(review): this block uses the Python 2 ``print>>f`` statement form.

    :param corpus: object exposing a ``documents`` iterable; each document
        must provide ``filepath``, ``sentences`` and ``sequence_list()``.
    :param model: path of the CRFSuite model file to open.
    :param indir: passed to ``setup_newdir`` as ``olddir``.
    :param outdir: passed to ``setup_newdir`` as ``newdir``.
    """
    tagger = Tagger()
    tagger.open(model)
    for doc in corpus.documents:
        # presumably maps the input path into the output tree — confirm
        # against setup_newdir; a falsy result skips the document.
        path = setup_newdir(doc.filepath, olddir=indir, newdir=outdir,
                            suffix='--', renew=True)
        if not path:
            continue
        mkparentdirs(path)
        task = etree.Element(TASK_ROOT)
        tags = etree.Element(TAGS_ROOT)
        tokens = etree.Element(TOKENS_ROOT)
        task.append(tags)
        task.append(tokens)
        sents = doc.sentences
        seqs = doc.sequence_list()
        tagged_seqs = [tagger.tag(seq) for seq in seqs]
        # per-document counter of emitted tags per label, used to build ids
        freq_dict = defaultdict(int)
        for (sent, seq, tagged_seq) in zip(sents, seqs, tagged_seqs):
            s = etree.Element('s')
            for (lex, feat, label) in zip(sent.getchildren(), seq, tagged_seq):
                    # copy the token element (tag, attributes, text) into the new sentence
                    lex_tag = etree.Element(lex.tag, lex.attrib)
                    lex_tag.text = lex.text
                    s.append(lex_tag)
                    # the string 'None' is the label that produces no tag element
                    if label != 'None':
                        iso_tag = etree.Element(label)
                        # `attribs` and `ids` are defined outside this block;
                        # attribs[label] supplies default attributes per label.
                        if label in attribs:
                            for key in attribs[label]:
                                iso_tag.attrib[key] = attribs[label][key]
                        iso_tag.attrib['text'] = lex.text
                        # id = label-specific prefix + running count for that label
                        iso_tag.attrib['id'] = ids[label] + str(freq_dict[label])
                        # the token carries the same id as the tag created for it
                        lex_tag.attrib['id'] = iso_tag.attrib['id']
                        freq_dict[label] += 1
                        tags.append(iso_tag)
            tokens.append(s)
        # `s` is reused here for the serialised document
        s = etree.tostring(task, pretty_print=True)
        with open(path, 'w') as f:
            print>>f, HEADER
            print>>f, s
Example #4
0
def test_open_close_labels(model_filename, yseq):
    """labels() raises ValueError outside the ``with tagger.open(...)`` block.

    Inside the block the label set must equal the labels in `yseq`.
    """
    crf = Tagger()

    # nothing has been opened yet, so labels() is expected to fail
    with pytest.raises(ValueError):
        crf.labels()

    with crf.open(model_filename):
        seen_labels = crf.labels()
    assert set(seen_labels) == set(yseq)

    # after leaving the with-block labels() is expected to fail again
    with pytest.raises(ValueError):
        crf.labels()
Example #5
0
class PassageTagger(object):
  """Sequence tagger for passages, backed by CRFSuite or a structured SVM.

  With ``algorithm == "crf"`` a ``Trainer`` (training) or ``Tagger``
  (prediction) is created. Any other value selects ``FrankWolfeSSVM`` over a
  ``ChainCRF`` model, whose fitted state and feature/label indices are
  persisted with pickle.
  """

  def __init__(self, do_train=False, trained_model_name="passage_crf_model", algorithm="crf"):
    """Set up the trainer or tagger for the chosen algorithm.

    :param do_train: build a trainer when True, otherwise load/create a tagger.
    :param trained_model_name: path the model is written to / read from.
    :param algorithm: "crf" for CRFSuite; any other value selects the SSVM path.
    """
    self.trained_model_name = trained_model_name
    self.fp = FeatureProcessing()
    self.do_train = do_train
    self.algorithm = algorithm
    if algorithm == "crf":
      if do_train:
        self.trainer = Trainer()
      else:
        self.tagger = Tagger()
    else:
      if do_train:
        model = ChainCRF()
        self.trainer = FrankWolfeSSVM(model=model)
        self.feat_index = {}
        self.label_index = {}
      else:
        # NOTE(review): pickle.load can execute arbitrary code; only load
        # model/index files that come from a trusted source.
        with open(self.trained_model_name, "rb") as model_file:
          self.tagger = pickle.load(model_file)
        with open("ssvm_feat_index.pkl", "rb") as feat_file:
          self.feat_index = pickle.load(feat_file)
        with open("ssvm_label_index.pkl", "rb") as label_file:
          label_index = pickle.load(label_file)
        # prediction yields label indices; map them back to label strings
        self.rev_label_index = {i: x for x, i in label_index.items()}

  def read_input(self, filename):
    """Read blank-line separated sequences of clauses from a UTF-8 file.

    In training mode every non-empty line must be ``clause<TAB>label``;
    otherwise the whole stripped line is the clause. Each clause becomes a
    dict mapping feature -> occurrence count.

    :param filename: path of the input file.
    :return: ``(str_seqs, feat_seqs, label_seqs)``, three parallel lists with
        one entry per sequence (label lists are empty when not training).
    """
    str_seqs = []
    str_seq = []
    feat_seqs = []
    feat_seq = []
    label_seqs = []
    label_seq = []
    with codecs.open(filename, "r", "utf-8") as infile:
      for line in infile:
        lnstrp = line.strip()
        if lnstrp == "":
          # a blank line closes the current sequence, if one was started
          if len(str_seq) != 0:
            str_seqs.append(str_seq)
            str_seq = []
            feat_seqs.append(feat_seq)
            feat_seq = []
            label_seqs.append(label_seq)
            label_seq = []
        else:
          if self.do_train:
            clause, label = lnstrp.split("\t")
            label_seq.append(label)
          else:
            clause = lnstrp
          str_seq.append(clause)
          feat_dict = {}
          for f in self.fp.get_features(clause):
            feat_dict[f] = feat_dict.get(f, 0) + 1
          feat_seq.append(feat_dict)
    # flush the last sequence when the file does not end with a blank line
    if len(str_seq) != 0:
      str_seqs.append(str_seq)
      feat_seqs.append(feat_seq)
      label_seqs.append(label_seq)
    return str_seqs, feat_seqs, label_seqs

  def _vectorize(self, feat_seqs):
    """Turn sequences of feature dicts into dense arrays using ``self.feat_index``.

    Features missing from the index are skipped.
    """
    Xs = []
    for fs in feat_seqs:
      X = []
      for feat_dict in fs:
        x = [0] * len(self.feat_index)
        for f in feat_dict:
          if f in self.feat_index:
            x[self.feat_index[f]] = feat_dict[f]
        X.append(x)
      Xs.append(numpy.asarray(X))
    return Xs

  def predict(self, feat_seqs):
    """Tag every feature sequence and return one label list per sequence.

    :param feat_seqs: list of sequences of feature dicts (see ``read_input``).
    :return: list of predicted label sequences.
    """
    # sys.stderr.write behaves the same on Python 2 and 3, unlike `print >>`
    sys.stderr.write("Tagging %d sequences\n" % len(feat_seqs))
    if self.algorithm == "crf":
      self.tagger.open(self.trained_model_name)
      preds = [self.tagger.tag(ItemSequence(feat_seq)) for feat_seq in feat_seqs]
    else:
      pred_ind_seqs = self.tagger.predict(self._vectorize(feat_seqs))
      preds = [[self.rev_label_index[pred_ind] for pred_ind in ps]
               for ps in pred_ind_seqs]
    return preds

  def train(self, feat_seqs, label_seqs):
    """Fit the model on the given sequences and persist it.

    The CRF path writes the model to ``self.trained_model_name``. The SSVM
    path additionally pickles the feature and label indices to
    ``ssvm_feat_index.pkl`` / ``ssvm_label_index.pkl``.

    :param feat_seqs: list of sequences of feature dicts.
    :param label_seqs: list of label sequences parallel to ``feat_seqs``.
    """
    sys.stderr.write("Training on %d sequences\n" % len(feat_seqs))
    if self.algorithm == "crf":
      for feat_seq, label_seq in zip(feat_seqs, label_seqs):
        self.trainer.append(ItemSequence(feat_seq), label_seq)
      self.trainer.train(self.trained_model_name)
    else:
      # assign a column index to every feature seen in training
      for fs in feat_seqs:
        for feat_dict in fs:
          for f in feat_dict:
            if f not in self.feat_index:
              self.feat_index[f] = len(self.feat_index)
      Xs = self._vectorize(feat_seqs)

      # assign an integer id to every label seen in training
      for ls in label_seqs:
        for label in ls:
          if label not in self.label_index:
            self.label_index[label] = len(self.label_index)

      Ys = [numpy.asarray([self.label_index[label] for label in ls])
            for ls in label_seqs]

      self.trainer.fit(Xs, Ys)
      with open(self.trained_model_name, "wb") as model_file:
        pickle.dump(self.trainer, model_file)
      with open("ssvm_feat_index.pkl", "wb") as feat_file:
        pickle.dump(self.feat_index, feat_file)
      with open("ssvm_label_index.pkl", "wb") as label_file:
        pickle.dump(self.label_index, label_file)
Example #6
0
def test_open_invalid_with_correct_signature(tmpdir):
    """Opening a file filled with repeated ``lCRFfoo`` bytes must raise ValueError."""
    bogus_model = tmpdir.join('tmp.txt')
    bogus_model.write(b"lCRFfoo"*100)
    crf = Tagger()
    with pytest.raises(ValueError):
        crf.open(str(bogus_model))
Example #7
0
def test_open_invalid_small(tmpdir):
    """Opening a three-byte file (``foo``) must raise ValueError."""
    tiny_file = tmpdir.join('tmp.txt')
    tiny_file.write(b'foo')
    crf = Tagger()
    with pytest.raises(ValueError):
        crf.open(str(tiny_file))
Example #8
0
def test_open_invalid():
    """Opening this source file itself as a model must raise ValueError."""
    crf = Tagger()
    with pytest.raises(ValueError):
        crf.open(__file__)
Example #9
0
def test_open_non_existing():
    """Opening the path ``'foo'`` is expected to raise IOError."""
    crf = Tagger()
    with pytest.raises(IOError):
        crf.open('foo')
Example #10
0
def evaluate_model_by_story(model_name, test_samples):
    """Evaluate a CRF model with predictions and gold phrases pooled per story.

    Every sample is tagged; characters labelled B/I are accumulated and a
    phrase is emitted on each E/S label. Gold and predicted phrases are
    merged into sets per ``sample.story_id``. A predicted phrase counts as a
    precision hit when its best normalised-Levenshtein similarity to any gold
    phrase of the same story reaches the module-level threshold ``sim_t``;
    recall hits are counted symmetrically. Per-story and overall figures are
    appended to CSV files under ``../Archive/date_performance/``.

    :param model_name: path of the CRFSuite model; the third ``_``-separated
        field is reported as the iteration.
    :param test_samples: iterable of samples exposing ``sentence``, ``fps``
        and ``story_id``.
    :return: ``(precision, recall, f1)`` over all stories; a figure is 0.0
        when its denominator is zero.
    """
    model = Tagger()
    model.open(model_name)

    # story_id -> [set of gold phrases, set of predicted phrases]
    story_fps = dict()
    for sample in test_samples:
        model.set(build_model_features(sample, 17, True))
        predicted_labels = model.tag()

        chars = list(sample.sentence)
        predicted_fps = []
        fp = ''
        for index, word in enumerate(predicted_labels):
            if word in ('E', 'S'):
                # E/S closes the phrase currently being accumulated
                fp += chars[index]
                predicted_fps.append(fp)
                fp = ''
            elif word in ('B', 'I'):
                fp += chars[index]

        # keep only gold phrases that literally occur in the sentence
        actual_fps = [fp for fp in sample.fps if fp != '' and fp != 'null' and fp in sample.sentence]

        if sample.story_id not in story_fps:
            story_fps[sample.story_id] = [set(actual_fps), set(predicted_fps)]
        else:
            story_fps[sample.story_id][0].update(actual_fps)
            story_fps[sample.story_id][1].update(predicted_fps)

    global sim_t
    sim_threshold = sim_t

    TP_precision = 0
    TP_recall = 0
    all_actual_fps = 0
    all_predicted_fps = 0
    for story_id, (actual_fps, predicted_fps) in story_fps.items():
        story_precision = 0.0
        story_recall = 0.0

        all_actual_fps += len(actual_fps)
        all_predicted_fps += len(predicted_fps)

        story = samples_dao.read_story_by_story_id(int(story_id))
        data = [story_id,
                story[0] if story is not None else '',
                story[1] if story is not None else '',
                story[2] if story is not None else '',
                story[3] if story is not None else '',
                story[4] if story is not None else '',
                actual_fps,
                predicted_fps]
        # NOTE(review): 'resultsIterRes_by_story_details.csv' may be missing a
        # path separator after 'results' — confirm the intended location.
        with open('../Archive/date_performance/resultsIterRes_by_story_details.csv', 'a', newline='') as csv_file:
            csv_writer = csv.writer(csv_file)
            csv_writer.writerow(data)

        # precision: predicted phrases that are close enough to some gold phrase
        for predicted_fp in predicted_fps:
            best = max((1-distance.nlevenshtein(actual_fp, predicted_fp, method=1)
                        for actual_fp in actual_fps), default=0)
            if best >= sim_threshold:
                TP_precision += 1
                story_precision += 1

        # recall: gold phrases that are close enough to some predicted phrase
        for actual_fp in actual_fps:
            best = max((1-distance.nlevenshtein(actual_fp, predicted_fp, method=1)
                        for predicted_fp in predicted_fps), default=0)
            if best >= sim_threshold:
                TP_recall += 1
                story_recall += 1

        # per-story details; precision is normalised by THIS story's
        # predictions, not by the predictions of the last tagged sample
        story_precision = 0 if len(predicted_fps) == 0 else story_precision/len(predicted_fps)
        story_recall = 0 if len(actual_fps) == 0 else story_recall/len(actual_fps)
        data = ["STORY " + str(story_id), story_precision, story_recall]
        with open('../Archive/date_performance/results/story_details.csv', 'a', newline='') as csv_file:
            csv_writer = csv.writer(csv_file)
            csv_writer.writerow(data)
    with open('../Archive/date_performance/results/story_details.csv', 'a', newline='') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(["THE END!!!"])

    # overall figures, guarded against empty denominators
    precision = TP_precision/all_predicted_fps if all_predicted_fps else 0.0
    recall = TP_recall/all_actual_fps if all_actual_fps else 0.0
    if precision + recall > 0:
        f1 = 2 * precision * recall / (precision + recall)
    else:
        f1 = 0.0

    print("By Story: Iteration: %s\n\tPrecision: %f\n\tRecall: %f\n\tF1: %f\n\n\n"
          % (model_name.split('_')[2], precision, recall, f1))

    data = ["BY STORY: Iteration " + model_name.split('_')[2], precision, recall, f1]

    with open('../Archive/date_performance/results/IterRes_by_story.csv', 'a', newline='') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(data)

    return precision, recall, f1
Example #11
0
def evaluate_model(model_name, test_samples):
    '''
    Evaluate a trained model on the test samples and record the results.

    Each sample is tagged and labels are binarised ('N' -> 0, anything else
    -> 1). Per-sample accuracy and recall are computed with ``metrics`` and
    combined into a per-sample F-score; the three figures are averaged over
    all samples. A row per sample (sentence, gold characters, predicted
    characters) and a summary row are appended to CSV files under
    ``../Archive/date_performance/results/``.

    :param model_name: path of the CRFSuite model; the third ``_``-separated
        field is reported as the iteration.
    :param test_samples: sized collection of samples exposing ``sentence``
        and ``char_label``.
    :return: ``(accuracy, recall, f1)`` averaged over the samples (all 0.0
        for an empty collection).
    '''
    model = Tagger()
    model.open(model_name)

    accuracy = 0.0
    recall = 0.0
    f1 = 0.0
    for sample in test_samples:
        model.set(build_model_features(sample, 17, True))
        predicted_labels = model.tag()
        true_labels = sample.char_label

        predicted_label_index = [0 if label == 'N' else 1 for label in predicted_labels]
        true_label_index = [0 if label == 'N' else 1 for label in true_labels]

        chars = list(sample.sentence)
        predicted_fps = ''.join(chars[index] for index, word in enumerate(predicted_labels)
                                if word != 'N')
        if len(predicted_fps) == 0:
            # placeholder so an empty prediction is visible in the CSV
            predicted_fps = '-----'
        actual_fps = ''.join(chars[index] for index, word in enumerate(true_labels)
                             if word != 'N')

        iteration_test_details = [sample.sentence, actual_fps, predicted_fps]
        with open('../Archive/date_performance/results/Iteration_Test_Details.csv', 'a', newline='') as csv_file:
            csv_writer = csv.writer(csv_file)
            csv_writer.writerow(iteration_test_details)

        sample_accuracy = metrics.accuracy_score(true_label_index, predicted_label_index)
        sample_recall = metrics.recall_score(true_label_index, predicted_label_index,
                                             average='binary', pos_label=1)
        accuracy += sample_accuracy
        recall += sample_recall
        # F-score of THIS sample; using the running sums here would mix every
        # earlier sample into each term and divide by zero when both are 0.
        if sample_accuracy + sample_recall > 0:
            f1 += 2*sample_accuracy*sample_recall/(sample_accuracy+sample_recall)

    # avoid ZeroDivisionError for an empty test set; the sums are 0.0 then
    sample_count = len(test_samples) or 1
    mean_accuracy = accuracy / sample_count
    mean_recall = recall / sample_count
    mean_f1 = f1 / sample_count

    print("Iteration: %s\n\tAccuracy: %f\n\tRecall: %f\n\tF1: %f\n\n\n"
          % (model_name.split('_')[2], mean_accuracy, mean_recall, mean_f1))

    data = ["Iteration " + model_name.split('_')[2], mean_accuracy, mean_recall, mean_f1]

    with open('../Archive/date_performance/results/IterRes.csv', 'a', newline='') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(data)

    with open('../Archive/date_performance/results/Iteration_Test_Details.csv', 'a', newline='') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(data)

    return mean_accuracy, mean_recall, mean_f1
 def __init__(self, tagger=None):
     """Keep the given tagger, or open the default tokenization model.

     A falsy `tagger` is replaced by a new ``Tagger`` opened on
     ``TOKENIZATION_MODEL_PATH``.
     """
     if tagger:
         self.tagger = tagger
         return
     default_tagger = Tagger()
     default_tagger.open(TOKENIZATION_MODEL_PATH)
     self.tagger = default_tagger
Example #13
0
class CRFSTagger:
    """Sequence tagger built on pycrfsuite, driven by a configuration object.

    An instance is created either from a configuration (`cfg`), which also
    loads resources and train/test data, or from a previously dumped model
    file (`mp`), which restores configuration, resources and the marshalled
    feature functions.
    """

    def __init__(self,
                 cfg=None,
                 mp=None,
                 fnx=None,
                 win_fnx=None,
                 cols=None,
                 verbose=False):
        """Creates an instance of CRFSTagger

        `cfg` takes precedence over `mp` when both are given.

        :param cfg: configuration
        :type cfg: ConfigParser.ConfigParser
        :param mp: model path
        :type mp: str
        :param fnx: feature functions passed on to FeatureTemplate
        :param win_fnx: window feature functions passed on to FeatureTemplate
        :param cols: feature template columns passed on to FeatureTemplate
        :param verbose: passed to the CRFSuite Trainer when training
        :type verbose: bool
        :raises RuntimeError: if neither `cfg` nor `mp` is provided
        """

        # configuration
        self.cfg = None

        # feature template
        self.ft_tmpl = None

        # list of resources used by features, e.g. word clusters, embeddings
        self.resources = None

        # data
        self.train_data = None
        self.test_data = None

        # instance of pycrfsuite.Tagger
        self.tagger = None

        self.verbose = verbose

        # attempt to import canonical replacements
        try:
            import canonical
            self.canonical = canonical.REPLACEMENTS
        except ImportError:
            self.canonical = None

        self.fnx = fnx
        self.win_fnx = win_fnx
        self.ft_tmpl_cols = cols

        # load data and resources if configuration is provided
        if cfg:

            self.cfg = cfg
            expandpaths(self.cfg)

            # loading resources (clusters, embeddings, etc.)
            self._load_resources()

            # loading data
            self._load_data()

        # load model
        elif mp:
            # NOTE(review): unpickling can execute arbitrary code; only load
            # model files from a trusted source.
            with open(mp, 'rb') as model_file:
                m = pickle.load(model_file)
            self.cfg = m.cfg
            self.cfg.set('tagger', 'model', mp)
            self.resources = m.resources
            self.fnx = self._load_functions(m.fnx)
            self.win_fnx = self._load_functions(m.win_fnx)
            self.ft_tmpl_cols = m.cols
        else:
            raise RuntimeError(
                'Configuration initialisation failed. Please, provide either '
                'a configuration or a model.')

        # parsing feature template
        self.ft_tmpl = FeatureTemplate(fnx=self.fnx,
                                       win_fnx=self.win_fnx,
                                       cols=self.ft_tmpl_cols)
        self.ft_tmpl.parse_ftvec_templ(self.cfg_tag.get('ftvec'),
                                       self.resources)

    @property
    def cfg_tag(self):
        """Configuration parameters of this tagger. Returns the `tagger`
        section of the ConfigParser object as a dict.

        :return: tagger configuration
        :rtype: dict
        """
        return dict(self.cfg.items('tagger'))

    @property
    def cfg_crf(self):
        """Configuration parameters for CRFSuite. These are passed to the
        trainer when training is done. Note, these are not necessarily the
        same as the ones in self.tagger.params.

        :return: CRFSuite configuration
        :rtype: dict
        """
        return dict(self.cfg.items('crfsuite'))

    @property
    def cfg_res(self):
        """Resources configuration. Essentially a mapping of resource name to
        file path.

        :return: resources
        :rtype: dict
        """
        return dict(self.cfg.items('resources'))

    ############################################################################
    ### A group of properties mapped to configuration values of the tagger.  ###
    ############################################################################
    ############################################################################

    @property
    def ts(self):
        """Column separator; the escaped forms `\\t` and `\\s` are decoded."""
        tss = {'\\t': '\t', '\\s': ' '}
        return tss.get(self.cfg_tag['tab_sep'], self.cfg_tag['tab_sep'])

    @property
    def cols(self):
        """Value of the `cols` option of the tagger section."""
        return self.cfg_tag['cols']

    @property
    def form_col(self):
        """Observation (form) column name; defaults to `form`."""
        return self.cfg_tag.get('form_col', 'form')

    @property
    def lbl_col(self):
        """Label column name (`label_col` option)."""
        return self.cfg_tag['label_col']

    @property
    def ilbl_col(self):
        """Inference label column name; defaults to `guesstag`."""
        return self.cfg_tag.get('guess_label_col', 'guesstag')

    @property
    def model_path(self):
        """Model file path (`model` option)."""
        return self.cfg_tag['model']

    @property
    def eval_func(self):
        """Evaluation function named by the `eval_func` option.

        NOTE(review): `eval` is presumably a project module that shadows the
        builtin — confirm against the file's imports.
        """
        return getattr(eval, '%s' % self.cfg_tag['eval_func'])

    @property
    def info(self):
        """`info` of the loaded pycrfsuite tagger, or None if none is loaded."""
        return self.tagger.info if self.tagger else None

    ############################################################################
    ############################################################################

    def _load_resources(self):
        """Loads resources listed in the `resources` section of the
        configuration. Resources are generally needed for feature generation.
        However, note that for a resource to be loaded a `reader` method
        is needed. For example, to load a clusters resource `cls`, there needs
        to be a method called `read_cls` in `readers.py` that takes a file path
        parameter and returns a resource data structure.
        """
        self.resources = {}
        for n, p in self.cfg_res.items():
            self.resources[n] = getattr(readers, 'read_%s' % n)(p)

    def _load_data(self):
        """Loads training and testing data if provided in the initial
        configuration.
        """
        if 'train' in self.cfg_tag and self.cfg_tag['train']:
            self.train_data = parse_tsv(self.cfg_tag['train'],
                                        cols=self.cols,
                                        ts=self.ts)

        if 'test' in self.cfg_tag and self.cfg_tag['test']:
            self.test_data = parse_tsv(fp=self.cfg_tag['test'],
                                       cols=self.cols,
                                       ts=self.ts)

    def _load_function(self, name, code_string):
        """Rebuilds a function called `name` from a marshalled code object."""
        code = marshal.loads(code_string)
        return types.FunctionType(code, globals(), name)

    def _load_functions(self, fnx):
        """Rebuilds every marshalled function in `fnx`.

        `dump_model` stores functions as a name -> code mapping; an iterable
        of (name, code) pairs is accepted as well.

        :return: list of functions, or None when `fnx` is empty/None
        """
        if not fnx:
            return None
        pairs = fnx.items() if hasattr(fnx, 'items') else fnx
        return [self._load_function(n, f) for n, f in pairs]

    def _extract_features(self, doc, form_col='form'):
        """A generator method that extracts features from the data using a
        feature set template. Yields the feature vector of each sequence in the
        data. Works on a deep copy, so `doc` itself is not modified.

        :param doc: data
        :type doc: np.recarray
        :param form_col: form column name passed to the feature template
        :type form_col: str
        """
        d = copy.deepcopy(doc)

        # replace tokens with canonical forms
        # NOTE(review): the column is hard-coded to 'form' here rather than
        # `form_col` — confirm whether that is intended.
        if self.canonical:
            for t in d:
                for pattern, replacement in self.canonical.items():
                    if re.match(pattern, t['form']):
                        t['form'] = replacement

        # number of features
        nft = len(self.ft_tmpl.vec)

        # recarray data types (60 >= char string, [30 >= char string] * nft)
        dt = 'a60,{}'.format(','.join('a30' for _ in range(nft)))

        # sequence start index
        s = 0

        # extracting features sequence by sequence
        while 0 <= s < len(d):

            # index of the end of a sequence is recorded at the beginning
            e = d[s]['eos']

            # slicing a sequence
            seq = d[s:e]

            ft_seq = np.zeros(len(seq), dtype=dt)

            # extracting the features
            for i in range(len(seq)):
                ft_seq[i] = tuple(
                    self.ft_tmpl.make_fts(seq, i, form_col=form_col))

            # moving the start index
            s = e

            # yielding a feature sequence
            yield ft_seq

    def train(self,
              data=None,
              form_col=None,
              lbl_col=None,
              ilbl_col=None,
              data_cols=None,
              data_sep=None,
              dump=True):
        """Trains a model based on provided data and features. The default
        behaviour is to load training parameters from the global configuration,
        unless they are passed to this method.

        IMPORTANT: there are two ways to pass data directly through the `data`
        parameter:

        -- np.recarray  `data` needs to be a recarray with column names that
                        match what the feature extractor expects.
        -- csv str      `data` needs to contain a TSV/CSV formatted string.
                        Column names and separator should be provided in the
                        `data_cols` and `data_sep` parameters. They should still
                        match what is expected by the feature extractor.

        The observation, label, and inference column names can be set through
        the global configuration using the following parameter names:
        `form_col`, `label_col`, `guess_label_col`. The default observation
        column name is `form`, and the inference column name is `guesstag`.
        All three names can be passed to this method to override global
        configuration. Any other column names need to match their respective
        feature extractor functions, e.g. part-of-speech tags need to be placed
        in `postag` column. See `ftex.FeatureTemplate` for others.

        RECOMMENDED: use `utils.parse_tsv` to parse input data to avoid column
        configuration errors.

        NOTE: Due to the way `pycrfsuite` works, the crfsuite model needs to be
        dumped on the hard drive, however, the CRFSuiteTagger model does not
        NEED to be dumped. That process is controlled through the `dump`
        parameter.

        :param data: training data
        :type data: np.recarray or str
        :param form_col: form column name
        :type form_col: str
        :param lbl_col: label column name
        :type lbl_col: str
        :param ilbl_col: inference label column name
        :type ilbl_col: str
        :param data_cols: list of columns in the data
        :type data_cols: str
        :param data_sep: data tab separator
        :type data_sep: str
        :param dump: dumps the model at specified location if True
        :type dump: bool
        :raises ValueError: if `data` is of an unsupported type
        """

        # overriding parameters
        fc = form_col if form_col else self.form_col
        c = data_cols if data_cols else self.cols
        sep = data_sep if data_sep else self.ts
        lc = lbl_col if lbl_col else self.lbl_col
        ilc = ilbl_col if ilbl_col else self.ilbl_col

        # np.recarray is a subclass of np.ndarray, so one check covers both
        if isinstance(data, np.ndarray):
            d = data
        elif isinstance(data, str):
            d = parse_tsv(s=data, cols=c, ts=sep, inference_col=ilc)
        elif data is None:
            d = self.train_data
        else:
            raise ValueError('Invalid input type.')

        # extract features
        X = self._extract_features(d, fc)

        # extract labels
        y = gsequences(d, [lc])

        trainer = Trainer(verbose=self.verbose)

        # setting CRFSuite parameters
        trainer.set_params(self.cfg_crf)

        for x_seq, y_seq in zip(X, y):
            trainer.append(x_seq, [l[0] for l in y_seq])

        crfs_mp = '%s.crfs' % self.model_path
        try:
            makedirs(dirname(crfs_mp))
        except OSError:
            # best effort: typically the directory already exists
            pass
        trainer.train(crfs_mp)

        self.tagger = Tagger()
        self.tagger.open(crfs_mp)

        # dumps the model
        if dump:
            self.dump_model(self.model_path)
            # pickle needs a binary file; `with` closes the handle
            with open('%s.cfg.pcl' % self.model_path, 'wb') as cfg_file:
                pickle.dump(self.cfg, cfg_file)

    def tag(self,
            data,
            form_col=None,
            ilbl_col=None,
            tagger=None,
            cols=None,
            ts=None):
        """Tags TSV/CSV or np.recarray data using the loaded CRFSuite model.

        See documentation for `train` for more details on requirements for the
        data passed to this method. Predicted labels are written into the
        inference label column of the data, which is then returned.

        :param data: data
        :type data: str or recarray
        :param form_col: form column name
        :type form_col: str
        :param ilbl_col: inference label column name
        :type ilbl_col: str
        :param tagger: CRFS tagger
        :type tagger: Tagger
        :param cols: TSV column names
        :type cols: str or list of str
        :param ts: tab separator for TSV
        :type ts: str
        :return: tagged data
        :rtype: recarray
        :raises ValueError: if `data` is of an unsupported type
        """

        fc = form_col if form_col else self.form_col
        c = cols if cols else self.cols
        sep = ts if ts else self.ts
        ilc = ilbl_col if ilbl_col else self.ilbl_col

        if isinstance(data, np.ndarray):
            d = data
        elif isinstance(data, str):
            d = parse_tsv(s=data, cols=c, ts=sep)
        else:
            raise ValueError('Invalid input type.')

        # explicit tagger > tagger kept on the instance > model file on disk
        tgr = tagger
        if tgr is None:
            if self.tagger:
                tgr = self.tagger
            else:
                tgr = Tagger()
                tgr.open('%s.crfs' % self.model_path)

        # extracting features
        X = self._extract_features(d, form_col=fc)

        # tagging sentences; idx runs over the rows of the whole data set
        idx = 0
        for fts in X:
            for l in tgr.tag(fts):
                d[idx][ilc] = l
                idx += 1

        return d

    def test(self,
             data=None,
             form_col=None,
             ilbl_col=None,
             tagger=None,
             cols=None,
             ts=None,
             eval_func=None):
        """Tags TSV/CSV or np.recarray data using the loaded CRFSuite model and
        evaluates the results.

        See documentation for `train` for more details on requirements for the
        data passed to this method.

        :param data: data
        :type data: str or recarray
        :param form_col: form column name
        :type form_col: str
        :param ilbl_col: inference label column name
        :type ilbl_col: str
        :param tagger: CRFS tagger
        :type tagger: Tagger
        :param cols: TSV column names
        :type cols: str or list of str
        :param ts: tab separator for TSV
        :type ts: str
        :param eval_func: evaluation function; when omitted, the one named in
            the configuration is used
        :return: results and tagged data pair
        :rtype: AccuracyResults, np.recarray
        """

        # use provided data or testing data from config file
        d = self.test_data if data is None else data

        # setting inference label column name
        ilc = self.ilbl_col if ilbl_col is None else ilbl_col

        # tagging
        d = self.tag(d,
                     form_col=form_col,
                     ilbl_col=ilc,
                     tagger=tagger,
                     cols=cols,
                     ts=ts)

        # evaluating on the same inference column the tagger just filled in
        f = eval_func if eval_func else self.eval_func
        r = f(d, label_col=self.lbl_col, inference_col=ilc)

        # returning AccuracyResults and np.recarray tagged data
        return r, d

    def dump_model(self, fp):
        """Dumps the CRFSuiteTagger model in provided file path `fp`.

        The dumping consists of two files: <fp> and <fp>.crfs. The first
        contains the configuration and all feature extraction resources needed
        by a CRFSuiteTagger object to replicate this one. The second one is the
        pycrfsuite model that needs to be dumped separately as it is always read
        from the file system; it is copied next to <fp> when <fp> differs from
        the configured model path.

        :param fp: model file path
        :type fp: str
        """
        md = Model()
        md.cfg = clean_cfg(self.cfg)
        md.resources = self.resources
        # `__code__` is available on both Python 2.6+ and Python 3
        md.fnx = {f.__name__: marshal.dumps(f.__code__)
                  for f in self.fnx} if self.fnx else None
        md.win_fnx = {
            f.__name__: marshal.dumps(f.__code__)
            for f in self.win_fnx
        } if self.win_fnx else None
        md.cols = self.ft_tmpl_cols
        fpx = expanduser(fp)
        try:
            makedirs(dirname(fpx))
        except OSError:
            # best effort: typically the directory already exists
            pass
        with open(fpx, 'wb') as model_file:
            pickle.dump(md, model_file)
        if fpx != self.model_path:
            src = '%s.crfs' % self.model_path
            trg = '%s.crfs' % fpx
            try:
                makedirs(dirname(trg))
            except OSError:
                pass
            shutil.copy(src, trg)
Example #14
0
def test(features):
    """Tag each feature sequence with the model stored in 'crf.model'.

    :param features: iterable of feature sequences accepted by ``Tagger.tag``.
    :return: list with one predicted label sequence per input sequence.
    """
    print("Testing..")
    crf = Tagger()
    crf.open('crf.model')
    return [crf.tag(sequence) for sequence in features]