class CRFModel(object):
    def __init__(self,
                 algorithm='lbfgs',
                 c1=0.1,
                 c2=0.1,
                 max_iterations=100,
                 all_possible_transitions=False):

        self.model = CRF(algorithm=algorithm,
                         c1=c1,
                         c2=c2,
                         max_iterations=max_iterations,
                         all_possible_transitions=all_possible_transitions)

    def train(self, sentences, tag_lists, tagged=False):
        if tagged:
            features = [sent2features_tagged(s) for s in sentences]
        else:
            features = [sent2features(s) for s in sentences]
        self.model.fit(features, tag_lists)

    def test(self, sentences, tagged=False):
        if tagged:
            features = [sent2features_tagged(s) for s in sentences]
        else:
            features = [sent2features(s) for s in sentences]
        pred_tag_lists = self.model.predict(features)
        return pred_tag_lists
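A minimal usage sketch for this wrapper (the toy sentences, tags, and the predicted output are illustrative; sent2features / sent2features_tagged are assumed to be defined in the surrounding module):

# Illustrative only: toy data for the CRFModel wrapper above.
sentences = [["John", "lives", "in", "Berlin"]]
tag_lists = [["B-PER", "O", "O", "B-LOC"]]

wrapper = CRFModel(c1=0.1, c2=0.1, max_iterations=50)
wrapper.train(sentences, tag_lists)                 # fits the underlying sklearn_crfsuite.CRF
print(wrapper.test([["Mary", "visits", "Paris"]]))  # e.g. [['B-PER', 'O', 'B-LOC']]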
def crf_tag():
    brown_tagged_sents = brown.tagged_sents(categories='news')
    #print(brown_tagged_sents[0])
    train_len = int(len(brown_tagged_sents) * 0.9)
    training_sentences = brown_tagged_sents[:train_len]
    test_sentences = brown_tagged_sents[train_len:]

    X_train, y_train = transform_to_dataset(training_sentences)
    X_test, y_test = transform_to_dataset(test_sentences)

    #print(len(X_train))
    #print(len(X_test))
    print(X_train[0])
    print(y_train[0])

    model = CRF()
    model.fit(X_train, y_train)

    raw_sent = ['I', 'am', 'a', 'student']
    sent_feat = [
        feature_extract(raw_sent, index) for index in range(len(raw_sent))
    ]
    print(list(zip(raw_sent, model.predict([sent_feat])[0])))
    y_pred = model.predict(X_test)
    print(metrics.flat_accuracy_score(y_test, y_pred))
Example #3
class CRFNER(object):
    """ A class to get reviews for products on Amazon """
    def __init__(self, gazetteer, fraction=0.7):
        self.gazateer = gazetteer
        self.fraction = fraction

    def train(self, documents):
        self.data = ner_processing.NERFormatter(self.gazetteer, documents)
        d_train, d_test = ner_processing.train_test_NER(self.data)

        self.X_train, self.X_test, self.y_train, self.y_test = crf_processing.feature_extraction(
            d_train, d_test)

        self.model = CRF(algorithm='lbfgs',
                         c1=0.31,
                         c2=0.02,
                         max_iterations=100,
                         all_possible_transitions=True)

        self.model.fit(self.X_train, self.y_train)

    def predict(self, sentence):
        """Transforms a single sentence (for NER testing) into a CRF-suite format"""

        sentence_split = nltk.word_tokenize(sentence)
        n_words = [0] * len(sentence_split)

        df_pred = pd.DataFrame({
            'word': sentence_split,
            'sentence_no': n_words,
            'category': n_words,
            'POS': [x[-1] for x in nltk.pos_tag(sentence_split)],
        })

        getter = crf_processing.SentenceGetter(df_pred)
        sent = getter.get_next()
        sentences = getter.sentences

        self.X = [crf_processing.sent2features(s) for s in sentences]
        return self.model.predict(self.X)

    def report(self):
        labels = list(self.model.classes_)

        y_pred = self.model.predict(self.X_test)
        print('F1 score {}'.format(
            metrics.flat_f1_score(self.y_test,
                                  y_pred,
                                  average='weighted',
                                  labels=labels)))

        sorted_labels = sorted(labels, key=lambda name: (name[1:], name[0]))
        print(
            metrics.flat_classification_report(self.y_test,
                                               y_pred,
                                               labels=sorted_labels,
                                               digits=3))
Example #4
class CRFNerModel(object):
    def __init__(self, is_save=False):
        self.crf = CRF(algorithm='lbfgs',
                       c1=0.1,
                       c2=0.1,
                       max_iterations=100,
                       all_possible_transitions=True)
        self.is_save = is_save
        self.save_model = "crf.model"

    def fit(self, train_x, train_y):
        self.crf.fit(train_x, train_y)

        if self.is_save:
            self.dump_model()

    def predict(self, input_x):
        input_x = list(input_x)
        input_feature = [sent2features(input_x)]
        return self.crf.predict(input_feature)

    def dump_model(self):
        model_data = pickle.dumps(self.crf)
        with open(self.save_model, "wb") as f:
            f.write(model_data)

    def load_model(self):
        with open(self.save_model, "rb") as f:
            model_data = f.read()
        self.crf = pickle.loads(model_data)

    def predict_list(self, input_list):
        return self.crf.predict(input_list)

    def extract_ner(self, input_x):
        extract_ner = []
        res = self.predict(input_x)

        start = None
        label = None
        for i, x in enumerate(res[0]):
            if x == "O":
                if start is not None:
                    extract_ner.append((start, i, label, input_x[start:i]))
                    start = None
                    label = None
            else:
                xindex, xlabel = x.split("-", 1)
                if xindex == "B":
                    if start is not None:
                        extract_ner.append((start, i, label, input_x[start:i]))
                    start = i
                    label = xlabel
                elif label != xlabel:
                    # An I- tag that does not continue the current entity: drop it.
                    start = None
                    label = None
        # Flush an entity that runs to the end of the sequence.
        if start is not None:
            end = len(res[0])
            extract_ner.append((start, end, label, input_x[start:end]))
        return extract_ner
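A quick illustration of the span format extract_ner produces, assuming a model trained on character-level BIO tags (the input text, model file, and output shown are hypothetical):

# Hypothetical usage of CRFNerModel: extract_ner yields (start, end, label, surface)
# tuples over the input sequence, with the end index exclusive.
ner = CRFNerModel()
ner.load_model()                  # loads "crf.model" written earlier by dump_model()
spans = ner.extract_ner("我住在北京市")
# e.g. [(3, 6, 'LOC', '北京市')]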
def test_accuracy(training_dir_path, test_dir_path, is_Convo_label):
    global labels, type_labels
    curr_labels = {}
    if is_Convo_label:
        curr_labels = labels
    else:
        curr_labels = type_labels

    # Get the training Data
    x_train, y_train = get_conversation_data(training_dir_path, True, is_Convo_label)
    print("Loaded Training Data")

    # Get Testing Data
    x_test, y_test = get_conversation_data(test_dir_path, False, is_Convo_label)
    print("Loaded Testing Data")

    crf = CRF(algorithm='l2sgd',
              c2=0.001,
              max_iterations=100,
              all_possible_transitions=False)

    crf.fit(x_train, y_train)

    y_prediction = crf.predict(x_test)

    predictions = np.array([curr_labels[tag] for row in y_prediction for tag in row])
    truths = np.array([curr_labels[tag] for row in y_test for tag in row])

    # Print Metrics
    if is_Convo_label:
        print(classification_report(
            truths, predictions,
            target_names=['REQ', 'ANSW', 'COMPLIM', 'ANNOU', 'THK', 'RESPOS', 'APOL', 'RCPT']))

    # Get test accuracy
    test_ = str(accuracy_score(truths, predictions))
    # for w in sorted(crf.transition_features_, key=crf.transition_features_.get, reverse=True):
    #     print(str(w) + ":" + str(crf.transition_features_[w]))

    # Testing on training data without label
    x_test, y_test = get_conversation_data(training_dir_path, False, is_Convo_label)
    y_prediction = crf.predict(x_test)

    predictions = np.array([curr_labels[tag] for row in y_prediction for tag in row])
    truths = np.array([curr_labels[tag] for row in y_test for tag in row])

    sf = crf.state_features_
    print(type(sf))
    # Get train accuracy
    train_ = str(accuracy_score(truths, predictions))
    return test_, train_
Example #6
def test_crf(xseq, yseq, algorithm):
    crf = CRF(algorithm)
    crf.fit([xseq], [yseq])

    y_pred = crf.predict([xseq])
    if algorithm != "ap":  # Averaged Perceptron is regularized too much
        assert y_pred == [yseq]
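For reference, sklearn_crfsuite.CRF accepts each sequence as a list of per-token feature mappings and each label sequence as a list of strings, so a minimal fixture for this test could look like the following (feature names and values are illustrative):

# Minimal (xseq, yseq) fixture in the format sklearn_crfsuite expects:
# one dict of feature -> value per token, one label string per token.
xseq = [
    {"word.lower()": "london", "is_capitalized": True},
    {"word.lower()": "calling", "is_capitalized": False},
]
yseq = ["B-LOC", "O"]

test_crf(xseq, yseq, algorithm="lbfgs")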
Example #7
def train(file_path: str):
    """
    Training CRF model from a given ``file_path``
    """
    addresses = []
    with jsonlines.open(file_path) as reader:
        for obj in reader:
            addresses.append(obj)
    addresses_train, addresses_val = train_test_split(addresses,
                                                      test_size=0.25,
                                                      random_state=42)

    X_train, y_train = addresses_to_features(addresses_train)
    X_val, y_val = addresses_to_features(addresses_val)

    crf = CRF(c1=0.2,
              c2=0.2,
              max_iterations=100,
              all_possible_transitions=True)
    crf.fit(X_train, y_train)

    # prediction score on validation set
    y_pred = crf.predict(X_val)
    metrics.flat_f1_score(y_val,
                          y_pred,
                          average='weighted',
                          labels=[l for l in LABELS if l != 'O'])
    return crf
def parameter_tuning(args, dataset):
    c1s = experiment_util.get_param_list(args.c1)
    c2s = experiment_util.get_param_list(args.c2)
    best_valid_f1_score = -np.inf
    best_c1 = -np.inf
    best_c2 = -np.inf
    best_model = None
    for c1 in c1s:
        for c2 in c2s:
            crf = CRF(algorithm='lbfgs',
                      c1=c1,
                      c2=c2,
                      max_iterations=500,
                      all_possible_transitions=True,
                      verbose=args.debug)
            crf.fit(dataset.training.list_of_feature_dicts,
                    dataset.training.list_of_labels)
            preds = crf.predict(dataset.validation.list_of_feature_dicts)
            valid_f1_score = metrics.flat_f1_score(
                dataset.validation.list_of_labels, preds, average='micro')
            if valid_f1_score > best_valid_f1_score:
                best_valid_f1_score = valid_f1_score
                best_c1 = c1
                best_c2 = c2
                best_model = crf
    print('Best validation F1 score:', best_valid_f1_score, 'Best c1:',
          best_c1, 'Best c2:', best_c2)
    return best_model
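Because sklearn_crfsuite.CRF follows the scikit-learn estimator API, the same c1/c2 search can also be handed to RandomizedSearchCV; the following is a sketch under the assumption that the dataset's feature dicts and labels behave as ordinary lists:

import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.model_selection import RandomizedSearchCV
from sklearn_crfsuite import CRF, metrics

# Sketch: sample the regularization coefficients instead of looping over a fixed grid.
crf = CRF(algorithm='lbfgs', max_iterations=500, all_possible_transitions=True)
param_space = {'c1': scipy.stats.expon(scale=0.5), 'c2': scipy.stats.expon(scale=0.05)}
f1_scorer = make_scorer(metrics.flat_f1_score, average='micro')
search = RandomizedSearchCV(crf, param_space, cv=3, n_iter=20, scoring=f1_scorer)
# search.fit(dataset.training.list_of_feature_dicts, dataset.training.list_of_labels)
# best_model = search.best_estimator_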
Example #9
File: CRF.py Project: eshwag/NER
    def train1(self, data, y, tag):
        #tagged_data = a.fit(a.tag(),y,tag)
        # Extract features in the format the conditional random field expects
        feaobj = Features(data, self.num_features)
        x_train, y_train = feaobj.get
        print("labelled data")
        # Create the conditional random field model
        crf = CRF(algorithm='lbfgs',
                  c1=0.1,
                  c2=0.1,
                  max_iterations=100,
                  all_possible_transitions=False)
        print(crf)
        crf.fit(x_train, y_train)

        # Saving the model which is trained
        filename = 'finalized_model.sav'
        pickle.dump(crf, open(filename, 'wb'))

        # Prediction on train
        pred = crf.predict(x_train)

        # printing classification report and Accuracy
        print('\n \n Prediction On Trained Data:\n \n',
              flat_classification_report(y_train, pred))
        print('Accuracy:', flat_accuracy_score(y_train, pred))
Example #10
def train(train_file, test_file, min_freq, model_file):
    '''Train a CRF tagger on CoNLL-formatted data.'''
    # Read in initial training data
    conll_data_train = read_conll_data(train_file)
    train_sents = [[line[0] for line in doc] for doc in conll_data_train]
    train_labels = [[line[2] for line in doc] for doc in conll_data_train]

    # Featurize and create instance from list of sentences
    feat_sent_train = build_dataset(train_sents)
    print("Training on {0} inst".format(len(feat_sent_train)))

    # Train and test loop for parameter settings
    # Create and train CRF model
    # For different parameter options, see:
    # https://sklearn-crfsuite.readthedocs.io/en/latest/_modules/sklearn_crfsuite/estimator.html
    model = CRF(min_freq=min_freq)
    model.fit(feat_sent_train, train_labels)

    # Test the model on held out test set if wanted
    if args.test_file:
        conll_data_test = read_conll_data(test_file)
        test_sents = [[line[0] for line in doc] for doc in conll_data_test]
        test_labels = [[line[2] for line in doc] for doc in conll_data_test]
        feat_sent_test = build_dataset(test_sents)
        # Predicting and printing accuracy
        pred = model.predict(feat_sent_test)
        acc = metrics.flat_accuracy_score(test_labels, pred)
        print("Accuracy: {0}%".format(float(round(acc, 3)) * 100))
    # Save model to disk if wanted
    if args.model:
        print("Saving model to {0}".format(model_file))
        joblib.dump(model, model_file)
Example #11
def main(path_train, path_test, path_pred, path_crf, take_first, dev_size):
    print("loading train corpus..")
    _, X_raw, y = load_corpus(path_train, take_first=take_first)
    print("extracting features from train corpus..")
    fe = TaggerFeatureExtractor()
    X = fe.fit_transform(tqdm(X_raw))
    print("training..")
    crf = CRF(algorithm='ap', verbose=True, max_iterations=10)
    if dev_size:
        X, X_dev, y, y_dev = train_test_split(X, y, test_size=dev_size)
    else:
        X_dev, y_dev = None, None
    crf.fit(X, y, X_dev, y_dev)

    print("saving..")
    joblib.dump({'fe': fe, 'crf': crf}, path_crf, compress=2)

    print("loading test corpus..")
    corpus, X_test_raw, y_test = load_corpus(path_test)
    print("extracting features from test corpus..")
    X_test = fe.transform(X_test_raw)
    print("predicting..")
    y_pred = crf.predict(tqdm(X_test))

    print("saving results..")
    sents_pred = y_pred_to_sents_pred(corpus, y_pred)
    conll.write_sents(sents_pred, path_pred)
def train_crf(labelled_files, save=True, eval=True):
    x, y, _ = format_labelled_data(labelled_files)

    crf = CRF(algorithm='lbfgs',
              c1=0.1,
              c2=0.1,
              max_iterations=100,
              all_possible_transitions=False)

    if eval:
        x_train, x_test, y_train, y_test = train_test_split(x,
                                                            y,
                                                            test_size=0.1,
                                                            random_state=42)
        crf.fit(x_train, y_train)
        pred = crf.predict(x_test)
        report = classification_report(y_test, pred)
        print("Test Results:\n")
        line(60)
        print(report)
        line(60)
        log_results(y_test, pred)
        line(60)

    else:
        crf.fit(x, y)

    if save:
        save_crf(crf)

    return crf
Example #13
class CRFBased:
    '''CRF based information retrieval.
    The model is similar to the Default model
    used in homework 2 and 3'''
    def __init__(self, load, n_train, n_test):
        self.load = load
        self.crf = CRF(algorithm='lbfgs',
                       c1=0.1,
                       c2=0.1,
                       max_iterations=100,
                       all_possible_transitions=False)
        self.n_train = int(n_train)
        self.n_test = int(n_test)

    def load_data(self):
        X_train, y_train, X_test, y_test = prepare_crf_dataset(
            self.load, self.n_train, self.n_test)
        return X_train, y_train, X_test, y_test

    def fit(self, X, y):
        self.crf.fit(X, y)
        self.labels = list(self.crf.classes_)
        self.labels.remove('O')

    def predict(self, X):  #dataset = Train or Test
        pred = self.crf.predict(X)
        return pred

    def evaluate(self, y_true, y_pred):
        print("Final Scores for CRF Based Modes:")
        print(
            flat_classification_report(y_pred=y_pred,
                                       y_true=y_true,
                                       labels=self.labels))
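An illustrative end-to-end run of the class above (the constructor arguments are placeholders):

# Sketch: load the prepared splits, fit, and report on the held-out set.
ir = CRFBased(load=True, n_train=8000, n_test=2000)
X_train, y_train, X_test, y_test = ir.load_data()
ir.fit(X_train, y_train)
ir.evaluate(y_test, ir.predict(X_test))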
Example #14
    def train(self, model_name, tagged_sentences):
        # Split the dataset for training and testing
        cutoff = int(.75 * len(tagged_sentences))
        training_sentences = tagged_sentences[:cutoff]
        test_sentences = tagged_sentences[cutoff:]

        X_train, y_train = transform_to_dataset(training_sentences)
        X_test, y_test = transform_to_dataset(test_sentences)
        print(len(X_train))
        print(len(X_test))


        print("Training Started........")
        print("it will take time according to your dataset size..")
        model = CRF()
        model.fit(X_train, y_train)
        print("Training Finished!")
        
        print("Evaluating with Test Data...")
        y_pred = model.predict(X_test)
        print("Accuracy is: ")
        print(metrics.flat_accuracy_score(y_test, y_pred))
        
        pickle.dump(model, open(model_name, 'wb'))
        print("Model Saved!")
Example #15
def test_crf(train_file, test_file, model_name=""):
    # Load and featurize the training data.
    data = pandas.read_csv(train_file, sep="\t", header=None)
    X_dataset = fromListToTuple(data.iloc[:, [0, 1, 2, 3]].values)
    useful_features = [True, True]
    X_train, y_train = prepareData([X_dataset], 'train', useful_features)

    # Load and featurize the test data.
    data2 = pandas.read_csv(test_file, sep="\t", header=None)
    X_test = fromListToTuple(data2.iloc[:, [0, 1, 2]].values)
    X_teste, y_teste = prepareData([X_test], 'predict', useful_features)

    crf = CRF(
            algorithm='lbfgs',
            c1=0.0625,
            c2=0.5,
            max_iterations=100,
            all_possible_transitions=False,
            all_possible_states=True,
            verbose=True
        )
    crf.fit(X_train, y_train)
    if model_name != "":
        save_model(model_name + ".pickle", crf)

    y_pred = crf.predict(X_teste)
    resultados = []
    for index, elem in enumerate(y_pred[0]):
        resultados.append(str(y_pred[0][index]))
    return resultados
Example #16
def write_to_CoNLL(mdl_file_name, sentence2features, test_sentences, write_path):
    X_test_local = []
    cond_rand_mdl = CRF(algorithm='lbfgs',
                        c1=0.0001,
                        c2=0.0001,
                        max_iterations=100,
                        all_possible_transitions=False,
                        model_filename=mdl_file_name)
    if mdl_file_name[(len(mdl_file_name) - 1)] == '2':
        old_crf = CRF(algorithm='lbfgs',
                      c1=0.0001,
                      c2=0.0001,
                      max_iterations=100,
                      all_possible_transitions=False,
                      model_filename=(mdl_file_name[:(len(mdl_file_name) - 1)]) + '1')
        X_test_local = [sent2features_second_guess(s, sentence2features, old_crf) for s in test_sentences]
    else:
        X_test_local = [sentence2features(s) for s in test_sentences]
    predictions = cond_rand_mdl.predict(X_test_local)
    with open(write_path, 'a') as f:
        for i in range(0, len(predictions)):
            sent = test_sentences[i]
            preds = predictions[i]
            for j in range(0, len(sent)):
                str_to_write = '{}\t{}\n'.format(sent[j][0], preds[j])
                f.write(str_to_write)
            f.write('\n')
Example #17
def train(file_path: str, model_path: str = None):
    """
    Training CRF model from a given ``file_path``
    """
    addresses = read_file(file_path)
    addresses_train, addresses_val = train_test_split(addresses,
                                                      test_size=0.25,
                                                      random_state=42)

    X_train, y_train = addresses_to_features(addresses_train)
    X_val, y_val = addresses_to_features(addresses_val)

    crf = CRF(c1=0.2,
              c2=0.2,
              max_iterations=100,
              all_possible_transitions=True)
    crf.fit(X_train, y_train)

    # prediction score on validation set
    y_pred = crf.predict(X_val)
    f1_score = metrics.flat_f1_score(y_val,
                                     y_pred,
                                     average="weighted",
                                     labels=[l for l in LABELS if l != "O"])
    print("Flat F1-Score on validation set = {}".format(f1_score))

    if model_path:
        joblib.dump(crf, model_path)
        print("Save model to {}".format(model_path))

    return crf
Example #18
def test_crf(xseq, yseq, algorithm):
    crf = CRF(algorithm=algorithm)
    crf.fit([xseq], [yseq])

    y_pred = crf.predict([xseq])
    if algorithm != 'ap':  # Averaged Perceptron is regularized too much
        assert y_pred == [yseq]
Example #19
def test_crf_verbose(xseq, yseq, algorithm, use_dev):
    crf = CRF(algorithm=algorithm, verbose=True)

    if use_dev:
        X_dev, y_dev = [xseq], [yseq]
    else:
        X_dev, y_dev = None, None

    crf.fit(X=[xseq, xseq], y=[yseq, yseq], X_dev=X_dev, y_dev=y_dev)
    y_pred = crf.predict([xseq])
    if algorithm != 'ap':  # Averaged Perceptron is regularized too much
        assert y_pred == [yseq]
Example #20
def test_crf_verbose(xseq, yseq, algorithm, use_dev):
    crf = CRF(algorithm, verbose=True)

    if use_dev:
        X_dev, y_dev = [xseq], [yseq]
    else:
        X_dev, y_dev = None, None

    crf.fit(X=[xseq, xseq], y=[yseq, yseq], X_dev=X_dev, y_dev=y_dev)
    y_pred = crf.predict([xseq])
    if algorithm != "ap":  # Averaged Perceptron is regularized too much
        assert y_pred == [yseq]
Example #21
class CRFmodel(ModelBase):
    def __init__(self):
        self.model = CRF(algorithm='lbfgs',
                         c1=0.1,
                         c2=0.1,
                         max_iterations=500,
                         all_possible_transitions=True)

    def fit(self, X, Y):
        if self.debug:
            print("training CRF...")
        self.model.fit(X, Y)

    def predict(self, X):
        return self.model.predict(X)
Example #22
class CRFEvaluateStep(Step):
    """
    Step to evaluate testing data against a CRF model,
    stored on file
    """
    def __init__(self, model_file_path):
        self.model_file_path = path.abspath(path.expanduser(model_file_path))
        self.model = CRF(algorithm='l2sgd',
                         c2=0.1,
                         max_iterations=1000,
                         all_possible_transitions=True,
                         model_filename=self.model_file_path)

    def run(self, batches: Generator) -> None:
        """
        Runs the CRF model, storing to pickle in the end
        """
        st = time.time()

        x = []
        y = []

        # For prediction, CRF does not implement batching, so we pass a list
        for batch in batches:
            b = list(batch)
            x.extend(b[0])
            y.extend(b[1])

        accuracy = self.model.score(x, y)
        y_pred = self.model.predict(x)
        f1_score = metrics.flat_f1_score(y, y_pred, average='weighted')
        accuracy_sentence = metrics.sequence_accuracy_score(y, y_pred)
        classification_report = metrics.flat_classification_report(
            y, y_pred, labels=self.model.classes_)
        print("*" * 80)
        print("MODEL EVALUATION")
        print("*" * 80)
        print("Token-wise accuracy score on Test Data:")
        print(round(accuracy, 3))
        print("F1 score on Test Data:")
        print(round(f1_score, 3))
        print(
            "Sequence accurancy score (% of sentences scored 100% correctly):")
        print(round(accuracy_sentence, 3))
        print("Class-wise classification report:")
        print(classification_report)
        et = time.time()
        print(f"Evaluation finished in {round(et-st, 2)} seconds.")
Example #23
def main():
    X, y = load_dataset(DATA_PATH)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    crf = CRF(algorithm='lbfgs',
              c1=0.1,
              c2=0.1,
              max_iterations=100,
              all_possible_transitions=False)
    crf.fit(X_train, y_train)
    y_pred = crf.predict(X_test)
    f1_score = flat_f1_score(y_test, y_pred, average='weighted')
    print(f1_score)

    report = flat_classification_report(y_test, y_pred)
    print(report)
Example #24
def test_model(
    model: sklearn_crfsuite.CRF,
    test_path: typing.Union[str, Path],
    out_file: typing.Optional[typing.TextIO] = None,
):
    """Print an accuracy report for a model to a file"""
    try:
        import conllu
    except ImportError as e:
        _LOGGER.fatal("conllu package is required for testing")
        _LOGGER.fatal("pip install 'conllu>=4.4'")
        raise e

    _LOGGER.debug("Loading test file (%s)", test_path)
    with open(test_path, "r") as test_file:
        test_sents = conllu.parse(test_file.read())

    _LOGGER.debug("Getting features for %s test sentence(s)", len(test_sents))
    x_test = [sent2features(s) for s in test_sents]
    y_test = [sent2labels(s) for s in test_sents]

    labels = list(model.classes_)

    y_pred = model.predict(x_test)
    print(
        "F1 score on the test set = {}".format(
            metrics.flat_f1_score(y_test,
                                  y_pred,
                                  average="weighted",
                                  labels=labels)),
        file=out_file,
    )
    print(
        "Accuracy on the test set = {}".format(
            metrics.flat_accuracy_score(y_test, y_pred)),
        file=out_file,
    )

    sorted_labels = sorted(labels, key=lambda name: (name[1:], name[0]))
    print(
        "Test set classification report: {}".format(
            metrics.flat_classification_report(y_test,
                                               y_pred,
                                               labels=sorted_labels,
                                               digits=3)),
        file=out_file,
    )
class CRFModel(object):
    def __init__(self, algorithm='lbfgs',
                 c1=0.1,
                 c2=0.1,
                 max_iterations=100,
                 all_possible_transitions=False):
        # Pass the constructor arguments through instead of hard-coding them.
        self.model = CRF(algorithm=algorithm,
                         c1=c1,
                         c2=c2,
                         max_iterations=max_iterations,
                         all_possible_transitions=all_possible_transitions)

    def train(self, sentences, tagLists):
        features = [utils.sent2feature(sent) for sent in sentences]
        self.model.fit(features, tagLists)

    def test(self, sentences):
        features = [utils.sent2feature(sent) for sent in sentences]
        predictLists = self.model.predict(features)
        return predictLists
Example #26
def train_crf_pos(corpus, corpus_name):

    # Required corpus structure:
    # [[(w1,t1), (w2,t2),...(wn,tn)], [(w1,t1)(w2,t2),...(wm,tm)],...]

    #feat_all = {} # common features (baseline set)
    #feat_en = {} # extra features for English
    #features = {**feat_all, **feat_en}
    train_frac = 0.9  # fraction of data for the training set
    split_idx = int(train_frac * len(corpus))

    # Extract the features and separate the labels from the features
    X = [get_crf_features([pair[0] for pair in sent]) for sent in corpus]
    y = [[pair[1] for pair in sent] for sent in corpus]

    # Create the training and the test sets
    X_train = X[:split_idx]
    y_train = y[:split_idx]
    X_test = X[split_idx:]
    y_test = y[split_idx:]

    # Create the CRF model
    model = CRF(
        algorithm='lbfgs',  # gradient descent using the L-BFGS method
        c1=0.1,  # coeff. for L1 regularization
        c2=0.1,  # coeff. for L2 regularization
        max_iterations=100,
    )

    # Train the model
    model.fit(X_train, y_train)

    # Save the model
    with open(os.path.join('data', 'models', corpus_name + '_crf.pkl'),
              'wb') as f:
        pickle.dump(model, f, 4)

    # Evaluate the model
    y_pred = model.predict(X_test)
    print("Test accuracy: %.4f" % metrics.flat_accuracy_score(y_test, y_pred))

    return model
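To use the pickled tagger later, it can be loaded back and applied to a pre-featurized sentence; a small sketch, with the corpus name and the example tokens chosen purely for illustration:

# Sketch: reload the saved model and tag one sentence.
with open(os.path.join('data', 'models', 'brown_crf.pkl'), 'rb') as f:
    tagger = pickle.load(f)
tokens = ['The', 'cat', 'sat']
tags = tagger.predict([get_crf_features(tokens)])[0]
print(list(zip(tokens, tags)))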
Example #27
def test_crf(train_file, test_file, model_name=""):
    l1 = [0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1]
    l2 = [0.03125, 0.0625, 0.125, 0.25, 0.5, 1, 2, 4, 8, 16]
    valores = []
    useful_features = [True, True]
    data = pandas.read_csv(train_file, sep="\t", header=None)
    X_dataset = fromListToTuple(data.iloc[:, [0, 1, 2, 3]].values)
    X_teste, y_teste = prepareData([X_dataset], 'test', useful_features)
    #X_teste, test = train_test_split(X_dataset, test_size=0.1)
    X_teste = pd.DataFrame(X_teste).transpose()
    y_teste = pd.DataFrame(y_teste).transpose()

    print(X_teste.shape)
    print(y_teste.shape)
    X_teste_2 = X_teste
    y_teste_2 = y_teste
    crf = CRF(
        algorithm='lbfgs',
        #c1=0.0625,
        c1=1.0,
        #c2=0.5,
        c2=1.0,
        max_iterations=100,
        all_possible_transitions=False,
        all_possible_states=True,
        verbose=True)
    crf.fit(X_teste.values.tolist(), y_teste.values.tolist())
    y_pred = crf.predict(X_teste.values.tolist())
    labels = list(crf.classes_)
    save_model("NP_Final_Macro.pickle", crf)
    string = " "
    string += str(
        metrics.flat_classification_report(y_teste.values.tolist(),
                                           y_pred,
                                           labels=labels,
                                           digits=3))
    filename = "Results _.txt"
    print("$$$$$$$$$$$")
    print(filename)
    print("$$$$$$$$$$$")
    with open(filename, 'a') as f:
        f.write(string)
Example #28
def train_seq(X_train, Y_train, X_dev, Y_dev):
    # crf = CRF(algorithm='lbfgs', c1=0.1, c2=0.1, max_iterations=50, all_possible_states=True)
    crf = CRF(algorithm='lbfgs', c1=0.1, c2=10,
              max_iterations=50)  #, all_possible_states=True)
    #Just to fit on training data
    crf.fit(X_train, Y_train)
    labels = list(crf.classes_)
    #testing:
    y_pred = crf.predict(X_dev)
    sorted_labels = sorted(labels, key=lambda name: (name[1:], name[0]))
    print(
        metrics.flat_f1_score(Y_dev, y_pred, average='weighted',
                              labels=labels))
    print(
        metrics.flat_classification_report(Y_dev,
                                           y_pred,
                                           labels=sorted_labels,
                                           digits=3))
    print(metrics.sequence_accuracy_score(Y_dev, y_pred))
    get_confusion_matrix(Y_dev, y_pred, labels=sorted_labels)
Example #29
class CRFModel(object):
    def __init__(self,
                 solver="lbfgs",
                 c1=0.1,
                 c2=0.1,
                 max_iter=100,
                 all_possible_transitions=False):
        self.model = CRF(algorithm=solver,
                         c1=c1,
                         c2=c2,
                         max_iterations=max_iter,
                         all_possible_transitions=all_possible_transitions)

    def train(self, sentences, tag_lists):
        features = [sent2features(s) for s in sentences]
        self.model.fit(features, tag_lists)

    def test(self, sentences):
        features = [sent2features(s) for s in sentences]
        pred_tag_lists = self.model.predict(features)
        return pred_tag_lists
Example #30
class TrainCRF():
    def __init__(self,
                 char2idx_path,
                 tag2idx_path,
                 algorithm='lbfgs',
                 c1=0.1,
                 c2=0.1,
                 max_iterations=100,
                 all_possible_transitions=False):

        # Load the lookup dictionaries
        # char2idx: maps characters to token ids
        self.char2idx = load_dict(char2idx_path)
        # tag2idx: maps tags to token ids
        self.tag2idx = load_dict(tag2idx_path)
        # idx2tag: maps token ids back to tags
        self.idx2tag = {v: k for k, v in self.tag2idx.items()}
        # Number of hidden states (entity tags) and of observations (characters)
        self.tag_size = len(self.tag2idx)
        self.vocab_size = max([v for _, v in self.char2idx.items()]) + 1
        self.model = CRF(algorithm=algorithm,
                         c1=c1,
                         c2=c2,
                         max_iterations=max_iterations,
                         all_possible_transitions=all_possible_transitions)

    def train_crf(self, train_dic_path):
        train_dic = load_data(train_dic_path)
        features = []
        labels = []
        for dic in tqdm(train_dic):
            features.append(sent2features(dic["text"]))
            labels.append(dic["label"])

        self.model.fit(features, labels)

    def predict(self, sentence):
        features = [sent2features(s) for s in sentence]
        pred_tag_lists = self.model.predict(features)
        print(pred_tag_lists)
Example #31
class GenericRetriever(Retriever):
    def learn(self, config):
        texts = config.getTexts()

        sentences = []
        for text in texts:
            for sent in ET.fromstring(text).findall('sentence'):
                stemp = []
                for wrd in sent.findall('word'):
                    stemp.append(
                        [wrd.text, wrd.attrib['pos'], wrd.attrib['tag']])
                sentences.append(stemp)

        X = [sent2features(s) for s in sentences]
        y = [sent2labels(s) for s in sentences]

        self.clf = CRF(algorithm='lbfgs',
                       c1=10,
                       c2=0.1,
                       max_iterations=100,
                       all_possible_transitions=False)
        self.clf.fit(X, y)

    def retrieve(self, text):
        text = nltk.pos_tag(nltk.word_tokenize(text.lower()))
        X = sent2features(text)
        resp = []
        pred = self.clf.predict([X])[0]
        acum = None
        for i in range(len(pred)):
            if pred[i][0] == 'B':
                # A new entity starts; flush any entity already in progress.
                if acum is not None:
                    resp.append(acum)
                acum = text[i][0]
            elif pred[i][0] == 'I' and acum is not None:
                acum = acum + " " + text[i][0]
            else:
                if acum is not None:
                    resp.append(acum)
                acum = None
        # Flush an entity that runs to the end of the sentence.
        if acum is not None:
            resp.append(acum)

        return resp
Example #32
class CRFPredictStep(Step):
    """
    Step to get predictions from features using a CRF model, for specific sentences.
    """
    def __init__(self, model_file_path):
        self.model_file_path = path.abspath(path.expanduser(model_file_path))
        self.model = CRF(algorithm='l2sgd',
                         c2=0.1,
                         max_iterations=1000,
                         all_possible_transitions=True,
                         model_filename=self.model_file_path)

    def run(self, batches: Generator) -> None:
        """
        Runs the step
        """
        features = list(batches)
        pred = self.model.predict(features)
        for index, feature in enumerate(features):
            print(' '.join(map(lambda x: x['word'], feature)), end='')
            print(' => ', end='')
            print(pred[index])
Example #33
def test_predict_without_fit(xseq, algorithm):
    crf = CRF(algorithm)
    with pytest.raises(Exception):
        crf.predict([xseq])