Code example #1
0
def train(train_file):
    """Train a CRF tagger on the corpus in *train_file* and print a dev F1 score.

    The fitted model is written to <this_dir>/model/model.bin as a side effect
    of CRF(filename=...) + fit().
    """
    # load data
    train_set = []
    train_set += load_dataset(train_file)
    # single informative message (a redundant bare `print(train_file)` debug
    # leftover was removed -- it duplicated this line)
    print("Load corpus from file", train_file)
    # `template` is a module-level feature template -- TODO confirm its shape
    transformer = CustomTransformer(template)
    X, y = transformer.transform(train_set)

    # train
    crf_params = {
        'c1': 1.0,  # coefficient for L1 penalty
        'c2': 1e-3,  # coefficient for L2 penalty
        'max_iterations': 1000,
        # include transitions that are possible, but not observed
        'feature.possible_transitions': True
    }
    model_path = join(dirname(__file__), "model", "model.bin")
    # hold out 1% of the data as a small dev set for a sanity-check score
    X_train, X_dev, y_train, y_dev = train_test_split(X, y, test_size=0.01)
    estimator = CRF(params=crf_params, filename=model_path)
    estimator.fit(X_train, y_train)
    y_pred = estimator.predict(X_dev)
    f1_score = metrics.flat_f1_score(y_dev, y_pred, average='weighted')
    print("Dev score: ", f1_score)
Code example #2
0
File: train.py  Project: microvnn/language_vn
    def train(train_X, train_Y):
        """Fit a CRF model on the given feature/label sequences and return it.

        NOTE(review): `model_path` is a free variable resolved outside this
        function -- confirm it is defined in the enclosing scope.
        """
        from languageflow.model.crf import CRF

        logging.info("Step 3: Training CRF model")
        started_at = time.time()
        hyperparams = {
            "c1": 0.1,  # coefficient for L1 penalty
            "c2": 1e-3,  # coefficient for L2 penalty
            "max_iterations": 5000,
            # include transitions that are possible, but not observed
            "feature.possible_transitions": True,
        }
        model = CRF(params=hyperparams, filename=model_path)
        model.fit(train_X, train_Y)
        finished_at = time.time()
        logging.info(to_time_message(started_at, finished_at))
        return model
Code example #3
0
File: train.py  Project: davidtranno1/word_tokenize
def train(train_path, model_path):
    """Train a CRF tagger on the dataset at *train_path* and save it to *model_path*."""
    # load data
    train_set = []
    train_set += load_dataset(train_path)
    print("Load data from file", train_path)
    transformer = CustomTransformer(template)
    X, y = transformer.transform(train_set)

    # train
    params = {
        'c1': 1.0,  # coefficient for L1 penalty
        'c2': 1e-3,  # coefficient for L2 penalty
        'max_iterations': 1000,
        # include transitions that are possible, but not observed
        'feature.possible_transitions': True
    }
    # best-effort creation of the output directory; the original bare
    # `except: pass` swallowed everything (even KeyboardInterrupt) -- only
    # OSError (e.g. directory already exists) is expected here
    folder = dirname(model_path)
    try:
        makedirs(folder)
    except OSError:
        pass
    estimator = CRF(params=params, filename=model_path)
    estimator.fit(X, y)
Code example #4
0
File: train.py  Project: duongkstn/word_tokenize
def train(train_path, model_path):
    """Train a CRF tagger on the dataset at *train_path* and save it to *model_path*."""
    # load data
    train_set = []
    train_set += load_dataset(train_path)
    print("Load data from file", train_path)
    transformer = CustomTransformer(template)
    X, y = transformer.transform(train_set)

    # train
    params = {
        'c1': 1.0,  # coefficient for L1 penalty
        'c2': 1e-3,  # coefficient for L2 penalty
        'max_iterations': 1000,
        # include transitions that are possible, but not observed
        'feature.possible_transitions': True
    }
    # best-effort creation of the output directory; the original bare
    # `except: pass` swallowed everything (even KeyboardInterrupt) -- only
    # OSError (e.g. directory already exists) is expected here
    folder = dirname(model_path)
    try:
        makedirs(folder)
    except OSError:
        pass
    estimator = CRF(params=params, filename=model_path)
    estimator.fit(X, y)
Code example #5
0
        # NOTE(review): tail of a feature-template list opened above this
        # chunk; entries look like T[offset][field] context features -- confirm
        # against the full template definition.
        "T[-3][3]", "T[-2][3]", "T[-1][3]",
    ]
    transformer = TaggedTransformer(template)

    # featurize train and test sentences with the same transformer
    # flow.transform(transformer)
    X_train, y_train = transformer.transform(train_sentences)
    X_test, y_test = transformer.transform(test_sentences)


    # =========================================================================#
    #                               Models
    # =========================================================================#
    parameters = {
        'c1': 1.0,  # coefficient for L1 penalty
        'c2': 1e-3,  # coefficient for L2 penalty
        'max_iterations': 1000,  #
        # include transitions that are possible, but not observed
        'feature.possible_transitions': True
    }
    # model file is written next to this script
    filename = join(dirname(__file__), 'model', 'crf.model')
    estimator = CRF(params=parameters, filename=filename)
    estimator.fit(X_train, y_train)


    # =========================================================================#
    #                              Evaluation
    # =========================================================================#
    y_pred = estimator.predict(X_test)
    # presumably a chunk-level F1 over IOB tags -- TODO confirm iob_score's contract
    f1_score = iob_score(y_test, y_pred)
    print(f1_score)
Code example #6
0
if __name__ == '__main__':
    # load data: concatenate all VLSP2016 splits into one training corpus
    train_set = []
    for f in ["train.txt", "dev.txt", "test.txt"]:
        file = join(dirname(dirname(dirname(__file__))), "data", "vlsp2016",
                    "corpus", f)
        train_set += load_dataset(file)

    # transformer
    transformer = CustomTransformer(template)
    X, y = transformer.transform(train_set)

    # train
    crf_params = {
        'c1': 1.0,  # coefficient for L1 penalty
        'c2': 1e-3,  # coefficient for L2 penalty
        'max_iterations': 1000,
        # include transitions that are possible, but not observed
        'feature.possible_transitions': True
    }
    model_path = join(dirname(__file__), "final_model", "model.bin")
    # hold out 1% of the data as a small dev set for a sanity-check score
    X_train, X_dev, y_train, y_dev = train_test_split(X, y, test_size=0.01)
    estimator = CRF(params=crf_params, filename=model_path)
    estimator.fit(X_train, y_train)
    y_pred = estimator.predict(X_dev)
    f1_score = metrics.flat_f1_score(y_dev, y_pred, average='weighted')
    print("Dev score: ", f1_score)

    # save the transformer next to the model; the original used a CWD-relative
    # path ("final_model/transformer.bin") while model_path was anchored to
    # dirname(__file__), which broke when run from another directory
    joblib.dump(transformer, join(dirname(__file__), "final_model",
                                  "transformer.bin"))
Code example #7
0
File: main.py  Project: anhlbt/ner
        # NOTE(review): tail of a feature-template list opened above this
        # chunk; entries look like T[offset][field] context features -- confirm
        # against the full template definition.
        "T[-3][3]", "T[-2][3]", "T[-1][3]",
    ]
    transformer = TaggedTransformer(template)

    # register the featurizer on the experiment flow
    flow.transform(transformer)

    # =========================================================================#
    #                               Models
    # =========================================================================#
    crf_params = {
        'c1': 1.0,  # coefficient for L1 penalty
        'c2': 1e-3,  # coefficient for L2 penalty
        'max_iterations': 1000,  #
        # include transitions that are possible, but not observed
        'feature.possible_transitions': True
    }
    flow.add_model(Model(CRF(params=crf_params), "CRF"))

    # =========================================================================#
    #                              Evaluation
    # =========================================================================#
    # chunk-level F1 and accuracy scorers -- semantics defined by the flow
    # framework, TODO confirm
    flow.add_score('f1_chunk')
    flow.add_score('accuracy_chunk')

    # evaluate on a 10% held-out split
    flow.set_validation(TrainTestSplitValidation(test_size=0.1))
    # flow.set_validation(TrainTestSplitValidation(test_size=0.3))

    flow.train()

    # flow.save_model("CRF", filename="ner_crf_20171006_template_2.model")
Code example #8
0
from load_data import load_data
from models.custom_transformer import CustomTransformer
from models.features import template

if __name__ == '__main__':
    # load the training corpus shipped with the project
    train_path = join(dirname(__file__), "data", "corpus", "train.txt")
    train_set = []
    train_set += load_data(train_path)
    print("Load data from file", train_path)
    transformer = CustomTransformer(template)
    X_train, y_train = transformer.transform(train_set)

    # train
    params = {
        'c1': 1.0,  # coefficient for L1 penalty
        'c2': 1e-3,  # coefficient for L2 penalty
        'max_iterations': 1000,
        # include transitions that are possible, but not observed
        'feature.possible_transitions': True
    }
    model_path = join(dirname(__file__), "models", "model_crf.bin")
    # best-effort creation of the output directory; the original bare
    # `except: pass` swallowed everything (even KeyboardInterrupt) -- only
    # OSError (e.g. directory already exists) is expected here
    folder = dirname(model_path)
    try:
        makedirs(folder)
    except OSError:
        pass
    estimator = CRF(params=params, filename=model_path)
    estimator.fit(X_train, y_train)
    # persist the feature transformer alongside the model for later inference
    transformer_path = join(dirname(__file__), "models", "transformer.bin")
    joblib.dump(transformer, transformer_path)
Code example #9
0
 def test_crf(self):
     """Smoke test: a CRF can be constructed and fitted on a toy pair."""
     model = CRF()
     model.fit(['x', 'y'], ['a', 'b'])