Esempio n. 1
0
    def __init__(self, data):
        """Set up the CRF tagger, the reformulator and the empty training buffers.

        :param data: project data object; ``label_alphabet`` and
            ``word_alphabet`` are read from it here, and the object itself is
            handed to ``Reformulator``.
        """
        # Call form of print: the bare ``print "..."`` statement is a
        # SyntaxError on Python 3, while this form prints the same text on
        # both Python 2 and Python 3.
        print("build batched lstmcrf...")

        self.label_alphabet = data.label_alphabet
        self.word_alphabet = data.word_alphabet

        self.crf = CRF(
            algorithm='lbfgs',
            c1=0.1,
            c2=0.1,
            max_iterations=50,
            all_possible_transitions=True
        )
        self.reformulator = Reformulator(data)
        self.useReformulator = False
        self.loss_function = nn.NLLLoss()
        self.topk = 50

        # Accumulators; all of them start out empty.
        self.X_train = []
        self.Y_train = []
        self.tag_mask_list = []
        self.instances = []
        self.scores_refs = []
        self.tag_mask = None
Esempio n. 2
0
def train(file_path: str):
    """
    Train a CRF model from the JSON-lines file at ``file_path``.

    The records are split 75/25 into a training and a validation part with a
    fixed random seed. The model is fitted on the training part and the
    weighted flat F1 score over every label except ``'O'`` is printed for the
    validation part.

    :param file_path: path of the JSON-lines file holding the records
    :return: the fitted ``CRF`` model
    """
    with jsonlines.open(file_path) as reader:
        addresses = list(reader)

    addresses_train, addresses_val = train_test_split(addresses,
                                                      test_size=0.25,
                                                      random_state=42)

    X_train, y_train = addresses_to_features(addresses_train)
    X_val, y_val = addresses_to_features(addresses_val)

    crf = CRF(c1=0.2,
              c2=0.2,
              max_iterations=100,
              all_possible_transitions=True)
    crf.fit(X_train, y_train)

    # Prediction score on the validation set. The score used to be computed
    # and then thrown away; report it so the evaluation is not wasted work.
    y_pred = crf.predict(X_val)
    f1_score = metrics.flat_f1_score(y_val,
                                     y_pred,
                                     average='weighted',
                                     labels=[l for l in LABELS if l != 'O'])
    print("Flat F1-Score on validation set = {}".format(f1_score))
    return crf
Esempio n. 3
0
class CRFModel(object):
    """Wrapper that featurizes sentences and delegates to a ``CRF`` estimator."""

    def __init__(self,
                 algorithm='lbfgs',
                 c1=0.1,
                 c2=0.1,
                 max_iterations=100,
                 all_possible_transitions=False):
        """Create the wrapped ``CRF`` with the given hyper-parameters."""
        crf_options = dict(
            algorithm=algorithm,
            c1=c1,
            c2=c2,
            max_iterations=max_iterations,
            all_possible_transitions=all_possible_transitions,
        )
        self.model = CRF(**crf_options)

    def train(self, sentences, tag_lists, tagged=False):
        """Featurize ``sentences`` and fit the model against ``tag_lists``.

        ``tagged`` selects ``sent2features_tagged`` instead of
        ``sent2features`` as the feature extractor.
        """
        extractor = sent2features_tagged if tagged else sent2features
        feature_seqs = [extractor(sentence) for sentence in sentences]
        self.model.fit(feature_seqs, tag_lists)

    def test(self, sentences, tagged=False):
        """Featurize ``sentences`` and return the model's predicted tag lists.

        ``tagged`` selects the feature extractor exactly as in ``train``.
        """
        extractor = sent2features_tagged if tagged else sent2features
        feature_seqs = [extractor(sentence) for sentence in sentences]
        return self.model.predict(feature_seqs)
def parameter_tuning(args, dataset):
    """Try every (c1, c2) pair and return the CRF with the best validation F1.

    Candidate values come from ``experiment_util.get_param_list`` applied to
    ``args.c1`` and ``args.c2``. Each candidate is fitted on
    ``dataset.training`` and scored with the micro-averaged flat F1 on
    ``dataset.validation``. Only a strictly better score replaces the current
    best, so the first of several equally good pairs wins.

    :return: the best fitted model, or ``None`` if no candidate improved on
        the initial ``-inf`` score
    """
    c1_candidates = experiment_util.get_param_list(args.c1)
    c2_candidates = experiment_util.get_param_list(args.c2)

    # Running best as (score, c1, c2, model).
    best = (-np.inf, -np.inf, -np.inf, None)

    for c1 in c1_candidates:
        for c2 in c2_candidates:
            candidate = CRF(algorithm='lbfgs',
                            c1=c1,
                            c2=c2,
                            max_iterations=500,
                            all_possible_transitions=True,
                            verbose=args.debug)
            candidate.fit(dataset.training.list_of_feature_dicts,
                          dataset.training.list_of_labels)
            predictions = candidate.predict(
                dataset.validation.list_of_feature_dicts)
            score = metrics.flat_f1_score(dataset.validation.list_of_labels,
                                          predictions,
                                          average='micro')
            if score > best[0]:
                best = (score, c1, c2, candidate)

    best_score, best_c1, best_c2, best_model = best
    print('Best validation F1 score:', best_score, 'Best c1:',
          best_c1, 'Best c2:', best_c2)
    return best_model
Esempio n. 5
0
    def fit(self, train_data: Iterable[str], labels: Iterable[Iterable[str]]):
        """
        Build the CRF dataset from the inputs and fit a fresh CRF model on it.

        :param train_data: training inputs, handed to the dataset builder
        :param labels: labels in BIO or BILOU notation
        :return: this instance
        """
        dataset = self.__create_dataset(train_data, labels)

        # Two separate passes over the dataset: features first, then labels.
        feature_seqs = list(map(self.__convert_idata_to_features, dataset))
        label_seqs = list(map(self.__extract_labels_from_data, dataset))

        config = self.__CONFIG
        model = CRF(
            algorithm='lbfgs',
            c1=config['L1_c'],
            c2=config['L2_c'],
            max_iterations=config['max_iterations'],
            all_possible_transitions=True,
        )
        self.__crf_model = model
        self.__crf_model.fit(feature_seqs, label_seqs)

        return self
Esempio n. 6
0
def test_crf(train_file,test_file,model_name=""):
    """Train a CRF on ``train_file`` and return its predictions for ``test_file``.

    Both files are read as header-less, tab-separated tables; columns 0-3 of
    the training table and columns 0-2 of the test table are used.

    :param train_file: path of the training table
    :param test_file: path of the table to predict on
    :param model_name: when non-empty, the fitted model is stored through
        ``save_model`` under ``model_name + ".pickle"``
    :return: list with the predicted labels, as strings, of the first
        prepared sequence
    """
    train_table = pandas.read_csv(train_file, sep="\t", header=None)
    X_dataset = fromListToTuple(train_table.iloc[:, [0, 1, 2, 3]].values)
    X_train, y_train = prepareData([X_dataset], 'train', [True, True])

    # The test table is read and prepared exactly once, before the (slow)
    # training step so a bad test path fails early. Each prepareData call
    # gets its own feature-flag list, as the final call in the previous
    # version did.
    test_table = pandas.read_csv(test_file, sep="\t", header=None)
    X_test = fromListToTuple(test_table.iloc[:, [0, 1, 2]].values)
    X_teste, _ = prepareData([X_test], 'predict', [True, True])

    crf = CRF(
            algorithm='lbfgs',
            c1=0.0625,
            c2=0.5,
            max_iterations=100,
            all_possible_transitions=False,
            all_possible_states=True,
            verbose=True
        )
    crf.fit(X_train, y_train)
    if model_name != "":
        save_model(model_name + ".pickle", crf)

    y_pred = crf.predict(X_teste)
    return [str(label) for label in y_pred[0]]
Esempio n. 7
0
def train(train_file, test_file, min_freq, model_file):
    '''Train a CRF tagger and optionally evaluate and save it.

    :param train_file: CoNLL-style training file; column 0 of every line is
        used as the token and column 2 as the label
    :param test_file: held-out file in the same layout; evaluation is skipped
        when this is empty or None
    :param min_freq: forwarded to ``CRF(min_freq=...)``
    :param model_file: destination for ``joblib.dump``; saving is skipped
        when this is empty or None
    '''
    # Read in initial training data
    conll_data_train = read_conll_data(train_file)
    train_sents = [[line[0] for line in doc] for doc in conll_data_train]
    train_labels = [[line[2] for line in doc] for doc in conll_data_train]

    # Featurize and create instance from list of sentences
    feat_sent_train = build_dataset(train_sents)
    print("Training on {0} inst".format(len(feat_sent_train)))

    # Create and train CRF model
    # For different parameter options, see:
    # https://sklearn-crfsuite.readthedocs.io/en/latest/_modules/sklearn_crfsuite/estimator.html
    model = CRF(min_freq=min_freq)
    model.fit(feat_sent_train, train_labels)

    # Test the model on held out test set if wanted. The decision is taken
    # from this function's own parameter rather than from a global ``args``
    # object, so the function also works when called from other code.
    if test_file:
        conll_data_test = read_conll_data(test_file)
        test_sents = [[line[0] for line in doc] for doc in conll_data_test]
        test_labels = [[line[2] for line in doc] for doc in conll_data_test]
        feat_sent_test = build_dataset(test_sents)
        # Predicting and printing accuracy
        pred = model.predict(feat_sent_test)
        acc = metrics.flat_accuracy_score(test_labels, pred)
        print("Accuracy: {0}%".format(float(round(acc, 3)) * 100))
    # Save model to disk if wanted (same reasoning: use the parameter).
    if model_file:
        print("Saving model to {0}".format(model_file))
        joblib.dump(model, model_file)
Esempio n. 8
0
def test_crf(xseq, yseq, algorithm):
    """Fit a CRF on one sequence and check it reproduces that sequence's labels."""
    model = CRF(algorithm)
    model.fit([xseq], [yseq])

    predicted = model.predict([xseq])
    if algorithm == "ap":
        # Averaged Perceptron is regularized too much, so no exact match
        # is required for it.
        return
    assert predicted == [yseq]
Esempio n. 9
0
    def __init__(self, data):
        """Create the CRF tagger, the examiner and the empty training buffers.

        :param data: project data object; ``label_alphabet``,
            ``word_alphabet`` and ``label_alphabet_size`` are read from it,
            and the object itself is handed to ``Examiner``.
        """
        print("build batched lstmcrf...")

        self.label_alphabet = data.label_alphabet
        self.word_alphabet = data.word_alphabet

        crf_options = {
            'algorithm': 'lbfgs',
            'c1': 0.1,
            'c2': 0.1,
            'max_iterations': 100,
            'all_possible_states': False,
            'all_possible_transitions': True,
        }
        self.crf = CRF(**crf_options)

        self.examiner = Examiner(data)
        self.useExaminer = False
        self.loss_function = nn.NLLLoss()
        self.topk = 5

        # Accumulators; every one of them starts out empty.
        self.X_train, self.Y_train = [], []
        self.pos_mask_list = []
        self.instances = []
        self.scores_refs = []
        self.pos_mask = None

        self.tag_size = data.label_alphabet_size
Esempio n. 10
0
class CRFNER(object):
    """CRF model for named-entity tagging, trained on gazetteer-formatted documents."""

    def __init__(self, gazetteer, fraction=0.7):
        """Store the gazetteer and ``fraction`` (the latter is not read in this class)."""
        self.gazateer = gazetteer
        self.fraction = fraction

    def train(self, documents):
        """Format ``documents``, split them into train/test parts and fit the CRF."""
        self.data = ner_processing.NERFormatter(self.gazateer, documents)
        d_train, d_test = ner_processing.train_test_NER(self.data)

        splits = crf_processing.feature_extraction(d_train, d_test)
        self.X_train, self.X_test, self.y_train, self.y_test = splits

        crf_options = dict(algorithm='lbfgs',
                           c1=0.31,
                           c2=0.02,
                           max_iterations=100,
                           all_possible_transitions=True)
        self.model = CRF(**crf_options)
        self.model.fit(self.X_train, self.y_train)

    def predict(self, sentence):
        """Tokenize a single sentence, convert it to CRF features and tag it."""
        tokens = nltk.word_tokenize(sentence)
        zeros = [0] * len(tokens)
        pos_tags = [tagged[-1] for tagged in nltk.pos_tag(tokens)]

        # 'sentence_no' and 'category' are filled with zeros for every token.
        frame = pd.DataFrame({
            'word': tokens,
            'sentence_no': zeros,
            'category': zeros,
            'POS': pos_tags,
        })

        getter = crf_processing.SentenceGetter(frame)
        # The returned sentence is not used; the call itself is kept.
        getter.get_next()

        self.X = [crf_processing.sent2features(s) for s in getter.sentences]
        return self.model.predict(self.X)

    def report(self):
        """Print the weighted F1 score and a per-label report for the test split."""
        labels = list(self.model.classes_)
        y_pred = self.model.predict(self.X_test)

        weighted_f1 = metrics.flat_f1_score(self.y_test,
                                            y_pred,
                                            average='weighted',
                                            labels=labels)
        print('F1 score {}'.format(weighted_f1))

        def _sort_key(name):
            # Order by everything after the first character, then by the
            # first character itself.
            return (name[1:], name[0])

        sorted_labels = sorted(labels, key=_sort_key)
        summary = metrics.flat_classification_report(self.y_test,
                                                     y_pred,
                                                     labels=sorted_labels,
                                                     digits=3)
        print(summary)
Esempio n. 11
0
def train(file_path: str, model_path: str = None):
    """
    Fit a CRF on the records read from ``file_path`` and report validation F1.

    The records are split 75/25 (fixed seed) into training and validation
    parts. When ``model_path`` is truthy the fitted model is also written
    there with ``joblib.dump``.

    :return: the fitted ``CRF`` model
    """
    records = read_file(file_path)
    train_part, val_part = train_test_split(records,
                                            test_size=0.25,
                                            random_state=42)

    X_train, y_train = addresses_to_features(train_part)
    X_val, y_val = addresses_to_features(val_part)

    crf_options = dict(c1=0.2,
                       c2=0.2,
                       max_iterations=100,
                       all_possible_transitions=True)
    crf = CRF(**crf_options)
    crf.fit(X_train, y_train)

    # Score the validation predictions on every label except "O".
    scored_labels = [l for l in LABELS if l != "O"]
    predictions = crf.predict(X_val)
    f1_score = metrics.flat_f1_score(y_val,
                                     predictions,
                                     average="weighted",
                                     labels=scored_labels)
    print("Flat F1-Score on validation set = {}".format(f1_score))

    if not model_path:
        return crf

    joblib.dump(crf, model_path)
    print("Save model to {}".format(model_path))
    return crf
Esempio n. 12
0
def test_attributes(xseq, yseq):
    """Check that the CRF's fitted attributes are None before fit and filled after."""
    crf = CRF()
    unfitted_attrs = (
        "tagger_",
        "size_",
        "classes_",
        "num_attributes_",
        "attributes_",
        "state_features_",
        "transition_features_",
    )
    for attr_name in unfitted_attrs:
        assert getattr(crf, attr_name) is None

    crf.fit([xseq] * 20, [yseq] * 20)

    assert crf.tagger_ is not None
    assert crf.size_ > 1000
    assert set(crf.classes_) == {"sunny", "rainy"}

    assert crf.num_attributes_ > 0
    assert len(crf.attributes_) == crf.num_attributes_
    assert all(crf.attributes_)
    assert "clean" in crf.attributes_

    # State features: float weights keyed by (attribute, label).
    state_weights = crf.state_features_
    assert len(state_weights) > 0
    assert all(isinstance(w, float) for w in state_weights.values())
    assert all(
        attr in crf.attributes_ and label in crf.classes_
        for attr, label in state_weights
    ), state_weights

    # Transition features: float weights keyed by (label_from, label_to).
    transition_weights = crf.transition_features_
    assert len(transition_weights) > 0
    assert all(isinstance(w, float) for w in transition_weights.values())
    assert all(
        src in crf.classes_ and dst in crf.classes_
        for src, dst in transition_weights
    ), transition_weights
def main():
    """Train a CRF tagger on the CSV at ``args.input`` and pickle it to ``args.output``.

    The tagged sentences are split 95/5 (fixed seed). The model is fitted on
    the training part only, written to disk, and then handed to ``TestData``
    together with both parts.
    """
    df = pd.read_csv(args.input)
    tagged_sentence = Preparing_tagged_data(df)
    # NOTE(review): the narrowed frame is not used again in this function.
    df = df[['ID', 'FORM', 'XPOSTAG']]
    # printing details
    printing_details(tagged_sentence)

    train_set, test_set = train_test_split(tagged_sentence, test_size=0.05, random_state=7)

    # Fit on the training split only. Fitting on the full ``tagged_sentence``
    # list put the held-out sentences into the training data and made the
    # test-set numbers reported below meaningless.
    X_train, y_train = prepareData(train_set)
    X_test, y_test = prepareData(test_set)

    crf = CRF(
        algorithm='l2sgd',
        c2=0.1,
        max_iterations=1000,
        all_possible_transitions=True)

    crf.fit(X_train, y_train)
    print(crf)

    print("Saving Model .....")
    # Save the Model to file in the current working directory
    Pkl_Filename = args.output
    with open(Pkl_Filename, 'wb') as model_file:
        pickle.dump(crf, model_file)

    print("Model Saved at "+ Pkl_Filename)
    print()
    print("Checking the Algoritham's Performance \n")
    TestData(crf, X_train, y_train, X_test, y_test)
Esempio n. 14
0
    def train_pos_tagger(self, path):
        """Train a CRF POS tagger on the NLTK treebank sample and pickle it.

        The first 80% of the tagged sentences are used for training. The
        fitted model is written to ``path`` and stored on
        ``self.classifier``.

        :param path: destination file for the pickled model
        """
        # Just to make sure
        nltk.download('treebank')

        tagged_sentences = treebank.tagged_sents()

        train_size = int(.80 * len(tagged_sentences))
        training_sentences = tagged_sentences[:train_size]

        X_train, y_train = self.transform_to_dataset(training_sentences)

        model = CRF()

        print('Training started...')
        model.fit(X_train, y_train)
        print('Training finished.')

        # Save classifier to file. The context manager closes the handle even
        # when pickling fails part-way through.
        with open(path, 'wb') as model_pkl:
            pickle.dump(model, model_pkl)

        print("POSTagger saved.")

        self.classifier = model
Esempio n. 15
0
def write_to_CoNLL(mdl_file_name, sentence2features, test_sentences, write_path):
    """Tag ``test_sentences`` with a stored CRF and append the result to ``write_path``.

    When ``mdl_file_name`` ends in ``'2'``, features come from
    ``sent2features_second_guess`` using a second CRF whose file name is the
    same with the trailing ``'2'`` replaced by ``'1'``. Otherwise
    ``sentence2features`` is applied directly.

    Output format: one ``token<TAB>predicted_tag`` line per token (the token
    is element 0 of each sentence item) and a blank line after each sentence.

    :param mdl_file_name: value passed to ``CRF(model_filename=...)``
    :param sentence2features: callable turning a sentence into CRF features
    :param test_sentences: sequence of sentences to tag
    :param write_path: file that the tagged output is appended to
    """
    def _make_crf(model_filename):
        # Both models are created with the same hyper-parameters and differ
        # only in the model file they point at.
        return CRF(algorithm='lbfgs',
                   c1=0.0001,
                   c2=0.0001,
                   max_iterations=100,
                   all_possible_transitions=False,
                   model_filename=model_filename)

    cond_rand_mdl = _make_crf(mdl_file_name)

    # endswith() is also safe for an empty file name, where indexing the last
    # character raised IndexError.
    if mdl_file_name.endswith('2'):
        old_crf = _make_crf(mdl_file_name[:-1] + '1')
        X_test_local = [sent2features_second_guess(s, sentence2features, old_crf)
                        for s in test_sentences]
    else:
        X_test_local = [sentence2features(s) for s in test_sentences]

    predictions = cond_rand_mdl.predict(X_test_local)
    with open(write_path, 'a') as f:
        for sent, preds in zip(test_sentences, predictions):
            for j, item in enumerate(sent):
                # preds is indexed (not zipped) so a length mismatch between
                # a sentence and its predictions still raises.
                f.write('{}\t{}\n'.format(item[0], preds[j]))
            f.write('\n')
Esempio n. 16
0
def entity_crf_train(my_subjects):
    """Fill subject placeholders into the training data, fit a CRF and pickle it.

    ``X`` and ``y`` are not parameters: they are read from the enclosing
    (module) scope and are modified in place by this function.

    :param my_subjects: sequence of subject strings; one is drawn at random
        for every placeholder and the whole sequence is pickled alongside
        the model
    """
    for i in range(0, len(X)):
        # NOTE(review): the range below is computed once, before any splice.
        # If a multi-word subject lengthens X[i], the positions past the
        # original length are never visited; if a subject splits into zero
        # words the list shrinks and a later X[i][j] may raise IndexError.
        for j in range(0, len(X[i])):
            # Containment test, not equality: presumably X[i][j] is a token
            # string and any token containing 'sub' counts as a placeholder
            # -- TODO confirm against the data.
            if 'sub' in X[i][j]:
                subj = my_subjects[np.random.randint(len(my_subjects))]
                subj = subj.split()
                # Replace the single placeholder by the subject's words, and
                # its label by one 'subject' label per inserted word.
                X[i] = X[i][:j] + subj + X[i][j + 1:]
                y[i] = y[i][:j] + ['subject'] * len(subj) + y[i][j + 1:]
        # Keep only the first 10 items of every sequence and label list.
        X[i] = X[i][0:10]
        y[i] = y[i][0:10]

    crf = CRF(c1=0.1,
              c2=0.01,
              max_iterations=200,
              all_possible_transitions=True)

    print(".....Training entity extraction model.....")
    crf.fit(X, y)
    print(".....Trained entity extraction model.....")

    # Output goes to <parent of the current working directory>/full_model/.
    working_directory = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
    with open(working_directory + '/full_model/crf_model.pkl',
              'wb') as pickle_file:
        pickle.dump(crf, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)
    with open(working_directory + '/full_model/subjects.pkl',
              'wb') as pickle_file:
        pickle.dump(my_subjects, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)
Esempio n. 17
0
def test_attributes(xseq, yseq):
    """Verify the CRF's attributes are None when unfitted and populated once fitted."""
    crf = CRF()
    for attr_name in ('tagger_', 'size_', 'classes_', 'num_attributes_',
                      'attributes_', 'state_features_',
                      'transition_features_'):
        assert getattr(crf, attr_name) is None

    crf.fit([xseq] * 20, [yseq] * 20)

    assert crf.tagger_ is not None
    assert crf.size_ > 1000
    assert set(crf.classes_) == {'sunny', 'rainy'}

    assert crf.num_attributes_ > 0
    assert len(crf.attributes_) == crf.num_attributes_
    assert all(crf.attributes_)
    assert 'clean' in crf.attributes_

    # Every state feature is a float weight keyed by (attribute, label).
    state_weights = crf.state_features_
    assert len(state_weights) > 0
    assert all(isinstance(w, float) for w in state_weights.values())
    state_keys_ok = all(attr in crf.attributes_ and label in crf.classes_
                        for attr, label in state_weights)
    assert state_keys_ok, state_weights

    # Every transition feature is a float weight keyed by (from, to) labels.
    transition_weights = crf.transition_features_
    assert len(transition_weights) > 0
    assert all(isinstance(w, float) for w in transition_weights.values())
    transition_keys_ok = all(src in crf.classes_ and dst in crf.classes_
                             for src, dst in transition_weights)
    assert transition_keys_ok, transition_weights
Esempio n. 18
0
class CRFBased:
    '''CRF based information retrieval.
    The model is similar to the Default model
    used in homework 2 and 3'''
    def __init__(self, load, n_train, n_test):
        '''Store the data source and sizes and create the (unfitted) CRF.

        ``n_train`` and ``n_test`` are converted with ``int``.
        '''
        self.load = load
        self.crf = CRF(algorithm='lbfgs',
                       c1=0.1,
                       c2=0.1,
                       max_iterations=100,
                       all_possible_transitions=False)
        self.n_train = int(n_train)
        self.n_test = int(n_test)

    def load_data(self):
        '''Return ``(X_train, y_train, X_test, y_test)`` from ``prepare_crf_dataset``.'''
        X_train, y_train, X_test, y_test = prepare_crf_dataset(
            self.load, self.n_train, self.n_test)
        return X_train, y_train, X_test, y_test

    def fit(self, X, y):
        '''Fit the CRF and remember every learned label except ``'O'``.'''
        self.crf.fit(X, y)
        # Filter instead of list.remove('O'): remove() raises ValueError when
        # the training data has no 'O' label, which would discard an
        # otherwise successful (and possibly long) training run.
        self.labels = [label for label in self.crf.classes_ if label != 'O']

    def predict(self, X):  #dataset = Train or Test
        '''Return the CRF's predictions for ``X``.'''
        pred = self.crf.predict(X)
        return pred

    def evaluate(self, y_true, y_pred):
        '''Print a flat classification report over the labels stored by ``fit``.'''
        print("Final Scores for CRF Based Modes:")
        print(
            flat_classification_report(y_pred=y_pred,
                                       y_true=y_true,
                                       labels=self.labels))
Esempio n. 19
0
    def build(sequences, labels, **kwargs):
        """
        Train a CRF on x/y pairs and wrap it in a sequence classifier.

        :param sequences: A list of sequences, with each member of the sequence
                   represented as features
        :type sequences: list of list of dict
        :param labels: The corresponding labels for each sequence
        :type labels: list of list of str
        :param kwargs: arguments to override the defaults given to the
                       underlying CRF
        :return: A trained sequence classifier based on the provided training
                 data
        :rtype: SequenceClassifier
        """
        crf_options = dict(
            algorithm=DEFAULT_ALGORITHM,
            c1=DEFAULT_C1,
            c2=DEFAULT_C2,
            max_iterations=DEFAULT_MAX_ITERATIONS,
            all_possible_transitions=DEFAULT_ALL_POSSIBLE_TRANSITIONS,
        )
        # Caller-supplied options win over the defaults; updating with an
        # empty mapping is a no-op.
        crf_options.update(kwargs)

        crf = CRF(**crf_options)
        crf.fit(sequences, labels)
        return SequenceClassifier(crf)
Esempio n. 20
0
def test_sklearn_crfsuite(xseq, yseq):
    """Explain a small fitted CRF and check the text, HTML and dataframe output."""
    crf = CRF(c1=0.0, c2=0.1, max_iterations=50)
    crf.fit([xseq], [yseq])

    expl = explain_weights(crf)
    text, html = format_as_all(expl, crf)

    expected_in_text = (
        "y='sunny' top features",
        "y='rainy' top features",
        "Transition features",
        "sunny   -0.130    0.696",
        u'+0.124  солнце:не светит',
    )
    for fragment in expected_in_text:
        assert fragment in text

    html_nospaces = html.replace(' ', '').replace("\n", '')
    assert u'солнце:не светит' in html
    assert '<th>rainy</th><th>sunny</th>' in html_nospaces

    # The dataframe checks only run when eli5 exposes the dataframe
    # formatters; otherwise the test ends here.
    try:
        from eli5 import format_as_dataframe, format_as_dataframes
    except ImportError:
        return

    from .test_formatters_as_dataframe import check_targets_dataframe
    df_dict = format_as_dataframes(expl)
    check_targets_dataframe(df_dict['targets'], expl)
    df_transition = df_dict['transition_features']
    transition = expl.transition_features
    print(df_transition)
    assert list(transition.class_names) == ['rainy', 'sunny']

    # (dataframe column, dataframe row) paired with the coefficient index.
    cells = (
        ('rainy', 'rainy', (0, 0)),
        ('sunny', 'rainy', (0, 1)),
        ('rainy', 'sunny', (1, 0)),
    )
    for column, row, (i, j) in cells:
        assert np.isclose(df_transition[column][row], transition.coef[i, j])
Esempio n. 21
0
 def __init__(self, model_file_path):
     """Resolve the model file path and create the CRF that points at it."""
     expanded = path.expanduser(model_file_path)
     self.model_file_path = path.abspath(expanded)

     crf_options = {
         'algorithm': 'l2sgd',
         'c2': 0.1,
         'max_iterations': 1000,
         'all_possible_transitions': True,
         'model_filename': self.model_file_path,
     }
     self.model = CRF(**crf_options)
Esempio n. 22
0
    def __init__(
        self,
        hyper_params: Dict[str, float] = None,
        model_path: str = None,
    ):
        """Load a stored model, or create a new CRF from ``hyper_params``.

        :param hyper_params: optional overrides; recognised keys are
            ``"algorithm"`` (default ``"lbfgs"``), ``"c1"`` (0.1), ``"c2"``
            (0.1), ``"max_iterations"`` (100) and
            ``"all_possible_transitions"`` (True)
        :param model_path: when truthy, ``load_model`` is called with it and
            ``hyper_params`` is ignored
        """
        if model_path:
            self.load_model(model_path=model_path)
            return

        params = hyper_params or {}

        # The transitions flag used to be read from the misspelled key
        # "all_possible trainsitions" and only when "max_iterations" was
        # present, so passing max_iterations alone raised KeyError. The
        # misspelled key is still honoured as a fallback for old callers.
        apt = params.get(
            "all_possible_transitions",
            params.get("all_possible trainsitions", True),
        )

        self.fe = FeatureExtractor()

        self.crf = CRF(
            algorithm=params.get("algorithm", "lbfgs"),
            c1=params.get("c1", 0.1),
            c2=params.get("c2", 0.1),
            max_iterations=params.get("max_iterations", 100),
            all_possible_transitions=apt,
        )
Esempio n. 23
0
    def train(self, inputfile, features_names_list, annotation_column):
        """
        Create the model selected by ``self.modelname`` and fit it on training data.

        :param inputfile: path to inputfile containing the training data
        :param features_names_list: list of indications of all feature columns that should be used
        :param annotation_column: indication of column with annotations
        :type inputfile: string
        :type features_names_list: list
        :type annotation_column: string
        """
        # One factory per supported model name. The lambdas defer the
        # constructor lookup until the matching entry is actually called.
        factories = {
            'logreg': lambda: LogisticRegression(),
            'naivebayes': lambda: BernoulliNB(),
            'svm': lambda: LinearSVC(),
            'crf': lambda: CRF(algorithm='lbfgs', c1=0.1, c2=0.1,
                               max_iterations=100,
                               all_possible_transitions=True),
        }
        make_model = factories.get(self.modelname)
        if make_model is not None:
            # An unrecognised name leaves self.model as it was.
            self.model = make_model()

        # Stored before the features are read.
        self.features_names_list = features_names_list

        features = self.get_features(inputfile)
        targets = self.get_labels(inputfile, annotation_column)

        self.model.fit(features, targets)
Esempio n. 24
0
File: CRF.py Progetto: eshwag/NER
    def train1(self, data, y, tag):
        """Fit a CRF on features built from ``data``, pickle it and report train scores.

        The model is written to ``finalized_model.sav`` in the current
        working directory. The classification report and the accuracy are
        computed on the training data itself.

        :param data: input handed to ``Features`` together with
            ``self.num_features``
        :param y: not used in this method
        :param tag: not used in this method
        """
        # Features as conditional random field accepts
        feaobj = Features(data, self.num_features)
        x_train, y_train = feaobj.get
        print("labelled data")
        # Using conditional random field as features
        crf = CRF(algorithm='lbfgs',
                  c1=0.1,
                  c2=0.1,
                  max_iterations=100,
                  all_possible_transitions=False)
        print(crf)
        crf.fit(x_train, y_train)

        # Saving the model which is trained. The context manager closes the
        # file; the bare open() handle was never closed.
        filename = 'finalized_model.sav'
        with open(filename, 'wb') as model_file:
            pickle.dump(crf, model_file)

        # Prediction on train
        pred = crf.predict(x_train)

        # printing classification report and Accuracy
        print('\n \n Prediction On Trained Data:\n \n',
              flat_classification_report(y_train, pred))
        print('Accuracy:', flat_accuracy_score(y_train, pred))
Esempio n. 25
0
def main(path_train, path_test, path_pred, path_crf, take_first, dev_size):
    """Train a CRF tagger, save it, tag the test corpus and write the predictions.

    :param path_train: training corpus, read with ``load_corpus``
    :param path_test: test corpus, read with ``load_corpus``
    :param path_pred: destination for the predicted sentences
    :param path_crf: destination for the dumped feature extractor and CRF
    :param take_first: forwarded to ``load_corpus`` for the training corpus
    :param dev_size: when truthy, that share of the training data is split
        off and passed to ``fit`` as the dev set
    """
    print("loading train corpus..")
    _, train_raw, y = load_corpus(path_train, take_first=take_first)

    print("extracting features from train corpus..")
    fe = TaggerFeatureExtractor()
    X = fe.fit_transform(tqdm(train_raw))

    print("training..")
    crf = CRF(algorithm='ap', verbose=True, max_iterations=10)
    # Without a dev split both dev arguments stay None.
    X_dev = y_dev = None
    if dev_size:
        X, X_dev, y, y_dev = train_test_split(X, y, test_size=dev_size)
    crf.fit(X, y, X_dev, y_dev)

    print("saving..")
    bundle = {'fe': fe, 'crf': crf}
    joblib.dump(bundle, path_crf, compress=2)

    print("loading test corpus..")
    corpus, test_raw, y_test = load_corpus(path_test)

    print("extracting features from test corpus..")
    X_test = fe.transform(test_raw)

    print("predicting..")
    y_pred = crf.predict(tqdm(X_test))

    print("saving results..")
    conll.write_sents(y_pred_to_sents_pred(corpus, y_pred), path_pred)
Esempio n. 26
0
    def __init__(self, algo: str = 'lbfgs', min_freq: int = 0,
                 all_states: bool = False, max_iter: int = 100,
                 epsilon: float = 1e-5, delta: float = 1e-5):
        """
        Remember the training options and build the CRF from them.

        :param algo: optimization algorithm (lbfgs, l2sgd, ap, pa, arow)
        :param min_freq: threshold of ignoring feature
        :param all_states: if True, consider combinations
                           of missing features and labels
        :param max_iter: max iteration size
        :param epsilon: forwarded to the CRF as ``epsilon``
                        (NOTE(review): presumably a convergence condition
                        rather than a learning rate; confirm against the
                        sklearn-crfsuite documentation)
        :param delta: forwarded to the CRF as ``delta``
                      (stop training threshold)
        """
        self._algo, self._min_freq = algo, min_freq
        self._all_states, self._max_iter = all_states, max_iter
        self._epsilon, self._delta = epsilon, delta

        crf_options = {
            'algorithm': algo,
            'min_freq': min_freq,
            'all_possible_states': all_states,
            'max_iterations': max_iter,
            'epsilon': epsilon,
            'delta': delta,
        }
        self.model = CRF(**crf_options)
Esempio n. 27
0
def training_crf(training_cue, data, dataset):
    """Cross-validate and fit a CRF on the sentences of ``data``, then pickle it.

    :param training_cue: forwarded to the feature and label extractors; the
        value ``'cue'`` selects the cue model file name, anything else the
        ``sco`` one
    :param data: source data, handed to ``get_frase`` and ``get_negaciones``
    :param dataset: sub-directory name under ``<cwd>/models/``
    :return: tuple ``(y, pred, crf)`` with the labels, the 5-fold
        cross-validated predictions and the model fitted on all data
    """
    sentence_getter = get_frase(data)
    frases = sentence_getter.get_frase

    # Return value is not used here; the call is kept as in the original.
    get_negaciones(data)

    X = [sent2features(frase, training_cue) for frase in frases]
    y = [sent2labels(frase, training_cue) for frase in frases]

    crf_options = dict(algorithm='lbfgs',
                       c1=0.1,
                       c2=0.1,
                       max_iterations=100,
                       all_possible_transitions=True,
                       verbose=True)
    crf = CRF(**crf_options)

    pred = cross_val_predict(estimator=crf, X=X, y=y, cv=5)

    crf.fit(X, y)

    file_part = ('/crf_cue_model.pkl' if training_cue == 'cue'
                 else '/crf_sco_model.pkl')
    model_filename = os.getcwd() + '/models/' + dataset + file_part

    with open(model_filename, 'wb') as model_out:
        pickle.dump(crf, model_out)

    return (y, pred, crf)
Esempio n. 28
0
def test_crf(xseq, yseq, algorithm):
    """Fit on a single sequence and expect the same labels back when predicting it."""
    model = CRF(algorithm=algorithm)
    model.fit([xseq], [yseq])

    predicted = model.predict([xseq])
    # Averaged Perceptron is regularized too much, so it is exempt from the
    # exact-match check.
    exact_match_expected = algorithm != 'ap'
    if exact_match_expected:
        assert predicted == [yseq]
Esempio n. 29
0
    def train(self, model_name, tagged_sentences):
        """Fit a CRF on the first 75% of the sentences, evaluate it and pickle it.

        The remaining 25% are used to print the flat accuracy.

        :param model_name: destination file for the pickled model
        :param tagged_sentences: sliceable sequence of tagged sentences
        """
        # Split the dataset for training and testing
        cutoff = int(.75 * len(tagged_sentences))
        training_sentences = tagged_sentences[:cutoff]
        test_sentences = tagged_sentences[cutoff:]

        X_train, y_train = transform_to_dataset(training_sentences)
        X_test, y_test = transform_to_dataset(test_sentences)
        print(len(X_train))
        print(len(X_test))

        print("Training Started........")
        print("it will take time according to your dataset size..")
        model = CRF()
        model.fit(X_train, y_train)
        print("Training Finished!")

        print("Evaluating with Test Data...")
        y_pred = model.predict(X_test)
        print("Accuracy is: ")
        print(metrics.flat_accuracy_score(y_test, y_pred))

        # The context manager closes the file; the bare open() handle was
        # never closed.
        with open(model_name, 'wb') as model_file:
            pickle.dump(model, model_file)
        print("Model Saved!")
Esempio n. 30
0
 def __init__(self, train, dev, test):
     """Create the CRF and build the feature and label arrays.

     NOTE(review): ``train``, ``dev`` and ``test`` are not read here; the
     sentences come from a name ``sent`` that is not defined in this
     method, so it must exist in an enclosing scope. Confirm which split
     was intended.
     """
     crf_options = dict(algorithm='lbfgs',
                        c1=0.1,
                        c2=0.1,
                        max_iterations=100,
                        all_possible_transitions=False)
     self.model = CRF(**crf_options)

     features = list(map(self.sent2features, sent))
     self.X = np.array(features)
     labels = list(map(self.sent2labels, sent))
     self.y = np.array(labels)
Esempio n. 31
0
 def __init__(self, is_save=False):
     """Create the CRF and record the save flag and the model file name."""
     crf_options = {
         'algorithm': 'lbfgs',
         'c1': 0.1,
         'c2': 0.1,
         'max_iterations': 100,
         'all_possible_transitions': True,
     }
     self.crf = CRF(**crf_options)
     self.is_save, self.save_model = is_save, "crf.model"
Esempio n. 32
0
class CRFNerModel(object):
    """Wrapper around a ``CRF`` tagger with pickle persistence and span decoding."""

    def __init__(self, is_save=False):
        """Create an untrained tagger.

        Args:
            is_save: when true, ``fit`` pickles the trained model to
                ``self.save_model`` right after training.
        """
        self.crf = CRF(algorithm='lbfgs',
                       c1=0.1,
                       c2=0.1,
                       max_iterations=100,
                       all_possible_transitions=True)
        self.is_save = is_save
        self.save_model = "crf.model"

    def fit(self, train_x, train_y):
        """Fit the underlying CRF and optionally persist it to disk."""
        self.crf.fit(train_x, train_y)

        if self.is_save:
            self.dump_model()

    def predict(self, input_x):
        """Tag one sequence.

        ``input_x`` is materialised into a list, converted with
        ``sent2features`` and passed to the CRF as a one-sequence batch.
        Returns the CRF's batch prediction, so the tags for ``input_x`` are
        element ``0`` of the result.
        """
        input_x = list(input_x)
        input_feature = [sent2features(input_x)]
        return self.crf.predict(input_feature)

    def dump_model(self):
        """Pickle the CRF to ``self.save_model``."""
        # Serialise before opening the file so a pickling error cannot
        # truncate an existing model file.
        model_data = pickle.dumps(self.crf)
        with open(self.save_model, "wb") as f:
            f.write(model_data)

    def load_model(self):
        """Replace ``self.crf`` with the model pickled in ``self.save_model``."""
        with open(self.save_model, "rb") as f:
            model_data = f.read()
        # NOTE: unpickling executes arbitrary code; only load model files
        # that come from a trusted source.
        self.crf = pickle.loads(model_data)

    def predict_list(self, input_list):
        """Run the CRF on an already-featurised batch of sequences."""
        return self.crf.predict(input_list)

    def extract_ner(self, input_x):
        """Tag ``input_x`` and return the entity spans found in it.

        Returns:
            A list of ``(start, end, label, input_x[start:end])`` tuples,
            with ``end`` exclusive, in order of appearance.
        """
        res = self.predict(input_x)
        return self._decode_spans(res[0], input_x)

    @staticmethod
    def _decode_spans(tags, tokens):
        """Turn a ``B-``/``I-``/``O`` tag sequence into entity spans.

        A span opens at a ``B-<label>`` tag and closes at the next ``O`` tag,
        the next ``B-`` tag, or the end of the sequence. A non-``B`` tag whose
        label differs from the open span's label discards that span.

        Args:
            tags: iterable of tag strings, one per token.
            tokens: sliceable sequence the spans are cut from.

        Returns:
            A list of ``(start, end, label, tokens[start:end])`` tuples.
        """
        spans = []
        start = None
        label = None
        end = 0
        for i, tag in enumerate(tags):
            end = i + 1
            if tag == "O":
                if start is not None:
                    spans.append((start, i, label, tokens[start:i]))
                    start = None
                    label = None
                continue

            # partition() splits on the first hyphen only, so labels that
            # themselves contain "-" (or tags with no hyphen at all) no
            # longer raise ValueError.
            prefix, _, tag_label = tag.partition("-")
            if prefix == "B":
                if start is not None:
                    spans.append((start, i, label, tokens[start:i]))
                start = i
                label = tag_label
            elif label != tag_label:
                # Inconsistent continuation tag: drop the open span.
                start = None
                label = None

        # Flush a span that runs up to the end of the sequence; without this
        # the last entity was silently lost.
        if start is not None:
            spans.append((start, end, label, tokens[start:end]))
        return spans
Esempio n. 33
0
def test_crf_score(xseq, yseq, algorithm):
    """Check that a CRF scores (near-)perfectly on the sequence it was fit on."""
    model = CRF(algorithm)
    model.fit([xseq], [yseq])
    accuracy = model.score([xseq], [yseq])

    if algorithm == "ap":
        # Averaged Perceptron is regularized too much
        assert accuracy > 0.8
    else:
        assert accuracy == 1.0
Esempio n. 34
0
def test_crf_pickling(xseq, yseq, algorithm):
    """Check that a fitted CRF survives a pickle round trip."""
    original = CRF(algorithm=algorithm)
    original.fit([xseq], [yseq])

    payload = pickle.dumps(original, protocol=pickle.HIGHEST_PROTOCOL)
    restored = pickle.loads(payload)

    accuracy = restored.score([xseq], [yseq])
    if algorithm == "ap":
        # Averaged Perceptron is regularized too much
        assert accuracy > 0.8
    else:
        assert accuracy == 1.0
    assert restored.algorithm == algorithm
Esempio n. 35
0
def test_crf_verbose(xseq, yseq, algorithm, use_dev):
    """Fit in verbose mode, with or without a dev set, and check predictions."""
    model = CRF(algorithm, verbose=True)

    # The dev set, when requested, is just the training sequence again.
    X_dev, y_dev = ([xseq], [yseq]) if use_dev else (None, None)

    model.fit(X=[xseq, xseq], y=[yseq, yseq], X_dev=X_dev, y_dev=y_dev)
    predicted = model.predict([xseq])

    # Averaged Perceptron is regularized too much, so its output is not checked
    if algorithm == "ap":
        return
    assert predicted == [yseq]
Esempio n. 36
0
def test_crf_marginals(xseq, yseq, algorithm):
    """Check the shape and normalisation of ``predict_marginals`` output."""
    model = CRF(algorithm)
    model.fit([xseq], [yseq])

    batch_marginals = model.predict_marginals([xseq])
    assert len(batch_marginals) == 1
    sequence_marginals = batch_marginals[0]
    assert len(sequence_marginals) == len(yseq)

    known_labels = set(model.tagger_.labels())
    for distribution in sequence_marginals:
        # Each position maps every known label to a probability summing to 1.
        assert isinstance(distribution, dict)
        assert set(distribution) == known_labels
        assert abs(sum(distribution.values()) - 1.0) < 1e-6
Esempio n. 37
0
def test_crf_model_filename(xseq, yseq, tmpdir):
    """Check that ``model_filename`` is written by fit and can be reloaded."""
    model_path = os.path.join(str(tmpdir), "foo.crfsuite")
    assert not os.path.exists(model_path)

    # model file is created at a specified location
    trained = CRF(model_filename=model_path)
    trained.fit([xseq], [yseq])
    assert os.path.exists(model_path)

    # it is possible to load the model just by passing a file name
    reloaded = CRF(model_filename=model_path)
    assert reloaded.score([xseq], [yseq]) == 1.0

    # crf is picklable
    payload = pickle.dumps(trained, protocol=pickle.HIGHEST_PROTOCOL)
    unpickled = pickle.loads(payload)
    assert unpickled.score([xseq], [yseq]) == 1.0
Esempio n. 38
0
def main(arg):
    """Build train/test datasets and fit the model selected by ``arg``.

    Args:
        arg: mapping passed through to ``transform_to_dataset``; its
            ``'model_name'`` entry selects the ``"crf"`` or ``"SVM"`` branch.

    NOTE(review): ``training_sentences`` and ``test_sentences`` are not
    defined in this function -- presumably module-level names; confirm.
    """
    X_train, y_train = transform_to_dataset(training_sentences, arg)
    X_test, y_test = transform_to_dataset(test_sentences, arg)
    print(len(X_train))
    print(len(X_test))
    print(X_train[0])

    if arg['model_name'] == "crf":
        model = CRF()
        model.fit(X_train, y_train)
    elif arg['model_name'] == "SVM":
        v = DictVectorizer(sparse=False)
        X_tr = v.fit_transform(X_train)
        # Reuse the vocabulary learned from the training data. Re-fitting on
        # the test set would produce columns that do not line up with X_tr.
        X_ts = v.transform(X_test)

    sentence = ['I', 'am', 'Bob', '!']
Esempio n. 39
0
def test_crf_dev_bad_arguments(xseq, yseq):
    """Check that ``fit`` rejects a third positional argument given alone.

    The third positional argument is presumably ``X_dev`` supplied without a
    matching ``y_dev``.
    """
    model = CRF()
    X, y = [xseq] * 20, [yseq] * 20
    with pytest.raises(ValueError):
        model.fit(X, y, X)
Esempio n. 40
0
def test_predict_without_fit(xseq, algorithm):
    """Check that predicting with a CRF that was never fitted raises."""
    unfitted = CRF(algorithm)
    with pytest.raises(Exception):
        unfitted.predict([xseq])