Esempio n. 1
0
    print("Training seq2seq model for 2nd level inference...")
    seq_model_2 = NMTDriver(use_char=True)
    seq_model_2.train(train_non_norm_sentences, train_norm_labels)
else:
    print("Training 2nd level CRF models...")
    seq_model_2 = CRFExtractor(attribute_names=attributes,
                               is_multi_label=False)
    seq_model_2.train(train_non_norm_sentences, train_norm_labels)

# Persist the trained 2nd-level sequence model. Depending on the branch taken
# above, seq_model_2 is either the seq2seq NMTDriver or the CRFExtractor, so
# the progress message is kept model-agnostic instead of always claiming "CRF".
# NOTE(review): utils.save_data_pkl presumably pickles the object to the given
# path -- confirm that it creates the 'models/' directory if it is missing.
print("Saving 2nd level model...")
utils.save_data_pkl(seq_model_2, 'models/seq_model_2.pkl')

# Build the 2nd-level classifier, fit it on the training sentences with
# `train_norm_labels`, then write the fitted object to models/classifier.pkl.
print("Training classifier for 2nd level...")
classifier_params = dict(
    attribute_names=attributes,
    is_multi_label=False,
    num_features=20000,
    min_ngram=1,
    max_ngram=3,
)
classifier = Classifier(**classifier_params)
classifier.train(train_sentences, train_norm_labels)

print("Saving classifier...")
utils.save_data_pkl(classifier, 'models/classifier.pkl')

# Two-stage inference: the 1st-level model runs on the validation sentences,
# and its output for the first attribute becomes the 2nd-level model's input.
print("Predicting with 1st level CRF models...")
preds_1 = seq_model_1.predict(valid_sentences)

print("Predicting with 2nd level models...")
# For every 1st-level prediction, take the first entry stored under
# attributes[0], tokenize it and re-join the tokens with single spaces.
# Predictions with no entry for that attribute contribute an empty string so
# the 2nd-level input stays aligned one-to-one with preds_1.
second_level_inputs = [
    ''
    if len(x[attributes[0]]) == 0
    else ' '.join(utils.get_tokens(x[attributes[0]][0]))
    for x in preds_1
]
preds_2 = seq_model_2.predict(second_level_inputs)
Esempio n. 2
0
 # Prepare the WM value counts, the WM attribute -> values map, the
 # train/validation index split and the merged MSM + WM value map.
 wm_attr_value_counts = pputils.generate_wm_attr_value_counts(
     df2_new, wm_attr)

 print("Creating WM attributes to values map...")
 # msm_attr is passed de-duplicated (as a set); min_count=10 is forwarded to
 # the helper (presumably a frequency cut-off -- confirm in pputils).
 wm_attr_vals = pputils.generate_wm_attr_values_normalized(
     wm_sentences,
     wm_pcf_labels,
     set(msm_attr),
     wm_attr_value_counts,
     min_count=10,
 )

 print("Creating train-test indices...")
 # Split positions instead of the sentences themselves so that the same
 # indices can later select both sentences and labels. 20% of the positions
 # go to validation; random_state=0 keeps the split reproducible.
 sentence_positions = range(len(wm_sentences))
 index_split = train_test_split(
     sentence_positions, test_size=0.2, random_state=0)
 train_indices, valid_indices = index_split

 print("Merging MSM and WM attribute value maps...")
 combined_attr_vals = pputils.combine_msm_attr_values(
     msm_wm_map, msm_attr_vals, wm_attr_vals)
 
 # Both models are configured for multi-label output over the distinct MSM
 # attribute names. Each one receives its own freshly built list.
 print("Initializing extractor...")
 extractor_kwargs = dict(
     attribute_names=list(set(msm_attr)),
     attr_values_map=combined_attr_vals,
     is_multi_label=True,
 )
 extractor = CRFExtractor(**extractor_kwargs)

 print("Initializing classifier...")
 classifier_kwargs = dict(
     attribute_names=list(set(msm_attr)),
     is_multi_label=True,
     num_features=20000,
     min_ngram=1,
     max_ngram=3,
 )
 classifier = Classifier(**classifier_kwargs)
 
 print("Creating training/validation data...")
 # Materialize each split by looking up the sampled positions in the
 # sentence and label sequences.
 train_sentences = [wm_sentences[pos] for pos in train_indices]
 valid_sentences = [wm_sentences[pos] for pos in valid_indices]
 train_labels = [wm_pcf_labels[pos] for pos in train_indices]
 valid_labels = [wm_pcf_labels[pos] for pos in valid_indices]

 # Report the size of the training and validation splits.
 n_train, n_valid = len(train_sentences), len(valid_sentences)
 print(n_train, n_valid)
 
 # Fit both models on the same training split, announcing each step, then
 # run the extractor over the validation sentences.
 training_steps = (
     ("Training extractor...", extractor),
     ("Training classifier...", classifier),
 )
 for banner, model in training_steps:
     print(banner)
     model.train(train_sentences, train_labels)

 print("Fetching validation results...")
 extraction_results = extractor.predict(valid_sentences)