random_state=0)

def _take(items, positions):
    """Return the elements of ``items`` found at ``positions``, in order."""
    return [items[pos] for pos in positions]


# Apply the same train/validation index lists to every parallel dataset so
# that entry k of each resulting list comes from the same original position.
train_sentences = _take(sentences, train_indices)
valid_sentences = _take(sentences, valid_indices)
train_non_norm_labels = _take(non_norm_labels, train_indices)
valid_non_norm_labels = _take(non_norm_labels, valid_indices)
train_non_norm_sentences = _take(non_norm_sentences, train_indices)
valid_non_norm_sentences = _take(non_norm_sentences, valid_indices)
train_norm_labels = _take(norm_labels, train_indices)
valid_norm_labels = _take(norm_labels, valid_indices)

# 1st level: extractor built with is_multi_label=True, fitted on
# train_sentences / train_non_norm_labels.
print("Training 1st level CRF models...")
crf_model_1 = CRFExtractor(
    attribute_names=attributes,
    is_multi_label=True,
)
crf_model_1.train(train_sentences, train_non_norm_labels)

# 2nd level: extractor built with is_multi_label=False, fitted on
# train_non_norm_sentences / train_norm_labels.
print("Training 2nd level CRF models...")
crf_model_2 = CRFExtractor(
    attribute_names=attributes,
    is_multi_label=False,
)
crf_model_2.train(train_non_norm_sentences, train_norm_labels)

# 2nd level classifier: fitted on the original train_sentences (not the
# non-norm sentences) against train_norm_labels.
print("Training classifier for 2nd level...")
classifier = Classifier(
    attribute_names=attributes,
    is_multi_label=False,
    num_features=20000,
    min_ngram=1,
    max_ngram=3,
)
classifier.train(train_sentences, train_norm_labels)

print("Predicting with 1st level CRF models...")
Example #2
0
                                                random_state=0)

# Slice every parallel dataset with the shared train/validation index lists;
# each train list is built before its validation counterpart.
train_sentences = [sentences[idx] for idx in train_indices]
valid_sentences = [sentences[idx] for idx in valid_indices]

train_non_norm_labels = [non_norm_labels[idx] for idx in train_indices]
valid_non_norm_labels = [non_norm_labels[idx] for idx in valid_indices]

train_non_norm_sentences = [non_norm_sentences[idx] for idx in train_indices]
valid_non_norm_sentences = [non_norm_sentences[idx] for idx in valid_indices]

train_norm_labels = [norm_labels[idx] for idx in train_indices]
valid_norm_labels = [norm_labels[idx] for idx in valid_indices]

# 1st level: extractor built with is_multi_label=True, fitted on
# train_sentences / train_non_norm_labels.
print("Training 1st level CRF models...")
seq_model_1 = CRFExtractor(
    attribute_names=attributes,
    is_multi_label=True,
)
seq_model_1.train(train_sentences, train_non_norm_labels)

# The trained 1st level model is handed to utils.save_data_pkl before the
# 2nd level model is built.
print("Saving 1st level CRF models...")
utils.save_data_pkl(seq_model_1, 'models/seq_model_1.pkl')

# Pick the 2nd level model according to use_seq_to_seq. Both variants are
# trained with the same arguments, so the train call is shared below.
if use_seq_to_seq:
    print("Training seq2seq model for 2nd level inference...")
    seq_model_2 = NMTDriver(use_char=True)
else:
    print("Training 2nd level CRF models...")
    seq_model_2 = CRFExtractor(
        attribute_names=attributes,
        is_multi_label=False,
    )
seq_model_2.train(train_non_norm_sentences, train_norm_labels)
Example #3
0
 # Sentences and labels produced by pputils from df2_new, wm_attr and
 # wm_msm_map.
 wm_sentences, wm_pcf_labels = pputils.generate_wm_data_labels(
     df2_new,
     wm_attr,
     wm_msm_map,
 )

 print("Getting WM attribute value counts...")
 wm_attr_value_counts = pputils.generate_wm_attr_value_counts(
     df2_new,
     wm_attr,
 )

 print("Creating WM attributes to values map...")
 wm_attr_vals = pputils.generate_wm_attr_values_normalized(
     wm_sentences,
     wm_pcf_labels,
     set(msm_attr),
     wm_attr_value_counts,
     min_count=10,
 )

 # Split positions rather than the data itself, so the same indices can be
 # applied to both the sentences and the labels further down.
 print("Creating train-test indices...")
 train_indices, valid_indices = train_test_split(
     range(len(wm_sentences)),
     test_size=0.2,
     random_state=0,
 )

 print("Merging MSM and WM attribute value maps...")
 combined_attr_vals = pputils.combine_msm_attr_values(
     msm_wm_map,
     msm_attr_vals,
     wm_attr_vals,
 )

 # Each model receives its own freshly built list of de-duplicated
 # msm_attr entries.
 print("Initializing extractor...")
 extractor = CRFExtractor(
     attribute_names=list(set(msm_attr)),
     attr_values_map=combined_attr_vals,
     is_multi_label=True,
 )

 print("Initializing classifier...")
 classifier = Classifier(
     attribute_names=list(set(msm_attr)),
     is_multi_label=True,
     num_features=20000,
     min_ngram=1,
     max_ngram=3,
 )

 print("Creating training/validation data...")
 train_sentences = [wm_sentences[idx] for idx in train_indices]
 valid_sentences = [wm_sentences[idx] for idx in valid_indices]
 train_labels = [wm_pcf_labels[idx] for idx in train_indices]
 valid_labels = [wm_pcf_labels[idx] for idx in valid_indices]

 print(len(train_sentences), len(valid_sentences))

 # Both models are fitted on the same training sentences and labels.
 print("Training extractor...")
 extractor.train(train_sentences, train_labels)

 print("Training classifier...")
 classifier.train(train_sentences, train_labels)