random_state=0) train_sentences, valid_sentences = [sentences[i] for i in train_indices ], [sentences[i] for i in valid_indices] train_non_norm_labels, valid_non_norm_labels = [ non_norm_labels[i] for i in train_indices ], [non_norm_labels[i] for i in valid_indices] train_non_norm_sentences, valid_non_norm_sentences = [ non_norm_sentences[i] for i in train_indices ], [non_norm_sentences[i] for i in valid_indices] train_norm_labels, valid_norm_labels = [ norm_labels[i] for i in train_indices ], [norm_labels[i] for i in valid_indices] print("Training 1st level CRF models...") crf_model_1 = CRFExtractor(attribute_names=attributes, is_multi_label=True) crf_model_1.train(train_sentences, train_non_norm_labels) print("Training 2nd level CRF models...") crf_model_2 = CRFExtractor(attribute_names=attributes, is_multi_label=False) crf_model_2.train(train_non_norm_sentences, train_norm_labels) print("Training classifier for 2nd level...") classifier = Classifier(attribute_names=attributes, is_multi_label=False, num_features=20000, min_ngram=1, max_ngram=3) classifier.train(train_sentences, train_norm_labels) print("Predicting with 1st level CRF models...")
random_state=0) train_sentences, valid_sentences = [sentences[i] for i in train_indices ], [sentences[i] for i in valid_indices] train_non_norm_labels, valid_non_norm_labels = [ non_norm_labels[i] for i in train_indices ], [non_norm_labels[i] for i in valid_indices] train_non_norm_sentences, valid_non_norm_sentences = [ non_norm_sentences[i] for i in train_indices ], [non_norm_sentences[i] for i in valid_indices] train_norm_labels, valid_norm_labels = [ norm_labels[i] for i in train_indices ], [norm_labels[i] for i in valid_indices] print("Training 1st level CRF models...") seq_model_1 = CRFExtractor(attribute_names=attributes, is_multi_label=True) seq_model_1.train(train_sentences, train_non_norm_labels) print("Saving 1st level CRF models...") utils.save_data_pkl(seq_model_1, 'models/seq_model_1.pkl') if use_seq_to_seq: print("Training seq2seq model for 2nd level inference...") seq_model_2 = NMTDriver(use_char=True) seq_model_2.train(train_non_norm_sentences, train_norm_labels) else: print("Training 2nd level CRF models...") seq_model_2 = CRFExtractor(attribute_names=attributes, is_multi_label=False) seq_model_2.train(train_non_norm_sentences, train_norm_labels)
# Build the WM sentences and their labels from the dataframe.
wm_sentences, wm_pcf_labels = pputils.generate_wm_data_labels(
    df2_new,
    wm_attr,
    wm_msm_map,
)

print("Getting WM attribute value counts...")
wm_attr_value_counts = pputils.generate_wm_attr_value_counts(df2_new, wm_attr)

print("Creating WM attributes to values map...")
wm_attr_vals = pputils.generate_wm_attr_values_normalized(
    wm_sentences,
    wm_pcf_labels,
    set(msm_attr),
    wm_attr_value_counts,
    min_count=10,
)

# Split by position (80/20, fixed seed) so sentences and labels can later be
# sliced with the same indices.
print("Creating train-test indices...")
train_indices, valid_indices = train_test_split(
    range(len(wm_sentences)),
    test_size=0.2,
    random_state=0,
)

print("Merging MSM and WM attribute value maps...")
combined_attr_vals = pputils.combine_msm_attr_values(
    msm_wm_map,
    msm_attr_vals,
    wm_attr_vals,
)

# The extractor and the classifier each receive their own de-duplicated
# attribute-name list.
print("Initializing extractor...")
extractor = CRFExtractor(
    attribute_names=list(set(msm_attr)),
    attr_values_map=combined_attr_vals,
    is_multi_label=True,
)

print("Initializing classifier...")
classifier = Classifier(
    attribute_names=list(set(msm_attr)),
    is_multi_label=True,
    num_features=20000,
    min_ngram=1,
    max_ngram=3,
)

print("Creating training/validation data...")
train_sentences = [wm_sentences[i] for i in train_indices]
valid_sentences = [wm_sentences[i] for i in valid_indices]
train_labels = [wm_pcf_labels[i] for i in train_indices]
valid_labels = [wm_pcf_labels[i] for i in valid_indices]

# Report the size of each partition.
print(len(train_sentences), len(valid_sentences))

print("Training extractor...")
extractor.train(train_sentences, train_labels)

print("Training classifier...")
classifier.train(train_sentences, train_labels)