# Second-level training and validation-prediction stage of the script.
#
# NOTE(review): the statements below were collapsed onto one physical line, and
# the `if` header that the `else:` belongs to lies before this point. Which
# statements sit inside the `else` branch (in particular whether the
# "Saving 2nd level CRF models..." step is CRF-only) cannot be determined from
# this line alone -- confirm against the original layout before re-indenting.
#
# Steps, in order:
#   1. `if` branch: build NMTDriver(use_char=True) as `seq_model_2` and train it
#      on (train_non_norm_sentences, train_norm_labels).
#      `else` branch: build a single-label CRFExtractor over `attributes` as
#      `seq_model_2` and train it on the same data.
#   2. Persist `seq_model_2` to 'models/seq_model_2.pkl' via utils.save_data_pkl.
#   3. Train a single-label Classifier (num_features=20000, n-grams 1..3) on
#      (train_sentences, train_norm_labels) and persist it to
#      'models/classifier.pkl'.
#   4. Run `seq_model_1.predict` on `valid_sentences` to get `preds_1`.
#   5. For each element x of `preds_1`, take the first entry of
#      x[attributes[0]], tokenize it with utils.get_tokens and re-join the
#      tokens with single spaces; use '' when x[attributes[0]] is empty. Feed
#      that list to `seq_model_2.predict` to obtain `preds_2`.
#      (presumably each x maps attribute name -> list of predicted values --
#      TODO confirm against seq_model_1's implementation)
print("Training seq2seq model for 2nd level inference...") seq_model_2 = NMTDriver(use_char=True) seq_model_2.train(train_non_norm_sentences, train_norm_labels) else: print("Training 2nd level CRF models...") seq_model_2 = CRFExtractor(attribute_names=attributes, is_multi_label=False) seq_model_2.train(train_non_norm_sentences, train_norm_labels) print("Saving 2nd level CRF models...") utils.save_data_pkl(seq_model_2, 'models/seq_model_2.pkl') print("Training classifier for 2nd level...") classifier = Classifier(attribute_names=attributes, is_multi_label=False, num_features=20000, min_ngram=1, max_ngram=3) classifier.train(train_sentences, train_norm_labels) print("Saving classifier...") utils.save_data_pkl(classifier, 'models/classifier.pkl') print("Predicting with 1st level CRF models...") preds_1 = seq_model_1.predict(valid_sentences) print("Predicting with 2nd level models...") preds_2 = seq_model_2.predict([ ' '.join(utils.get_tokens(x[attributes[0]][0])) if len(x[attributes[0]]) > 0 else '' for x in preds_1 ])
# --- WM attribute statistics -------------------------------------------------
# Value counts per WM attribute, computed from df2_new.
wm_attr_value_counts = pputils.generate_wm_attr_value_counts(df2_new, wm_attr)

print("Creating WM attributes to values map...")
# Normalized attribute -> values map, restricted to the MSM attribute set and
# called with min_count=10.
wm_attr_vals = pputils.generate_wm_attr_values_normalized(
    wm_sentences,
    wm_pcf_labels,
    set(msm_attr),
    wm_attr_value_counts,
    min_count=10,
)

# --- Train / validation split ------------------------------------------------
print("Creating train-test indices...")
# Split positions rather than the data itself so that sentences and labels can
# be sliced with the same indices below. test_size=0.2 with a fixed
# random_state=0 (presumably sklearn's train_test_split -- confirm import).
train_indices, valid_indices = train_test_split(
    range(len(wm_sentences)),
    test_size=0.2,
    random_state=0,
)

# --- Attribute value maps ----------------------------------------------------
print("Merging MSM and WM attribute value maps...")
combined_attr_vals = pputils.combine_msm_attr_values(
    msm_wm_map,
    msm_attr_vals,
    wm_attr_vals,
)

# --- Model construction ------------------------------------------------------
print("Initializing extractor...")
# Multi-label CRF extractor over the de-duplicated MSM attribute names.
extractor = CRFExtractor(
    attribute_names=list(set(msm_attr)),
    attr_values_map=combined_attr_vals,
    is_multi_label=True,
)

print("Initializing classifier...")
# Multi-label classifier over the same attribute names (n-grams 1..3).
classifier = Classifier(
    attribute_names=list(set(msm_attr)),
    is_multi_label=True,
    num_features=20000,
    min_ngram=1,
    max_ngram=3,
)

# --- Materialize the split ---------------------------------------------------
print("Creating training/validation data...")
train_sentences = [wm_sentences[i] for i in train_indices]
valid_sentences = [wm_sentences[i] for i in valid_indices]
train_labels = [wm_pcf_labels[i] for i in train_indices]
valid_labels = [wm_pcf_labels[i] for i in valid_indices]
print(len(train_sentences), len(valid_sentences))

# --- Training ----------------------------------------------------------------
print("Training extractor...")
extractor.train(train_sentences, train_labels)

print("Training classifier...")
classifier.train(train_sentences, train_labels)

# --- Validation --------------------------------------------------------------
print("Fetching validation results...")
extraction_results = extractor.predict(valid_sentences)