def eval_func_confidences(self, feature_weights):
    weights_sum = float(sum(feature_weights))
    # Normalize the weights so they sum to 1, and clamp each one away from
    # zero so no feature is silenced entirely.
    feature_weights = [max(0.00001, x / weights_sum) for x in feature_weights]

    IU = IntrinsicUtility()
    all_test_files = IU.get_n_training_files(n=self.num_documents,
                                             first_doc_num=self.first_doc_num,
                                             min_len=35000,
                                             pct_plag=1)
    reduced_docs = _get_reduced_docs(self.atom_type, all_test_files, session)

    actuals = []
    confidences = []
    confidence_vectors = []

    # Build one weighted confidence vector per span: cluster each feature
    # independently, scale its confidences by that feature's weight, and
    # accumulate them span-by-span (vi indexes spans across all docs).
    for feature, weight in zip(self.features, feature_weights):
        vi = 0
        for doc in reduced_docs:
            feature_vectors = doc.get_feature_vectors([feature], session)
            confs = cluster(self.cluster_type, 2, feature_vectors)
            for confidence in confs:
                if len(confidence_vectors) <= vi:
                    confidence_vectors.append([])
                confidence_vectors[vi].append(confidence * weight)
                vi += 1

    for doc in reduced_docs:
        for span in doc._spans:
            actuals.append(1 if doc.span_is_plagiarized(span) else 0)

    # Combine the per-feature weighted confidences for each span, capped at 1.
    for vec in confidence_vectors:
        confidences.append(min(1, sum(vec)))

    fpr, tpr, thresholds = sklearn.metrics.roc_curve(actuals, confidences, pos_label=1)
    roc_auc = sklearn.metrics.auc(fpr, tpr)
    print 'evaluated:', roc_auc, feature_weights

    return roc_auc
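
# Example driver (a sketch, not part of the original pipeline): since
# eval_func_confidences returns an AUC to be maximized, a derivative-free
# optimizer such as Nelder-Mead can search the weight space by minimizing
# the negated score. `tool` is a hypothetical instance of the class that
# defines eval_func_confidences, and the uniform starting weights are an
# assumption.
from scipy.optimize import minimize

def tune_feature_weights(tool, num_features):
    initial_weights = [1.0 / num_features] * num_features
    result = minimize(lambda w: -tool.eval_func_confidences(list(w)),
                      initial_weights,
                      method='Nelder-Mead')
    # Best weights found and the AUC they achieved.
    return list(result.x), -result.fun
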
def _get_feature_conf_and_actuals(features, cluster_type, atom_type, start_doc, n,
                                  pct_plag=None, **cluster_args):
    '''
    Returns a matrix of dimension <num_passages> x <num_features> where each
    row holds the confidence that the corresponding passage was plagiarized
    according to each feature. In other words, mat[passage_num][feat_num] is
    the plagiarism confidence of <passage_num> according to <feat_num>.

    Note that the transpose of this matrix is built below, then transposed
    before returning.
    '''
    first_training_files = IntrinsicUtility().get_n_training_files(n,
                                                                   first_doc_num=start_doc,
                                                                   pct_plag=pct_plag)
    session = Session()
    reduced_docs = _get_reduced_docs(atom_type, first_training_files, session)
    actuals = []

    # feature_conf_matrix[feat][span_index] == confidence that <span_index>
    # was plagiarized according to <feat>.
    # NOTE that document boundaries are ignored in the storage of this
    # matrix, so <span_index> is not relative to any document.
    feature_conf_matrix = [[] for _ in xrange(len(features))]

    for doc_index in xrange(len(reduced_docs)):
        if doc_index % 10 == 0:
            print 'Working on doc number (in training corpus)', start_doc + doc_index
        doc = reduced_docs[doc_index]
        spans = doc.get_spans()

        for feat_num in xrange(len(features)):
            feat = features[feat_num]
            feature_vecs = doc.get_feature_vectors([feat], session)
            # One column, i.e. confidence values for <feat> over all
            # passages in <doc>.
            confidences = cluster(cluster_type, 2, feature_vecs, **cluster_args)
            # Use append instead of extend if document boundaries matter.
            feature_conf_matrix[feat_num].extend(confidences)

        for span_index in xrange(len(spans)):
            span = spans[span_index]
            actuals.append(1 if doc.span_is_plagiarized(span) else 0)

    rotated = np.matrix(feature_conf_matrix).T

    return rotated, actuals
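
# Example (a sketch under assumed inputs): the matrix returned by
# _get_feature_conf_and_actuals lines up column-for-column with <features>,
# so a per-feature AUC comparison falls out of a single pass over the
# corpus. The parameters passed in by the caller are illustrative, not
# values from the original code.
import numpy as np
import sklearn.metrics

def compare_feature_aucs(features, cluster_type, atom_type, start_doc, n):
    conf_matrix, actuals = _get_feature_conf_and_actuals(
        features, cluster_type, atom_type, start_doc, n)
    conf_array = np.asarray(conf_matrix)
    # Score each feature's confidence column against the shared actuals.
    for feat_num, feat in enumerate(features):
        fpr, tpr, _ = sklearn.metrics.roc_curve(
            actuals, conf_array[:, feat_num], pos_label=1)
        print feat, sklearn.metrics.auc(fpr, tpr)
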
def get_cached_reduced_docs(atom_type, files):
    '''
    Memoized wrapper around _get_reduced_docs: returns cached reduced docs
    where possible, querying only for files not yet cached for this atom_type.
    '''
    cached_docs = cached_reduced_docs.get(atom_type, {})
    return_docs = []
    need_to_query = []

    # Split <files> into cache hits and misses.
    for f in files:
        if f in cached_docs:
            return_docs.append(cached_docs[f])
        else:
            need_to_query.append(f)

    # Fetch only the misses, then add them to the cache.
    queried = _get_reduced_docs(atom_type, need_to_query, session)
    for q in queried:
        return_docs.append(q)
        cached_docs[q.full_path] = q

    cached_reduced_docs[atom_type] = cached_docs

    return return_docs
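
# Example (illustrative file paths, not from the original corpus): repeated
# calls with overlapping file lists only hit the database for files not seen
# before, since get_cached_reduced_docs memoizes per atom_type in the
# module-level cached_reduced_docs dict.
docs_first = get_cached_reduced_docs('paragraph', ['/corpus/doc1.txt',
                                                   '/corpus/doc2.txt'])
# Second call: doc2 comes from the cache; only doc3 triggers a query.
docs_second = get_cached_reduced_docs('paragraph', ['/corpus/doc2.txt',
                                                    '/corpus/doc3.txt'])
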
def construct_and_train_nn(self, features, num_files, epochs, filepath, session):
    from plagcomps.evaluation.intrinsic import _get_reduced_docs

    IU = IntrinsicUtility()
    all_test_files = IU.get_n_training_files(n=num_files)
    reduced_docs = _get_reduced_docs("paragraph", all_test_files, session)

    print 'constructing datasets...'
    # dataset = self.construct_confidence_vectors_dataset(reduced_docs, features, session)
    dataset = self.read_dataset()
    training_dataset, testing_dataset = dataset.splitWithProportion(0.75)
    print 'dataset lengths:', len(dataset), len(training_dataset), len(testing_dataset)
    print

    print 'creating neural network...'
    # num_hidden_layer_nodes is not a parameter of this method; it is
    # assumed to be defined at module or class scope.
    net = self.create_nn(features, num_hidden_layer_nodes)

    print 'creating trainer...'
    trainer = self.create_trainer(net, training_dataset)

    print 'training neural network for', epochs, 'epochs...'
    trainer.trainEpochs(epochs)

    print 'writing neural network to ' + str(filepath) + '...'
    NetworkWriter.writeToFile(net, filepath)

    print 'testing neural network...'
    confidences = []
    actuals = []
    for point in testing_dataset:
        confidences.append(net.activate(point[0])[0])
        actuals.append(point[1][0])
    print 'confidences|actuals ', zip(confidences, actuals)

    print 'generating ROC curve...'
    matplotlib.use('pdf')
    path, auc = self.roc(confidences, actuals)
    print 'area under curve =', auc
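
# Example invocation (a sketch: the feature list, file count, epoch count,
# and output path are assumptions, and IntrinsicTrainer is a hypothetical
# name for the class that defines construct_and_train_nn):
def train_example():
    session = Session()
    trainer_obj = IntrinsicTrainer()
    trainer_obj.construct_and_train_nn(
        features=['average_word_length', 'average_sentence_length'],
        num_files=200,
        epochs=50,
        filepath='intrinsic_nn.xml',
        session=session)
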