def eval_func_confidences(self, feature_weights):
        weights_sum = float(sum(feature_weights))
        # "normalize" (I don't know if that's the right word) the weights, and make sure none are equal to 0
        feature_weights = [max(0.00001, x/weights_sum) for x in feature_weights]
        IU = IntrinsicUtility()
        all_test_files = IU.get_n_training_files(n=self.num_documents, first_doc_num=self.first_doc_num, min_len=35000, pct_plag=1)
        reduced_docs = _get_reduced_docs(self.atom_type, all_test_files, session)

        actuals = []
        confidences = []

        confidence_vectors = []
        for feature, weight in zip(self.features, feature_weights):
            vi = 0
            for doc in reduced_docs:
                feature_vectors = doc.get_feature_vectors([feature], session)
                confs = cluster(self.cluster_type, 2, feature_vectors)
                for confidence in confs:
                    if len(confidence_vectors) <= vi:
                        confidence_vectors.append([])
                    confidence_vectors[vi].append(confidence * weight)
                    vi += 1
                    
        for doc in reduced_docs:
            for span in doc._spans:
                actual = 1 if doc.span_is_plagiarized(span) else 0
                actuals.append(actual)

        for vec in confidence_vectors:
            confidences.append(min(1, sum(vec)))

        fpr, tpr, thresholds = sklearn.metrics.roc_curve(actuals, confidences, pos_label=1)
        roc_auc = sklearn.metrics.auc(fpr, tpr)
        print 'evaluated:', roc_auc, feature_weights
        return roc_auc
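For context, eval_func_confidences is an instance method (it reads self.features, self.cluster_type, and so on) that maps a weight vector to a ROC AUC score, so a natural use is as the objective of a weight search. The driver below is purely illustrative: the scipy usage and the tune_feature_weights name are assumptions, not taken from the original project.

# Illustrative only: search for feature weights that maximize ROC AUC.
# 'tuner' stands for an instance of whatever class defines eval_func_confidences.
import scipy.optimize

def tune_feature_weights(tuner, num_features):
    # scipy.optimize.minimize minimizes, so negate the returned AUC
    objective = lambda w: -tuner.eval_func_confidences(list(w))
    initial = [1.0 / num_features] * num_features
    result = scipy.optimize.minimize(objective, initial, method='Nelder-Mead')
    return list(result.x), -result.fun   # best weights and the AUC they achieve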
def _get_feature_conf_and_actuals(features, cluster_type, atom_type, start_doc, n, pct_plag=None, **cluster_args):
    '''
    Returns a matrix of dimension <num_passages> x <num_features> where each row holds,
    for one passage, the confidence that the passage was plagiarized according to each
    feature. In other words, mat[passage_num][feat_num] is the plagiarism confidence
    for <passage_num> according to <feat_num>.

    Note that the transpose of this matrix is built below and then transposed before
    returning.
    '''

    first_training_files = IntrinsicUtility().get_n_training_files(n, first_doc_num=start_doc, pct_plag=pct_plag)
    session = Session()
    reduced_docs = _get_reduced_docs(atom_type, first_training_files, session)

    actuals = []
    
    # feature_conf_matrix[feat][span_index] == Conf. that <span_index>
    # was plag. according to <feat>
    # NOTE that we're ignoring document boundaries in the storage of this 
    # matrix. So <span_index> is not relative to any document
    feature_conf_matrix = [[] for i in xrange(len(features))]
    
    for doc_index in xrange(len(reduced_docs)):
        if doc_index % 10 == 0:
            print 'Working on doc number (in training corpus)', start_doc + doc_index
        doc = reduced_docs[doc_index]
        spans = doc.get_spans()

        for feat_num in xrange(len(features)):
            feat = features[feat_num]
            feature_vecs = doc.get_feature_vectors([feat], session)
            # One column, i.e. confidence values for <feat> over all passages 
            # in <doc>
            confidences = cluster(cluster_type, 2, feature_vecs, **cluster_args)
            # Use append if we care about document_num
            
            feature_conf_matrix[feat_num].extend(confidences)
            
        for span_index in xrange(len(spans)):
            span = spans[span_index]
            
            actuals.append(1 if doc.span_is_plagiarized(span) else 0)
            
    
    rotated = np.matrix(feature_conf_matrix).T

    return rotated, actuals
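For illustration only (this helper is not part of the original module), the matrix returned above can be collapsed into a single weighted confidence per passage and scored against actuals, mirroring the weighted sums in eval_func_confidences; the weights argument here is hypothetical.

# Illustrative sketch: combine the <num_passages> x <num_features> confidence
# matrix into one weighted score per passage and measure ROC AUC against actuals.
import numpy as np
import sklearn.metrics

def combined_auc(conf_matrix, actuals, weights):
    conf = np.asarray(conf_matrix)             # num_passages x num_features
    weights = np.asarray(weights, dtype=float)
    weights = weights / weights.sum()          # normalize so the weights sum to 1
    combined = np.clip(conf.dot(weights), 0, 1)
    fpr, tpr, _ = sklearn.metrics.roc_curve(actuals, combined, pos_label=1)
    return sklearn.metrics.auc(fpr, tpr)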
Example 3
def get_cached_reduced_docs(atom_type, files):
    '''
    Memoizing wrapper around _get_reduced_docs: returns the reduced documents
    for <files> at <atom_type> granularity, hitting the database only for
    files that have not been reduced before.
    '''
    cached_docs = cached_reduced_docs.get(atom_type, {})
    return_docs = []
    need_to_query = []
    for f in files:
        if f in cached_docs:
            return_docs.append(cached_docs[f])
        else:
            need_to_query.append(f)

    queried = _get_reduced_docs(atom_type, need_to_query, session)
    for q in queried:
        return_docs.append(q)
        cached_docs[q.full_path] = q

    cached_reduced_docs[atom_type] = cached_docs
    return return_docs
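get_cached_reduced_docs relies on module-level state that this example does not show; the sketch below spells out what is assumed to exist (names inferred from the function body, file paths purely illustrative).

# Assumed module-level state for get_cached_reduced_docs (inferred, not shown
# in the original example): a cache keyed by atom_type, plus a database session.
cached_reduced_docs = {}    # atom_type -> {file path -> reduced document}
# session = Session()       # the same kind of Session() used in _get_feature_conf_and_actuals

# Illustrative usage: the second call only queries the database for doc3.txt,
# because doc1.txt and doc2.txt are already cached for the 'paragraph' atom type.
# docs_a = get_cached_reduced_docs('paragraph', ['/corpus/doc1.txt', '/corpus/doc2.txt'])
# docs_b = get_cached_reduced_docs('paragraph', ['/corpus/doc2.txt', '/corpus/doc3.txt'])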
    def construct_and_train_nn(self, features, num_files, epochs, filepath,
                               session):
        from plagcomps.evaluation.intrinsic import _get_reduced_docs

        IU = IntrinsicUtility()
        all_test_files = IU.get_n_training_files(n=num_files)
        reduced_docs = _get_reduced_docs("paragraph", all_test_files, session)

        print 'constructing datasets...'
        # dataset = self.construct_confidence_vectors_dataset(reduced_docs, features, session)
        dataset = self.read_dataset()
        training_dataset, testing_dataset = dataset.splitWithProportion(0.75)
        print 'dataset lengths:', len(dataset), len(training_dataset), len(testing_dataset)
        print

        print 'creating neural network...'
        # NOTE: num_hidden_layer_nodes is not defined in this method; it is assumed
        # to come from an enclosing scope (e.g. a module-level constant)
        net = self.create_nn(features, num_hidden_layer_nodes)

        print 'creating trainer...'
        trainer = self.create_trainer(net, training_dataset)

        print 'training neural network for', epochs, 'epochs...'
        trainer.trainEpochs(epochs)

        print 'writing neural network to ' + str(filepath) + '...'
        NetworkWriter.writeToFile(net, filepath)

        print 'testing neural network...'
        confidences = []
        actuals = []
        for point in testing_dataset:
            confidences.append(net.activate(point[0])[0])
            actuals.append(point[1][0])

        print 'confidences|actuals ', zip(confidences, actuals)

        print 'generating ROC curve...'
        matplotlib.use('pdf')
        path, auc = self.roc(confidences, actuals)
        print 'area under curve =', auc
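The helpers create_nn and create_trainer are not shown in this example. Assuming the project uses PyBrain (consistent with NetworkWriter, trainEpochs, and splitWithProportion above), they might look roughly like the sketch below; in the original they are instance methods, and the real implementations may differ.

# Rough sketch of the missing helpers, assuming PyBrain.
from pybrain.tools.shortcuts import buildNetwork
from pybrain.supervised.trainers import BackpropTrainer

def create_nn(features, num_hidden_layer_nodes):
    # One input node per feature, one hidden layer, and a single output node
    # whose activation is read as the plagiarism confidence.
    return buildNetwork(len(features), num_hidden_layer_nodes, 1)

def create_trainer(net, training_dataset):
    # Plain backpropagation trainer over the supervised dataset.
    return BackpropTrainer(net, training_dataset)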