Beispiel #1
0
def clusters_sizes():
    """Render the cluster buttons with the number of results per label.

    Reads ``experiment_id`` from the request query arguments (falling back
    to 0), counts the result rows of that experiment grouped by their
    ``label`` and passes the counts to the ``cluster_buttons.html``
    template.
    """
    experiment_id = request.args.get('experiment_id', 0, type=int)
    Result = get_result_object(experiment_id)

    # Tally in Python while iterating the query: one increment per row.
    sizes_by_label = defaultdict(int)
    matching_rows = db_session.query(Result).filter(
        Result.experiment_id == experiment_id)
    for row in matching_rows:
        sizes_by_label[row.label] += 1

    template_context = {
        'label_counts': sizes_by_label,
        'experiment_id': experiment_id,
        'all_clusters_label': ALL_CLUSTERS_LABEL,
    }
    return render_template('cluster_buttons.html', **template_context)
Beispiel #2
0
def construct_stats(query, experiment_id):
    """Summarise how many rows of *query* carry an evaluation.

    Args:
        query: A query over the result rows to summarise; it must support
            ``count()`` and ``filter()``.
        experiment_id: Passed to ``get_result_object`` to obtain the result
            class whose ``evaluation`` column is used for filtering.

    Returns:
        A dict with three preformatted strings:

        * ``'positive'`` -- ``"<positive> / <observed> (<lo> - <hi>)"``
        * ``'negative'`` -- ``"<negative> / <observed> (<lo> - <hi>)"``
        * ``'observed'`` -- ``"<observed> / <total> (<percent> %)"``

        The two bracketed numbers are whatever pair
        ``bernoulli_trial_probability`` returns for the respective count.
    """
    total = query.count()

    Result = get_result_object(experiment_id)

    # ``== True`` / ``== False`` build column filter expressions for the
    # query here; they must not be rewritten as ``is True`` or truth tests.
    positive = query.filter(Result.evaluation == True).count()  # noqa: E712
    negative = query.filter(Result.evaluation == False).count()  # noqa: E712
    observed = positive + negative

    # NOTE(review): behaviour of bernoulli_trial_probability for
    # observed == 0 is not visible from here -- confirm it copes with it.
    positive_bernoulli = bernoulli_trial_probability(positive, observed)
    negative_bernoulli = bernoulli_trial_probability(negative, observed)

    # An empty query has total == 0; report 0 % instead of raising
    # ZeroDivisionError (e.g. when a label with no rows is requested).
    observed_percent = observed / total * 100 if total else 0.0

    interval_format = "{} / {} ({:.2f} - {:.2f})"
    return {
        'positive': interval_format.format(
            positive, observed, *positive_bernoulli),
        'negative': interval_format.format(
            negative, observed, *negative_bernoulli),
        'observed': "{} / {} ({:.0f} %)".format(
            observed, total, observed_percent),
    }
Beispiel #3
0
def set_evaluation():
    """Store an evaluation on a result row and mirror it along the chain.

    Reads ``result_id``, ``evaluation`` and ``experiment_id`` from the
    submitted form. The evaluation (an empty value is stored as ``None``)
    is written to the addressed result row, then to the row with the same
    ``sentence_id`` in every experiment reachable through ``parent_id``,
    and finally in every experiment reachable through ``child_id``. All
    changes are committed together and ``jsonify(success=1)`` is returned.

    A parent experiment must contain exactly one matching row (``.one()``),
    whereas the walk over child experiments simply stops at the first
    experiment without a matching row.
    """
    form = request.form
    result_id = form['result_id']
    evaluation = form['evaluation']
    experiment_id = form['experiment_id']

    Result = get_result_object(experiment_id)

    # A falsy (empty) form value means "no evaluation".
    evaluation = evaluation or None

    def experiment_with_id(wanted_id):
        # Fetch exactly one experiment by primary key.
        return db_session.query(Experiment).filter(
            Experiment.id == wanted_id).one()

    def rows_for_sentence(in_experiment_id):
        # Result rows of the given experiment for the sentence being rated.
        return db_session.query(Result).filter(
            Result.experiment_id == in_experiment_id,
            Result.sentence_id == sentence_id)

    target_row = db_session.query(Result).filter(
        Result.id == result_id).one()
    target_row.evaluation = evaluation
    sentence_id = target_row.sentence_id

    # Bubble up the evaluation to all parent experiments.
    ancestor_id = experiment_with_id(target_row.experiment_id).parent_id
    while ancestor_id:
        ancestor_row = rows_for_sentence(ancestor_id).one()
        ancestor_row.evaluation = evaluation
        ancestor_id = experiment_with_id(ancestor_row.experiment_id).parent_id

    # Float down the evaluation to all child experiments.
    descendant_id = experiment_with_id(target_row.experiment_id).child_id
    while descendant_id:
        descendant_row = rows_for_sentence(descendant_id).one_or_none()
        if not descendant_row:
            # The sentence is absent from this child; stop descending.
            break
        descendant_row.evaluation = evaluation
        descendant_id = experiment_with_id(
            descendant_row.experiment_id).child_id

    db_session.commit()

    return jsonify(success=1)
Beispiel #4
0
    def save_labels(self, sentence_indexes, labels):
        """Persist one result row per (sentence index, label) pair.

        When the experiment has a parent, each new row starts with the
        evaluation the parent's row for the same sentence carries;
        sentences without a parent row get ``None``. Also stores the number
        of distinct labels in ``self.experiment.clusters_count`` and
        commits the session.

        Args:
            sentence_indexes: Iterable of sentence ids (converted with
                ``int``), paired positionally with ``labels``.
            labels: Iterable of cluster labels (converted with ``int``).
        """
        print('saving labels')
        Result = get_result_object(self.experiment.id)

        # Map sentence id -> evaluation recorded in the parent experiment.
        inherited_evaluations = {}
        if self.experiment.parent_id:
            parent_rows = db_session.query(Result).filter(
                Result.experiment_id == self.experiment.parent_id).all()
            inherited_evaluations = {
                parent_row.sentence_id: parent_row.evaluation
                for parent_row in parent_rows
            }

        for raw_index, raw_label in zip(sentence_indexes, labels):
            sentence_id = int(raw_index)
            new_row = Result(
                experiment_id=self.experiment.id,
                label=int(raw_label),
                sentence_id=sentence_id,
                evaluation=inherited_evaluations.get(sentence_id))
            db_session.add(new_row)

        self.experiment.clusters_count = len(set(labels))
        db_session.commit()
Beispiel #5
0
def get_statistics():
    """Render evaluation statistics for an experiment.

    Reads ``experiment_id`` (default 0) and ``label`` (default
    ``ALL_CLUSTERS_LABEL``) from the request query arguments and renders
    ``get_statistics.html`` with three values produced by
    ``construct_stats``:

    * ``experiment_stats`` -- over all result rows of the experiment;
    * ``cluster_stats`` -- over the rows with the requested label, or
      ``None`` when the label is ``ALL_CLUSTERS_LABEL``;
    * ``total_stats`` -- over the rows of the top-most ancestor found by
      following ``parent_id``, or ``None`` when the experiment has no
      parent.
    """
    experiment_id = request.args.get('experiment_id', 0, type=int)
    label = request.args.get('label', ALL_CLUSTERS_LABEL, type=int)

    Result = get_result_object(experiment_id)

    def parent_of(wanted_id):
        # parent_id of exactly one experiment looked up by primary key.
        return db_session.query(Experiment).filter(
            Experiment.id == wanted_id).one().parent_id

    # Experiment stats
    experiment_rows = db_session.query(Result).filter(
        Result.experiment_id == experiment_id)
    experiment_stats = construct_stats(experiment_rows, experiment_id)

    # Cluster stats
    cluster_stats = None
    if label != ALL_CLUSTERS_LABEL:
        cluster_rows = db_session.query(Result).filter(
            Result.experiment_id == experiment_id, Result.label == label)
        cluster_stats = construct_stats(cluster_rows, experiment_id)

    # Total stats: climb the parent chain and remember the last ancestor.
    root_experiment_id = None
    ancestor_id = parent_of(experiment_id)
    while ancestor_id:
        root_experiment_id = ancestor_id
        ancestor_id = parent_of(ancestor_id)

    total_stats = None
    if root_experiment_id:
        root_rows = db_session.query(Result).filter(
            Result.experiment_id == root_experiment_id)
        total_stats = construct_stats(root_rows, experiment_id)

    return render_template('get_statistics.html',
                           total_stats=total_stats,
                           experiment_stats=experiment_stats,
                           cluster_stats=cluster_stats)
Beispiel #6
0
    def run(self):
        """Run one clustering experiment and record it in the database.

        Steps, in order:

        1. Create an ``Experiment`` row with status ``'running'`` and commit.
        2. Preprocessing: depending on ``self.input_type_name`` obtain the
           ``cosine_similarities``, ``sentence_indexes`` and ``features``
           arrays (``'Extractor'``, ``'Cluster'``, ``'Raw Data'`` /
           ``'Sports Data'``) and store the elapsed time in
           ``preprocessing_seconds``.
        3. Clustering: fit the model from ``self.get_model()``, persist the
           labels with ``self.save_labels`` and store the elapsed time in
           ``clustering_seconds``; the status becomes ``'finished'``.

        Raises:
            Exception: Any exception raised along the way is re-raised after
                the experiment status has been set to ``'error'``, a
                ``Traceback`` row with the message has been added, the
                session committed and ``CACHED_ARRAYS`` closed.
        """
        try:
            # Create and persist the experiment record before doing any work.
            start_time = time.time()
            self.experiment = Experiment(input_type=self.input_type_id,
                                         algorithm=self.algorithm_id,
                                         processing=self.processing_method_id,
                                         start_time=func.current_timestamp(),
                                         status='running')

            db_session.add(self.experiment)
            db_session.commit()

            # START PREPROCESSING
            # NOTE(review): if input_type_name matches none of the branches
            # below, the three arrays are never bound and the clustering step
            # fails with a NameError that lands in the except handler.

            if self.input_type_name == 'Extractor':

                # Committed first, before similar_experiment_arrays is called
                # with filters on the same two columns.
                self.experiment.regex_name = self.regex_name_id
                self.experiment.regex_pattern = self.regex_pattern_id
                db_session.commit()

                arrays = self.similar_experiment_arrays(
                    filter_args=[(Experiment.regex_name, self.regex_name_id),
                                 (Experiment.regex_pattern,
                                  self.regex_pattern_id)])
                cosine_similarities = arrays['cosine_similarities']
                sentence_indexes = arrays['sentence_indexes']
                features = arrays['features']

                self.experiment.lines = len(sentence_indexes)

            if self.input_type_name == 'Cluster':

                # Link parent and child experiments in both directions.
                parent = db_session.query(Experiment).filter(
                    Experiment.id == self.parent_id).one()
                parent.child_id = self.experiment.id

                self.experiment.parent_id = self.parent_id
                self.experiment.parent_label = self.parent_label
                self.experiment.regex_name = parent.regex_name
                self.experiment.regex_pattern = parent.regex_pattern

                # Reuse the parent's cached arrays instead of recomputing.
                cached_arrays_id = parent.cached_arrays_id
                self.experiment.cached_arrays_id = cached_arrays_id

                # cached_arrays = db_session.query(CachedArrays).filter(CachedArrays.id==cached_arrays_id).one()
                cached_arrays = CACHED_ARRAYS[cached_arrays_id]

                arrays = self.h5_data_to_numpy(cached_arrays)
                cosine_similarities = arrays['cosine_similarities']
                sentence_indexes = arrays['sentence_indexes']
                features = arrays['features']

                # Rows of the parent experiment that carry the chosen label.
                Result = get_result_object(parent.id)
                rows = db_session.query(Result).filter(
                    Result.experiment_id == self.parent_id,
                    Result.label == self.parent_label)
                cluster_indexes = []

                # Translate each row's sentence_id into its position in the
                # cached arrays. list.index is a linear scan per row and
                # raises ValueError for a sentence_id that is not cached.
                sentence_indexes_list = sentence_indexes.tolist()
                for row in rows:
                    cluster_indexes.append(
                        sentence_indexes_list.index(row.sentence_id))

                # Keep only the selected positions: first along the rows,
                # then along the columns of the similarity matrix.
                cosine_similarities = cosine_similarities[
                    cluster_indexes][:, cluster_indexes]
                sentence_indexes = sentence_indexes[cluster_indexes]
                # features is only narrowed for 'DecisionTree', the one
                # algorithm whose fit() call below receives it.
                if self.algorithm_name == 'DecisionTree':
                    features = features[cluster_indexes, :]

                self.experiment.lines = len(sentence_indexes)

            if self.input_type_name == 'Raw Data' or self.input_type_name == 'Sports Data':
                print(self.input_type_name)
                # NOTE(review): hard-coded line count; presumably the size of
                # the bundled data set -- confirm before reusing elsewhere.
                self.experiment.lines = 602
                db_session.commit()

                arrays = self.similar_experiment_arrays(
                    filter_args=[(Experiment.lines, self.experiment.lines)])

                cosine_similarities = arrays['cosine_similarities']
                sentence_indexes = arrays['sentence_indexes']
                features = arrays['features']

            db_session.commit()

            # raise Exception('debugging')

            # END PREPROCESSING

            # Log preprocessing duration.
            self.experiment.preprocessing_seconds = time.time() - start_time
            db_session.commit()

            # START CLUSTERING
            print('starting clustering')
            clustering_start = time.time()

            # Set before get_model() is called; capped at 20 and at the
            # number of available sentences.
            self.CLUSTERS = min(20, len(sentence_indexes))
            model = self.get_model()

            # 'DecisionTree' is fitted with features as a second argument;
            # every other algorithm only receives the similarity matrix.
            if self.algorithm_name != 'DecisionTree':
                model.fit(cosine_similarities)
            else:
                model.fit(cosine_similarities, features)

            self.save_labels(sentence_indexes, model.labels_)
            # END CLUSTERING

            # Log clustering duration.
            self.experiment.clustering_seconds = time.time() - clustering_start
            self.experiment.status = 'finished'
            db_session.commit()
            logger.debug('finished')

        except Exception as e:
            # NOTE(review): this handler reads self.experiment, which is
            # first assigned inside the try block; if constructing the
            # Experiment fails and the attribute was not set elsewhere, the
            # handler itself raises and hides the original error.
            self.experiment.status = 'error'
            db_session.add(
                Traceback(experiment_id=self.experiment.id, message=str(e)))
            logger.exception(str(e))
            db_session.commit()
            # CACHED_ARRAYS is closed on the failure path only.
            CACHED_ARRAYS.close()
            raise e