コード例 #1
0
ファイル: test_tasks.py プロジェクト: the-deep/DEEPL
    def test_assign_cluster_to_doc(self):
        """A doc created after clustering gets a cluster via assign_cluster_to_doc()."""
        # start from a clean slate: exactly one freshly created cluster model
        ClusteringModel.objects.all().delete()
        create_new_clusters(self.cluster_name, self.group_id, self.n_clusters)
        assert ClusteringModel.objects.all().count() == 1
        cluster_model = ClusteringModel.objects.last()
        # create one more classified document after clustering already ran
        new_doc = ClassifiedDocument.objects.create(
            classifier=self.doc_sample.classifier,
            group_id=self.doc_sample.group_id,
            text="This is another text",
            classification_label="dummy_label"
        )
        docs_before = len(cluster_model.get_labels_data().keys())
        unclustered = get_unclustered_docs(cluster_model)
        assert unclustered, "There should be 1 unclustered doc"

        assign_cluster_to_doc(new_doc.id)

        refreshed = ClusteringModel.objects.last()
        docs_after = len(refreshed.get_labels_data().keys())
        # labels data is read from file; it must now include the new doc
        assert docs_after == docs_before + 1, "Since one doc is added"
        assert refreshed.ready
コード例 #2
0
ファイル: cluster_docs.py プロジェクト: the-deep/DEEPL
def create_document_clusters(name,
                             group_id,
                             n_clusters,
                             CLUSTER_CLASS=KMeansDocs,
                             doc2vec_group_id=None,
                             recreate=True):
    """
    Create document clusters (ClusteringModel object) based on input params.

    @name: name of the model
    @group_id: group_id of the model
    @n_clusters: number of clusters to create
    @CLUSTER_CLASS: class on which the clustering (KMeans) is based
    @doc2vec_group_id: relevant if CLUSTER_CLASS is KMeansDoc2Vec; get doc2vec
        model and load vectors from it
    @recreate: when False, an existing model for @group_id is an error;
        when True (default) it is re-created

    Raises Exception if a model with @group_id exists and @recreate is False.
    """
    # Guard against silently overwriting an existing model unless the caller
    # explicitly opted into recreation. filter().exists() avoids the
    # MultipleObjectsReturned that objects.get() could raise on duplicates.
    if not recreate and \
            ClusteringModel.objects.filter(group_id=group_id).exists():
        raise Exception(
            "Cluster model with group_id {} already exists".format(
                group_id))
    create_new_clusters(name, group_id, n_clusters, CLUSTER_CLASS,
                        doc2vec_group_id)
コード例 #3
0
 def test_with_valid_doc_id(self):
     """POSTing an existing doc_id should return integer similar-doc ids."""
     # clusters must exist before similar docs can be queried
     create_new_clusters("test_cluster", self.group_id, 2)
     response = self.client.post(self.url, {'doc_id': self.doc_id})
     assert response.status_code == 200
     body = response.json()
     assert 'similar_docs' in body
     # every similar doc is referenced by its integer id
     for similar_id in body['similar_docs']:
         assert isinstance(similar_id, int)
コード例 #4
0
 def test_valid_doc_and_doc_id(self):
     """POSTing raw doc text with a group_id should return integer doc ids."""
     # clusters must exist before similar docs can be queried
     create_new_clusters("test_cluster", self.group_id, 2)
     payload = {
         'doc': 'aeroplane, pilot prime minister',
         'group_id': self.group_id
     }
     response = self.client.post(self.url, payload)
     assert response.status_code == 200
     body = response.json()
     assert 'similar_docs' in body
     # every similar doc is referenced by its integer id
     for similar_id in body['similar_docs']:
         assert isinstance(similar_id, int)
コード例 #5
0
 def test_data_files_created(self):
     """Cluster creation persists centers, labels and relevant-terms files.

     Each file must validate as a Resource and parse as JSON of the
     expected shape (dict / dict / list respectively).
     """
     model = create_new_clusters('test', self.group_id, 2)
     path = self.get_model_path(model)
     center_path = os.path.join(path, settings.CLUSTERS_CENTERS_FILENAME)
     labels_path = os.path.join(path,
                                settings.CLUSTERED_DOCS_LABELS_FILENAME)
     relevant_path = os.path.join(path, settings.RELEVANT_TERMS_FILENAME)
     center_resource = Resource(center_path, Resource.FILE)
     labels_resource = Resource(labels_path, Resource.FILE)
     relevant_resource = Resource(relevant_path, Resource.FILE)
     # check centers file
     try:
         center_resource.validate()
     except Exception as e:
         # BUG FIX: was `+ e.args` — concatenating str and tuple raises
         # TypeError at failure time and masks the real message
         assert False, "No center data stored. " + str(e)
     else:
         data = json.loads(center_resource.get_data())
         assert isinstance(data, dict)
     # check labels file
     try:
         labels_resource.validate()
     except Exception as e:
         assert False, "No levels data stored. " + str(e)
     else:
         data = json.loads(labels_resource.get_data())
         assert isinstance(data, dict)
     # check relevant-terms file
     try:
         relevant_resource.validate()
     except Exception as e:
         assert False, "No relevant data stored. " + str(e)
     else:
         data = json.loads(relevant_resource.get_data())
         assert isinstance(data, list)
         # at least one relevant-terms entry should have 2+ items
         assert not all(map(lambda x: len(x) < 2, data))
コード例 #6
0
 def test_cluster_data_not_fully_clustered(self):
     """A doc added after clustering leaves the model partially clustered."""
     # remove existing clusters and create a single fresh model
     ClusteringModel.objects.all().delete()
     cluster_model = create_new_clusters("test", self.group_id,
                                         self.num_clusters)
     # this doc arrives after clustering ran, so it is unclustered
     ClassifiedDocument.objects.create(text="test test test",
                                       group_id=self.group_id,
                                       classifier=self.classifier)
     resp = self.client.get(self.url, {'cluster_model_id': cluster_model.id})
     assert resp.status_code == 200
     data = resp.json()
     for key in ('keywords', 'docs', 'full_clustered'):
         assert key in data
     assert not data[
         'full_clustered'], "If doc is added, model should not be fully clustered"  # noqa
     assert isinstance(data['keywords'], list)
     # each keyword entry carries its cluster, score and value
     for keyword in data['keywords']:
         assert isinstance(keyword, dict)
         for field in ('cluster', 'score', 'value'):
             assert field in keyword
コード例 #7
0
 def test_cluster_data_fully_clustered(self):
     """A freshly created model reports full_clustered with docs per cluster."""
     # remove existing clusters and create a single fresh model
     ClusteringModel.objects.all().delete()
     cluster_model = create_new_clusters("test", self.group_id,
                                         self.num_clusters)
     resp = self.client.get(self.url, {'cluster_model_id': cluster_model.id})
     assert resp.status_code == 200
     data = resp.json()
     assert isinstance(data, dict)
     assert 'full_clustered' in data
     assert data[
         'full_clustered'], "Recently created model should be fully clustered"  # noqa
     assert 'keywords' in data
     assert 'docs' in data
     assert isinstance(data['docs'], dict)
     # each keyword entry carries its cluster, score and value
     for keyword in data['keywords']:
         assert isinstance(keyword, dict)
         for field in ('cluster', 'score', 'value'):
             assert field in keyword
     # every cluster label maps to a non-empty list of integer doc ids
     for label, doc_ids in data['docs'].items():
         assert isinstance(doc_ids, list)
         assert doc_ids, "Docs should not be empty for a cluster"
         for doc_id in doc_ids:
             assert isinstance(doc_id, int)
コード例 #8
0
ファイル: test_tasks.py プロジェクト: the-deep/DEEPL
 def test_recluster(self):
     """recluster() refreshes timestamps and keeps score-vs-size data valid."""
     # start from a clean slate: exactly one freshly created cluster model
     ClusteringModel.objects.all().delete()
     create_new_clusters(self.cluster_name, self.group_id, self.n_clusters)
     assert ClusteringModel.objects.all().count() == 1
     model = ClusteringModel.objects.last()
     recluster(model)
     reclustered = ClusteringModel.objects.last()
     assert reclustered.ready
     # timestamps must not move backwards after reclustering
     assert reclustered.last_clustering_started \
         >= model.last_clustering_started
     assert reclustered.last_clustered_on >= model.last_clustered_on
     # the size-vs-score file should hold a non-empty list
     score_data = model.get_cluster_score_vs_size_data()
     assert score_data is not None
     assert isinstance(score_data, list)
     assert score_data != []
コード例 #9
0
ファイル: test_tasks.py プロジェクト: the-deep/DEEPL
 def test_new_cluster_created(self):
     """A new cluster model is created, ready, with its data dir on disk."""
     assert ClusteringModel.objects.all().count() == 0
     create_new_clusters(self.cluster_name, self.group_id, self.n_clusters)
     assert ClusteringModel.objects.all().count() == 1
     model = ClusteringModel.objects.last()
     assert model.group_id == self.group_id
     assert model.ready
     # the model's data directory must exist on disk
     model_dir = os.path.join(
         self.test_cluster_data_dir,
         "cluster_model_{}".format(model.id)
     )
     assert os.path.isdir(model_dir)
     # the size-vs-score file should hold a non-empty list
     score_data = model.get_cluster_score_vs_size_data()
     assert score_data is not None
     assert isinstance(score_data, list)
     assert score_data != []
コード例 #10
0
 def test_clustered_prepared_resposne(self):
     """POSTing valid params returns 201 with the existing cluster model id."""
     # NOTE(review): "resposne" typo in the name kept — renaming would
     # change the test id seen by the runner
     existing_model = create_new_clusters("test_cluster", self.group_id, 2)
     response = self.client.post(self.api_url, self.valid_params)
     assert response.status_code == 201
     body = response.json()
     assert 'cluster_model_id' in body
     assert isinstance(body['cluster_model_id'], int)
     # the API should hand back the model we just created
     assert body['cluster_model_id'] == existing_model.id
コード例 #11
0
 def test_all_clustered_if_docs_added(self):
     """all_clustered flips to False as soon as a new doc joins the group."""
     # remove existing cluster models and create one fresh
     ClusteringModel.objects.all().delete()
     cluster_model = create_new_clusters('test', self.group_id, 2)
     assert cluster_model.all_clustered, "All docs should be clustered while cluster model is created new"  # noqa
     # a new doc in the same group invalidates full clustering
     ClassifiedDocument.objects.create(text="test text",
                                       classifier=self.classifier,
                                       group_id=self.group_id)
     # re-fetch to observe the updated flag
     refetched = ClusteringModel.objects.get(id=cluster_model.id)
     assert not refetched.all_clustered, "all_clustered should be false whenever a new doc is added"  # noqa
コード例 #12
0
 def test_cluster_data_not_ready(self):
     """Querying a not-ready model yields 202 with an explanatory message."""
     # remove existing clusters and create one fresh model
     ClusteringModel.objects.all().delete()
     cluster_model = create_new_clusters("test", self.group_id,
                                         self.num_clusters)
     # force the model into the not-yet-ready state
     cluster_model.ready = False
     cluster_model.save()
     resp = self.client.get(self.url, {'cluster_model_id': cluster_model.id})
     assert resp.status_code == 202
     body = resp.json()
     assert 'message' in body
コード例 #13
0
ファイル: test_tasks.py プロジェクト: the-deep/DEEPL
    def test_update_cluster(self):
        """update_clusters() should pick up a doc added after clustering.

        Verifies that the score-vs-size history grows by one entry, the
        labels file gains the new doc, and the model's timestamps advance.
        """
        # first remove all existing clusters
        ClusteringModel.objects.all().delete()
        # and then create a new cluster
        create_new_clusters(self.cluster_name, self.group_id, self.n_clusters)
        assert ClusteringModel.objects.all().count() == 1
        model = ClusteringModel.objects.last()
        # add a new ClassifiedDocument; it is unclustered until
        # update_clusters() runs
        ClassifiedDocument.objects.create(
            classifier=self.doc_sample.classifier,
            group_id=self.doc_sample.group_id,
            text="This is another text",
            classification_label="dummy_label"
        )
        # snapshot the labels count BEFORE updating (read from file)
        labels_data = model.get_labels_data()
        len_docs = len(labels_data.keys())
        unclustered = get_unclustered_docs(model)
        assert unclustered, "There should be 1 unclustered doc"
        # snapshot current score-vs-size data before updating
        old_data = model.get_cluster_score_vs_size_data()

        update_clusters()

        # the update should append exactly one score-vs-size entry,
        # keeping the previous history intact
        new_data = model.get_cluster_score_vs_size_data()
        assert len(old_data) + 1 == len(new_data)
        assert new_data[-2] == old_data[-1]

        newmodel = ClusteringModel.objects.last()
        labels_data = newmodel.get_labels_data()
        newlen_docs = len(labels_data.keys())
        # also check number of docs has increased. Read from file
        assert newlen_docs == len_docs + 1, "Since one doc is added"

        assert newmodel.ready
        # both timestamps must strictly advance after an update run
        assert newmodel.last_clustering_started > model.last_clustering_started
        assert newmodel.last_clustered_on > model.last_clustered_on
コード例 #14
0
 def test_get_cluster(self):
     """GET with model_id returns score, doc ids, relevant terms and group_id."""
     cluster_model = create_new_clusters("test_cluster", self.group_id, 2)
     params = {'model_id': cluster_model.id}
     response = self.client.get(self.api_url, params)
     assert response.status_code == 200
     data = response.json()
     assert 'score' in data
     # clustering score is expected within [-1, 1]
     assert -1 <= data['score'] <= 1
     assert 'doc_ids' in data
     assert isinstance(data['doc_ids'], list)
     for did in data['doc_ids']:
         # BUG FIX: original called isinstance() without `assert`,
         # silently discarding the type check
         assert isinstance(did, int)
     assert 'relevant_terms' in data
     for term in data['relevant_terms']:
         assert isinstance(term, str)
     assert 'group_id' in data
     assert isinstance(data['group_id'], str)
コード例 #15
0
 def setUp(self):
     """Point clustering storage at a scratch dir and create a base model."""
     self.cluster_data_path = 'test_clusters/'
     # os.makedirs instead of shelling out to `mkdir -p`: portable,
     # and raises on real failures instead of ignoring the exit status
     os.makedirs(self.cluster_data_path, exist_ok=True)
     os.environ[settings.ENVIRON_CLUSTERING_DATA_LOCATION] = \
         self.cluster_data_path
     # fixture values shared by the tests
     self.group_id = '1'
     self.num_clusters = 2
     self.url = '/api/re-cluster/'
     self.valid_params = {
         'group_id': self.group_id,
         'num_clusters': self.num_clusters
     }
     self.cluster_model = create_new_clusters("test_cluster", self.group_id,
                                              self.num_clusters)
     # this creates token and adds that to client header
     super().setUp()
コード例 #16
0
 def test_data_files_created(self):
     """All cluster data files exist and parse after a fresh clustering run.

     Checks centers, labels, relevant terms, size-vs-score data and the
     score plot file.
     """
     # First remove existing cluster models
     ClusteringModel.objects.all().delete()
     # create one
     model = create_new_clusters('test', self.group_id, 2)
     assert model.all_clustered, "All docs should be clustered while cluster model is created new"  # noqa
     path = self.get_model_path(model)
     center_path = os.path.join(path, settings.CLUSTERS_CENTERS_FILENAME)
     labels_path = os.path.join(path,
                                settings.CLUSTERED_DOCS_LABELS_FILENAME)
     relevant_path = os.path.join(path, settings.RELEVANT_TERMS_FILENAME)
     size_score_path = os.path.join(
         path, settings.CLUSTER_SCORE_DOCS_SIZE_FILENAME)
     size_score_fig_path = os.path.join(
         path, settings.CLUSTERS_SCORE_PLOT_FILENAME)
     center_resource = Resource(center_path, Resource.FILE)
     labels_resource = Resource(labels_path, Resource.FILE)
     relevant_resource = Resource(relevant_path, Resource.FILE)
     size_score_resource = Resource(size_score_path, Resource.FILE)
     size_score_fig_resource = Resource(size_score_fig_path, Resource.FILE)
     # check centers
     try:
         center_resource.validate()
     except Exception as e:
         # BUG FIX: was `+ e.args` — concatenating str and tuple raises
         # TypeError and masks the real message (same fix applied below)
         assert False, "No center data stored. " + str(e)
     else:
         data = json.loads(center_resource.get_data())
         assert isinstance(data, dict)
     # check labels
     try:
         labels_resource.validate()
     except Exception as e:
         assert False, "No levels data stored. " + str(e)
     else:
         data = json.loads(labels_resource.get_data())
         assert isinstance(data, dict)
     # check relevant
     try:
         relevant_resource.validate()
     except Exception as e:
         assert False, "No relevant data stored. " + str(e)
     else:
         data = json.loads(relevant_resource.get_data())
         assert isinstance(data, dict)
         for k, v in data.items():
             assert isinstance(v, list)
     # check size vs score
     try:
         size_score_resource.validate()
     except Exception as e:
         assert False, "No score data stored. " + str(e)
     else:
         data = json.loads(size_score_resource.get_data())
         assert isinstance(data, list)
         for x in data:
             assert isinstance(x, list)
             assert len(x) == 2
     # check plot saved
     try:
         size_score_fig_resource.validate()
     except Exception as e:
         assert False, "No score plot stored. " + str(e)