コード例 #1
0
    def test_pair_match_with_duplicate_messageId(self):
        # Scenario: 100 client/producer pairs used exactly as the helper
        # builds them, plus 100 pairs whose messageId is overwritten with one
        # shared value. A single corrector batch is expected to report every
        # raw document and to leave one clean document per pair, all carrying
        # the same matchingType.

        # Start from an empty database with fresh indexes
        settings = Settings()
        settings.logger.info('[test] test_pair_match_with_duplicate_messageId')
        mongodb_h = cl_db_handler.MongoDBHandler(
            settings.MONGODB_USER,
            settings.MONGODB_PWD,
            settings.MONGODB_SERVER)
        mongodb_h.remove_all()
        mongodb_h.create_indexes()

        # Scenario sizes; the limit exceeds the number of raw documents
        settings.CORRECTOR_DOCUMENTS_LIMIT = 10000
        unique_id_pairs = 100
        shared_id_pairs = 100
        expected_pairs = unique_id_pairs + shared_id_pairs
        expected_raw = 2 * expected_pairs

        raw_documents = []

        # Pairs left untouched after creation
        for _ in range(unique_id_pairs):
            client, producer = ci_helper.create_raw_document_pair()
            raw_documents.extend((client.copy(), producer.copy()))

        # Pairs forced onto one common messageId
        shared_message_id = "abcde"
        for _ in range(shared_id_pairs):
            client, producer = ci_helper.create_raw_document_pair()
            for document in (client, producer):
                document['messageId'] = shared_message_id
            raw_documents.extend((client.copy(), producer.copy()))

        # Store everything as raw documents
        mongodb_h.add_raw_documents(raw_documents)

        # One corrector run; 'doc_len' starts at -1 and is overwritten by run()
        process_dict = {'doc_len': -1}
        CorrectorBatch(settings).run(process_dict)

        # Every raw document was seen by the batch
        self.assertEqual(process_dict['doc_len'], expected_raw)

        # One clean document per client/producer pair
        clean_docs = mongodb_h.get_clean_documents()
        self.assertEqual(len(clean_docs), expected_pairs)

        # Exactly one distinct matchingType among the clean documents
        matching_types = {doc['matchingType'] for doc in clean_docs}
        self.assertEqual(len(matching_types), 1)

        # Leave the database empty for the next test
        mongodb_h.remove_all()
コード例 #2
0
    def test_simple_duplicates_match(self):
        # Scenario: 50 client/producer pairs (100 distinct raw documents), each
        # document inserted 5 times in total (500 raw documents). One corrector
        # batch is expected to report all 500 documents, leave 100 raw
        # documents behind and produce 50 clean documents that share a single
        # matchingType.

        # Start from an empty database with fresh indexes
        settings = Settings()
        settings.logger.info('[test] test_simple_duplicates_match')
        mongodb_h = cl_db_handler.MongoDBHandler(
            settings.MONGODB_USER,
            settings.MONGODB_PWD,
            settings.MONGODB_SERVER)
        mongodb_h.remove_all()
        mongodb_h.create_indexes()

        # Scenario sizes; the limit exceeds the number of raw documents
        settings.CORRECTOR_DOCUMENTS_LIMIT = 10000
        distinct_count = 100
        duplicate_count = 400
        inserted_count = distinct_count + duplicate_count
        pair_count = distinct_count // 2

        # Build the distinct documents: client first, then producer, per pair
        distinct_documents = []
        for _ in range(pair_count):
            client, producer = ci_helper.create_raw_document_pair()
            distinct_documents.extend((client.copy(), producer.copy()))

        # Cycle through the distinct documents until the full count is reached
        raw_documents = [
            distinct_documents[position % distinct_count].copy()
            for position in range(inserted_count)
        ]
        mongodb_h.add_raw_documents(raw_documents)

        # One corrector run; 'doc_len' starts at -1 and is overwritten by run()
        process_dict = {'doc_len': -1}
        CorrectorBatch(settings).run(process_dict)

        # Every inserted raw document was seen by the batch
        self.assertEqual(process_dict['doc_len'], inserted_count)

        # One clean document per distinct pair
        clean_docs = mongodb_h.get_clean_documents()
        self.assertEqual(len(clean_docs), pair_count)

        # Only the distinct raw documents remain once duplicates are removed
        remaining_raw = mongodb_h.get_raw_documents()
        self.assertEqual(len(remaining_raw), distinct_count)

        # Exactly one distinct matchingType among the clean documents
        matching_types = {doc['matchingType'] for doc in clean_docs}
        self.assertEqual(len(matching_types), 1)

        # Leave the database empty for the next test
        mongodb_h.remove_all()
コード例 #3
0
    def test_multiple_orphan_pair_match(self):
        # Scenario: 50 pairs created with orphan_match=True (100 raw
        # documents), processed in 5 corrector runs with a document limit of
        # 20. The runs together are expected to report 100 documents and to
        # produce 50 clean documents that share a single matchingType.

        # Start from an empty database with fresh indexes
        settings = Settings()
        settings.logger.info('[test] test_multiple_orphan_pair_match')
        mongodb_h = cl_db_handler.MongoDBHandler(
            settings.MONGODB_USER,
            settings.MONGODB_PWD,
            settings.MONGODB_SERVER)
        mongodb_h.remove_all()
        mongodb_h.create_indexes()

        # Scenario sizes: 5 runs x limit 20 = 100 raw documents
        settings.CORRECTOR_DOCUMENTS_LIMIT = 20
        expected_raw = 100
        expected_pairs = expected_raw // 2
        run_count = 5

        # Insert the pairs: client first, then producer, per pair
        raw_documents = []
        for _ in range(expected_pairs):
            client, producer = ci_helper.create_raw_document_pair(
                orphan_match=True)
            raw_documents.extend((client.copy(), producer.copy()))
        mongodb_h.add_raw_documents(raw_documents)

        # Run the same batch object repeatedly, summing the reported counts
        c_batch = CorrectorBatch(settings)
        docs_processed = 0
        for _ in range(run_count):
            process_dict = {'doc_len': -1}
            c_batch.run(process_dict)
            # -1 is the sentinel set above; every run must overwrite it
            self.assertNotEqual(process_dict['doc_len'], -1)
            docs_processed += process_dict['doc_len']

        # All raw documents were seen across the runs
        self.assertEqual(docs_processed, expected_raw)

        # One clean document per client/producer pair
        clean_docs = mongodb_h.get_clean_documents()
        self.assertEqual(len(clean_docs), expected_pairs)

        # Exactly one distinct matchingType among the clean documents
        matching_types = {doc['matchingType'] for doc in clean_docs}
        self.assertEqual(len(matching_types), 1)

        # Leave the database empty for the next test
        mongodb_h.remove_all()
コード例 #4
0
    def test_simple_orphan_pair_match(self):
        # Scenario: a single pair created with orphan_match=True (2 raw
        # documents). One corrector batch is expected to report 2 documents
        # and to produce one clean document whose matchingType is
        # 'orphan_pair'.

        # Start from an empty database with fresh indexes
        settings = Settings()
        settings.logger.info('[test] test_simple_orphan_pair_match')
        mongodb_h = cl_db_handler.MongoDBHandler(
            settings.MONGODB_USER,
            settings.MONGODB_PWD,
            settings.MONGODB_SERVER)
        mongodb_h.remove_all()
        mongodb_h.create_indexes()

        # Scenario sizes; the limit exceeds the number of raw documents
        settings.CORRECTOR_DOCUMENTS_LIMIT = 10
        raw_count = 2

        # Insert the pair: client first, then producer
        raw_documents = []
        for _ in range(raw_count // 2):
            client, producer = ci_helper.create_raw_document_pair(
                orphan_match=True)
            raw_documents.extend((client.copy(), producer.copy()))
        mongodb_h.add_raw_documents(raw_documents)

        # One corrector run; 'doc_len' starts at -1 and is overwritten by run()
        process_dict = {'doc_len': -1}
        CorrectorBatch(settings).run(process_dict)

        # Both raw documents were seen by the batch
        self.assertEqual(process_dict['doc_len'], 2)

        # The two raw documents collapse into one clean document
        clean_docs = mongodb_h.get_clean_documents()
        self.assertEqual(len(clean_docs), 1)

        # The clean document is classified as an orphan pair
        only_doc = clean_docs[0]
        self.assertEqual(only_doc['matchingType'], 'orphan_pair')

        # Leave the database empty for the next test
        mongodb_h.remove_all()
コード例 #5
0
    def test_multiple_many_matchingType_match(self):
        # Scenario (25 raw documents, shuffled before insertion):
        #   5 pairs created with orphan_match=True (10 docs)
        #   5 pairs created with default arguments (10 docs)
        #   5 client documents without their producer (5 docs)
        # Five corrector runs with a document limit of 5 are expected to
        # report all 25 documents and to produce 15 clean documents: 5 each
        # of 'orphan_pair', 'regular_pair' and 'orphan'.

        # Clean database state
        settings = Settings()
        # Log this test's own name (it previously logged the name of
        # test_multiple_orphan_pair_match, a copy-paste slip).
        settings.logger.info('[test] test_multiple_many_matchingType_match')
        mongodb_h = cl_db_handler.MongoDBHandler(settings.MONGODB_USER,
                                                 settings.MONGODB_PWD,
                                                 settings.MONGODB_SERVER)
        mongodb_h.remove_all()
        mongodb_h.create_indexes()

        # Configure test scenario
        settings.CORRECTOR_DOCUMENTS_LIMIT = 5
        total_steps = 5
        orphan_pair_count = 5
        regular_pair_count = 5
        orphan_count = 5
        # 2 documents per pair + 1 per orphan = 25 raw documents
        total_raw_documents = (2 * orphan_pair_count
                               + 2 * regular_pair_count
                               + orphan_count)
        # One clean document per pair and per orphan = 15 clean documents
        total_clean_documents = (orphan_pair_count
                                 + regular_pair_count
                                 + orphan_count)

        raw_documents = []
        # Orphan pairs
        for _ in range(orphan_pair_count):
            client, producer = ci_helper.create_raw_document_pair(
                orphan_match=True)
            raw_documents.append(client.copy())
            raw_documents.append(producer.copy())
        # Regular pairs
        for _ in range(regular_pair_count):
            client, producer = ci_helper.create_raw_document_pair()
            raw_documents.append(client.copy())
            raw_documents.append(producer.copy())
        # Orphans: keep only the client document, drop the producer
        for _ in range(orphan_count):
            client, _producer = ci_helper.create_raw_document_pair()
            raw_documents.append(client.copy())

        # Mix the three groups so the insertion order is not grouped by kind
        random.shuffle(raw_documents)
        mongodb_h.add_raw_documents(raw_documents)

        # Run Corrector Multiple Steps
        c_batch = CorrectorBatch(settings)
        docs_processed = 0
        for _ in range(total_steps):
            process_dict = {'doc_len': -1}
            c_batch.run(process_dict)
            # -1 is the sentinel set above; every run must overwrite it
            self.assertNotEqual(process_dict['doc_len'], -1)
            docs_processed += process_dict['doc_len']

        # Total raw documents
        self.assertEqual(docs_processed, total_raw_documents)
        clean_docs = mongodb_h.get_clean_documents()

        # One clean document per pair and per orphan
        self.assertEqual(len(clean_docs), total_clean_documents)

        # Count the clean documents per matchingType
        matching_type_freq = {}
        for doc in clean_docs:
            key = doc['matchingType']
            matching_type_freq[key] = matching_type_freq.get(key, 0) + 1

        # Each kind must appear exactly as often as it was inserted
        self.assertEqual(matching_type_freq.get('orphan', None), orphan_count)
        self.assertEqual(matching_type_freq.get('regular_pair', None),
                         regular_pair_count)
        self.assertEqual(matching_type_freq.get('orphan_pair', None),
                         orphan_pair_count)

        # Clean before exit
        mongodb_h.remove_all()
コード例 #6
0
    def test_multiple_all_producer_duplicates_match(self):
        # Scenario: one producer document inserted 100 times (1 original plus
        # 99 duplicates) with no client document. Ten corrector runs with a
        # document limit of 10 are expected to report all 100 documents and
        # to produce a single clean document whose matchingType is 'orphan'.

        # Start from an empty database with fresh indexes
        settings = Settings()
        settings.logger.info(
            '[test] test_multiple_all_producer_duplicates_match')
        mongodb_h = cl_db_handler.MongoDBHandler(
            settings.MONGODB_USER,
            settings.MONGODB_PWD,
            settings.MONGODB_SERVER)
        mongodb_h.remove_all()
        mongodb_h.create_indexes()

        # Scenario sizes: 10 runs x limit 10 = 100 raw documents
        settings.CORRECTOR_DOCUMENTS_LIMIT = 10
        run_count = 10
        distinct_count = 1
        duplicate_count = 99
        inserted_count = distinct_count + duplicate_count
        expected_clean = 1

        # Build one pair and keep only its producer side as the template
        _client, producer = ci_helper.create_raw_document_pair()
        template = producer.copy()

        # Every inserted document is a copy of that single producer document
        raw_documents = [template.copy() for _ in range(inserted_count)]
        mongodb_h.add_raw_documents(raw_documents)

        # Run the same batch object repeatedly, summing the reported counts
        c_batch = CorrectorBatch(settings)
        docs_processed = 0
        for _ in range(run_count):
            process_dict = {'doc_len': -1}
            c_batch.run(process_dict)
            # -1 is the sentinel set above; every run must overwrite it
            self.assertNotEqual(process_dict['doc_len'], -1)
            docs_processed += process_dict['doc_len']

        # All inserted documents were seen across the runs
        self.assertEqual(docs_processed, inserted_count)

        # The duplicates collapse into a single clean document
        clean_docs = mongodb_h.get_clean_documents()
        self.assertEqual(len(clean_docs), expected_clean)

        # Exactly one distinct matchingType among the clean documents
        matching_types = {doc['matchingType'] for doc in clean_docs}
        self.assertEqual(len(matching_types), 1)

        # The lone producer document has no client, so it must be an orphan
        self.assertEqual(clean_docs[0]['matchingType'], 'orphan')

        # Leave the database empty for the next test
        mongodb_h.remove_all()