Code Example #1
File: indexer.py Project: ZackBotkin/stock-project
import json

from SolrClient import SolrClient


def index_data():
    # get_data() is a project-specific helper that returns a list of stock documents
    docs = get_data()

    client = SolrClient('http://localhost:8983/solr')

    # index_json() expects a JSON string, not a Python list
    client.index_json('stocks', json.dumps(docs))
    client.commit('stocks')
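
To verify the commit landed, the collection can be queried back with the same client. A minimal sketch, assuming the local Solr URL and the 'stocks' collection from the example above:

from SolrClient import SolrClient

client = SolrClient('http://localhost:8983/solr')
res = client.query('stocks', {'q': '*:*', 'rows': 10})
print(res.get_num_found())  # total number of matching documents
for doc in res.docs[:3]:    # peek at a few of the indexed documents
    print(doc)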
Code Example #2
File: indexer.py Project: ZackBotkin/stock-project
import json

from SolrClient import SolrClient


def index_json():
    client = SolrClient('http://localhost:8983/solr')

    docs = [
        {'id': '8', 'field8': 'value8'},
    ]

    # index_json() expects a JSON string, not a Python list
    client.index_json('test', json.dumps(docs))
    client.commit('test')
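
To undo the test document, the client's delete helper (used throughout the test suites below) can remove it by id. A minimal sketch:

from SolrClient import SolrClient

client = SolrClient('http://localhost:8983/solr')
client.delete_doc_by_id('test', '8')  # remove the document indexed above
client.commit('test')                 # make the deletion visible to searches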
Code Example #3
    def test_index_multiproc(self):
        # Queue each doc as its own finalized file, then index with 10 worker threads
        index = IndexQ(test_config['indexqbase'], 'testq')
        solr = SolrClient(test_config['SOLR_SERVER'], devel=True,
                          auth=test_config['SOLR_CREDENTIALS'])
        solr.delete_doc_by_id(test_config['SOLR_COLLECTION'], '*')
        files = []
        for doc in self.docs:
            files.append(index.add(doc, finalize=True))
        index.index(solr, test_config['SOLR_COLLECTION'], threads=10)
        solr.commit(test_config['SOLR_COLLECTION'], openSearcher=True)
        for doc in self.docs:
            res = solr.query(test_config['SOLR_COLLECTION'],
                             {'q': 'id:{}'.format(doc['id'])})
            self.assertEqual(res.get_results_count(), 1)
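
IndexQ buffers documents on disk before they are pushed to Solr. A minimal sketch of that lifecycle, with placeholder paths and collection name; the import path for IndexQ is assumed to be the SolrClient package root:

from SolrClient import SolrClient, IndexQ

index = IndexQ('/tmp/indexq', 'myq', size=0)             # placeholder queue location
index.add({'id': '1', 'field': 'value'}, finalize=True)  # finalize closes the file for indexing
solr = SolrClient('http://localhost:8983/solr')
index.index(solr, 'mycollection', threads=4)             # drain queued files into Solr
solr.commit('mycollection', openSearcher=True)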
Code Example #4
class ClientTestQuery(unittest.TestCase):
    @classmethod
    def setUpClass(self):
        self.solr = SolrClient(test_config['SOLR_SERVER'][0],
                               devel=True,
                               auth=test_config['SOLR_CREDENTIALS'])
        self.rand_docs = RandomTestData()
        self.docs = self.rand_docs.get_docs(50)
        self.solr.delete_doc_by_id(test_config['SOLR_COLLECTION'], '*')

        for field in test_config['collections']['copy_fields']:
            try:
                self.solr.collections.delete_copy_field(
                    test_config['SOLR_COLLECTION'], field)
            except Exception:
                # Copy field may not exist yet
                pass
        for field in test_config['collections']['fields']:
            try:
                self.solr.collections.create_field(
                    test_config['SOLR_COLLECTION'], field)
            except Exception:
                # Field may already exist
                pass

        # Index some data
        self.solr.index_json(test_config['SOLR_COLLECTION'],
                             json.dumps(self.docs))
        self.solr.commit(test_config['SOLR_COLLECTION'], openSearcher=True)

    def test_basic_query(self):
        r = self.solr.query(test_config['SOLR_COLLECTION'], {'q': '*:*'})
        self.assertEqual(r.get_num_found(), len(self.docs))

    def test_facet(self):
        r = self.solr.query(test_config['SOLR_COLLECTION'], {
            'q': '*:*',
            'facet': 'true',
            'facet.field': 'facet_test',
        })
        local_facets = {}
        for doc in self.docs:
            try:
                local_facets[doc['facet_test']] += 1
            except KeyError:
                local_facets[doc['facet_test']] = 1
        try:
            self.assertDictEqual(local_facets, r.get_facets()['facet_test'])
        except Exception:
            logging.info("local")
            logging.info(local_facets)
            logging.info("facets")
            logging.info(r.get_facets())
            raise

    def test_facet_with_fq(self):
        r = self.solr.query(test_config['SOLR_COLLECTION'], {
            'q': '*:*',
            'facet': True,
            'facet.field': 'facet_test',
        })
        first_facet_field = list(r.get_facets()['facet_test'].keys())[0]
        first_facet_field_count = r.get_facets(
        )['facet_test'][first_facet_field]
        r = self.solr.query(
            test_config['SOLR_COLLECTION'], {
                'q': '*:*',
                'facet': True,
                'facet.field': 'facet_test',
                'fq': 'facet_test:{}'.format(first_facet_field)
            })
        self.assertEqual(r.get_num_found(), first_facet_field_count)

    def test_facet_range(self):
        res = self.solr.query(
            test_config['SOLR_COLLECTION'], {
                'q': '*:*',
                'facet': True,
                'facet.range': 'price',
                'facet.range.start': 0,
                'facet.range.end': 100,
                'facet.range.gap': 10
            })

        prices = [doc['price'] for doc in self.docs]
        div = lambda x: str(x // 10 * 10)
        out = {}
        for k, g in itertools.groupby(sorted(prices), div):
            out[k] = len(list(g)) or 0
        self.assertDictEqual(out, res.get_facets_ranges()['price'])

    def test_facet_pivot(self):
        res = self.solr.query(
            test_config['SOLR_COLLECTION'], {
                'q': '*:*',
                'facet': True,
                'facet.pivot': ['facet_test,price', 'facet_test,id']
            })
        out = {}
        for doc in self.docs:
            if doc['facet_test'] not in out:
                out[doc['facet_test']] = {}
            if doc['price'] not in out[doc['facet_test']]:
                out[doc['facet_test']][doc['price']] = 1
            else:
                out[doc['facet_test']][doc['price']] += 1
        self.assertDictEqual(out, res.get_facet_pivot()['facet_test,price'])

    def test_get_field_values_as_list(self):
        res = self.solr.query(test_config['SOLR_COLLECTION'], {
            'q': '*:*',
        })
        results = res.get_field_values_as_list('product_name_exact')
        docs = res.docs
        temp = []
        for doc in docs:
            if 'product_name_exact' in doc:
                temp.append(doc['product_name_exact'])
        self.assertEqual(results, temp)

    def test_get_facet_values_as_list(self):
        r = self.solr.query(
            test_config['SOLR_COLLECTION'], {
                'q': '*:*',
                'facet': 'true',
                'facet.limit': -1,
                'facet.field': 'facet_test',
            })
        self.assertEqual(
            sorted(r.data['facet_counts']['facet_fields']['facet_test'][1::2]),
            sorted(r.get_facet_values_as_list('facet_test')))

    def test_grouped_count_1(self):
        '''
        Checks ngroups counts for a grouped query.
        '''
        r = self.solr.query(
            test_config['SOLR_COLLECTION'], {
                'q': '*:*',
                'group': True,
                'group.field': 'id',
                'group.ngroups': True,
            })
        self.assertEqual(r.get_ngroups(), 50)
        self.assertEqual(r.get_ngroups('id'), 50)

    def test_grouped_docs(self):
        '''
        Get a dict of grouped docs
        '''
        r = self.solr.query(
            test_config['SOLR_COLLECTION'], {
                'q': '*:*',
                'group': True,
                'group.field': 'id',
                'group.ngroups': True,
            })
        self.assertEqual(len(r.docs), 10)
        self.assertTrue('doclist' in r.docs[0])

    def test_flat_groups(self):
        '''
        Gets grouped docs flattened into a single list.
        '''
        r = self.solr.query(test_config['SOLR_COLLECTION'], {
            'q': '*:*',
            'group': True,
            'group.field': 'id'
        })
        flats = r.get_flat_groups()
        self.assertEqual(len(flats), 10)
        self.assertTrue('date' in flats[0])

    def test_json_facet(self):
        '''
        Checks parsing of a JSON facet response into a nested dict.
        '''
        # Just a lazy way to get a new response object
        r = self.solr.query(test_config['SOLR_COLLECTION'], {'q': '*:*'})

        a = r.get_jsonfacet_counts_as_dict(
            'test', {
                'count': 50,
                'test': {
                    'buckets': [{
                        'count': 10,
                        'pr': {
                            'buckets': [{
                                'count': 2,
                                'unique': 1,
                                'val': 79
                            }, {
                                'count': 1,
                                'unique': 1,
                                'val': 9
                            }]
                        },
                        'pr_sum': 639.0,
                        'val': 'consectetur'
                    }, {
                        'count': 8,
                        'pr': {
                            'buckets': [
                                {
                                    'count': 1,
                                    'unique': 1,
                                    'val': 9
                                },
                                {
                                    'count': 1,
                                    'unique': 1,
                                    'val': 31
                                },
                                {
                                    'count': 1,
                                    'unique': 1,
                                    'val': 33
                                },
                            ]
                        },
                        'pr_sum': 420.0,
                        'val': 'auctor'
                    }, {
                        'count': 8,
                        'pr': {
                            'buckets': [
                                {
                                    'count': 2,
                                    'unique': 1,
                                    'val': 94
                                },
                                {
                                    'count': 1,
                                    'unique': 1,
                                    'val': 25
                                },
                            ]
                        },
                        'pr_sum': 501.0,
                        'val': 'nulla'
                    }]
                }
            })

        b = {
            'test': {
                'auctor': {
                    'count': 8,
                    'pr': {
                        9: {
                            'count': 1,
                            'unique': 1
                        },
                        31: {
                            'count': 1,
                            'unique': 1
                        },
                        33: {
                            'count': 1,
                            'unique': 1
                        }
                    },
                    'pr_sum': 420.0
                },
                'consectetur': {
                    'count': 10,
                    'pr': {
                        9: {
                            'count': 1,
                            'unique': 1
                        },
                        79: {
                            'count': 2,
                            'unique': 1
                        }
                    },
                    'pr_sum': 639.0
                },
                'nulla': {
                    'count': 8,
                    'pr': {
                        25: {
                            'count': 1,
                            'unique': 1
                        },
                        94: {
                            'count': 2,
                            'unique': 1
                        }
                    },
                    'pr_sum': 501.0
                }
            }
        }

        self.assertEqual(a, b)
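
The nested input to get_jsonfacet_counts_as_dict above has the shape Solr's JSON Facet API returns. For reference, a live query that might produce buckets of that shape could look like the sketch below; the facet_test, price, and id field names are assumptions carried over from the other tests, not something this test actually issues.

import json
from SolrClient import SolrClient

solr = SolrClient('http://localhost:8983/solr')
res = solr.query('mycollection', {
    'q': '*:*',
    'rows': 0,
    'json.facet': json.dumps({
        'test': {
            'type': 'terms',
            'field': 'facet_test',
            'facet': {
                'pr': {'type': 'terms', 'field': 'price',
                       'facet': {'unique': 'unique(id)'}},
                'pr_sum': 'sum(price)',
            },
        },
    }),
})
# Convert the raw 'facets' node into the nested dict shape asserted above
nested = res.get_jsonfacet_counts_as_dict('test', res.data['facets'])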
Code Example #5
class ReindexerTests(unittest.TestCase):

    # Methods to create the schema in the collections
    def create_fields(self):
        for coll in self.colls:
            logging.debug("Creating fields for {}".format(coll))
            for field in test_config['collections']['fields']:
                try:
                    self.solr.schema.create_field(coll, field)
                except ValueError:
                    # Field probably already exists
                    pass

    def create_copy_fields(self):
        for coll in self.colls:
            logging.debug("Creating copy fields for {}".format(coll))
            for field in test_config['collections']['copy_fields']:
                try:
                    self.solr.schema.create_copy_field(coll, field)
                except ValueError:
                    # Copy field probably already exists
                    pass

    def setUp(self):
        for coll in self.colls:
            self.solr.delete_doc_by_id(coll, '*')
        for coll in self.colls:
            self.solr.commit(coll, openSearcher=True)

    def _index_docs(self, numDocs, coll):
        '''
        Generates and indexes random data while maintaining counts of items in various date ranges.

        These counts in self.date_counts are used later to validate some reindexing methods.

        Brace yourself or have a drink.....
        '''
        self.docs = self.rand_docs.get_docs(numDocs)
        sdate = datetime.datetime.now() - datetime.timedelta(days=180)
        edate = datetime.datetime.now() + datetime.timedelta(days=30)
        self._start_date = sdate
        self._end_date = edate

        import random
        # Assign random times to the generated documents to spread them across multiple date ranges
        hours = (edate - sdate).days * 24
        hour_range = [x for x in range(int(hours))]
        self.date_counts = {}

        # Save the oldest and newest timestamps and assign them to the first and second docs
        self.docs[0]['date'] = sdate.isoformat() + 'Z'
        self.date_counts[sdate.date().isoformat()] = 1

        self.docs[1]['date'] = edate.isoformat() + 'Z'
        self.date_counts[edate.date().isoformat()] = 1

        for doc in self.docs[2:]:
            #Make a new date and store a count of it so I can compare later
            new_date = (sdate +
                        datetime.timedelta(hours=random.choice(hour_range)))
            new_date_s = new_date.date().isoformat()
            if new_date_s in self.date_counts:
                self.date_counts[new_date_s] += 1
            else:
                self.date_counts[new_date_s] = 1
            doc['date'] = new_date.isoformat() + 'Z'

        self.solr.index_json(coll, json.dumps(self.docs))
        self.solr.commit(coll, openSearcher=True)
        time.sleep(10)

    def get_all_json_from_indexq(self, index):
        files = index.get_all_as_list()
        out = []
        for efile in files:
            if efile.endswith('.gz'):
                f = gzip.open(efile, 'rt', encoding='utf-8')
            else:
                f = open(efile)
            f_data = json.load(f)
            f.close()
            out.extend(f_data)
        return out

    @classmethod
    def setUpClass(self):
        logging.debug("Starting to run Reindexer Tests")
        self.solr = SolrClient(test_config['SOLR_SERVER'][0],
                               devel=True,
                               auth=test_config['SOLR_CREDENTIALS'])
        self.colls = [
            test_config['SOLR_REINDEXER_COLLECTION_S'],
            test_config['SOLR_REINDEXER_COLLECTION_D']
        ]
        self.rand_docs = RandomTestData()

    def test_solr_to_indexq(self):
        '''
        Will export documents from Solr and put them into an IndexQ.
        '''
        index = IndexQ(test_config['indexqbase'], 'test_reindexer', size=0)
        for q_dir in ['_todo_dir', '_done_dir']:
            for path in index.get_all_as_list(dir=q_dir):
                os.remove(path)
        self._index_docs(5000, self.colls[0])
        reindexer = Reindexer(source=self.solr,
                              source_coll='source_coll',
                              dest=index)
        reindexer.reindex()
        from_files = self.get_all_json_from_indexq(index)
        from_solr = self.solr.query('source_coll', {
            'q': '*:*',
            'rows': 5000
        }).docs
        from_solr = reindexer._trim_fields(from_solr)
        self.assertEqual(sorted(from_files, key=lambda x: x['id']),
                         sorted(from_solr, key=lambda x: x['id']))

    def test_ignore_fields(self):
        '''
        Checks that the default ignore fields are set on the Reindexer.
        '''
        index = IndexQ(test_config['indexqbase'], 'test_reindexer', size=0)
        for q_dir in ['_todo_dir', '_done_dir']:
            for path in index.get_all_as_list(dir=q_dir):
                os.remove(path)
        reindexer = Reindexer(source=self.solr,
                              source_coll='source_coll',
                              dest=index)
        for field in ['_version_', 'product_name_exact']:
            self.assertTrue(field in reindexer._ignore_fields)

    def test_ignore_fields_disable(self):
        '''
        Checks that ignore_fields can be disabled entirely.
        '''
        index = IndexQ(test_config['indexqbase'], 'test_reindexer', size=0)
        reindexer = Reindexer(source=self.solr,
                              source_coll='source_coll',
                              dest=index,
                              ignore_fields=False)
        self.assertEqual(reindexer._ignore_fields, False)

    def test_ignore_fields_override(self):
        '''
        Checks to make sure ignore_fields override works
        '''
        index = IndexQ(test_config['indexqbase'], 'test_reindexer', size=0)
        reindexer = Reindexer(source=self.solr,
                              source_coll='source_coll',
                              dest=index,
                              ignore_fields=['_text_', '_any_other_field'])
        self.assertEqual(reindexer._ignore_fields,
                         ['_text_', '_any_other_field'])

    def test_get_copy_fields(self):
        '''
        Tests the method to get copy fields from Solr.
        '''
        reindexer = Reindexer(source=self.solr,
                              source_coll=self.colls[0],
                              dest=self.solr,
                              dest_coll='doesntmatter')
        self.assertEqual(reindexer._get_copy_fields(), [
            field['dest']
            for field in self.solr.schema.get_schema_copyfields(self.colls[0])
        ])

    def test_query_gen(self):
        '''
        Tests cursor-based query generation.
        '''
        reindexer = Reindexer(source=self.solr,
                              source_coll=self.colls[0],
                              dest=self.solr,
                              dest_coll='doesntmatter')
        self.assertEqual(
            reindexer._get_query('cursor'), {
                'cursorMark': 'cursor',
                'rows': reindexer._rows,
                'q': '*:*',
                'sort': 'id desc'
            })

    def test_query_gen_pershard_distrib(self):
        '''
        Tests that per-shard queries disable distributed search.
        '''
        reindexer = Reindexer(source=self.solr,
                              source_coll=self.colls[0],
                              dest=self.solr,
                              dest_coll='doesntmatter',
                              per_shard=True)
        q = reindexer._get_query('cursor')
        self.assertTrue('distrib' in q and q['distrib'] == 'false')

    def test_query_gen_date(self):
        '''
        Tests cursor query generation with a date field.
        '''
        reindexer = Reindexer(source=self.solr,
                              source_coll=self.colls[0],
                              dest=self.solr,
                              dest_coll='doesntmatter',
                              date_field='ddddd')
        self.assertEqual(
            reindexer._get_query('cursor'), {
                'cursorMark': 'cursor',
                'rows': reindexer._rows,
                'q': '*:*',
                'sort': 'ddddd asc, id desc'
            })

    def test_remove_copy_fields_from_data(self):
        index = IndexQ(test_config['indexqbase'], 'test_reindexer', size=0)
        for q_dir in ['_todo_dir', '_done_dir']:
            for path in index.get_all_as_list(dir=q_dir):
                os.remove(path)
        reindexer = Reindexer(source=self.solr,
                              source_coll='source_coll',
                              dest=index)
        reindexer.reindex()
        from_files = self.get_all_json_from_indexq(index)
        excluded_fields = reindexer._ignore_fields
        for doc in from_files:
            for field in excluded_fields:
                if field in doc:
                    print(doc)
                #self.assertTrue(field not in doc)

    def test_solr_to_solr(self):
        self._index_docs(50000, self.colls[0])
        reindexer = Reindexer(source=self.solr,
                              source_coll='source_coll',
                              dest=self.solr,
                              dest_coll='dest_coll')
        reindexer.reindex()
        # list.sort() returns None, so compare sorted copies of the docs
        self.assertEqual(
            sorted(self.solr.query(self.colls[0], {
                'q': '*:*',
                'rows': 10000000
            }).docs, key=lambda x: x['id']),
            sorted(self.solr.query(self.colls[1], {
                'q': '*:*',
                'rows': 10000000
            }).docs, key=lambda x: x['id']),
        )

    def test_solr_to_solr_with_date(self):
        self._index_docs(50000, self.colls[0])
        solr = SolrClient(test_config['SOLR_SERVER'][0],
                          devel=True,
                          auth=test_config['SOLR_CREDENTIALS'])
        reindexer = Reindexer(source=solr,
                              source_coll='source_coll',
                              dest=solr,
                              dest_coll='dest_coll',
                              date_field='index_date')
        reindexer.reindex()
        try:
            self.assertTrue(solr.transport._action_log[1]['params']['params']
                            ['sort'] == 'index_date asc, id desc')
        except KeyError:
            self.assertTrue(solr.transport._action_log[2]['params']['params']
                            ['sort'] == 'index_date asc, id desc')
        # list.sort() returns None, so compare sorted copies of the docs
        self.assertEqual(
            sorted(solr.query(self.colls[0], {
                'q': '*:*',
                'rows': 10000000
            }).docs, key=lambda x: x['id']),
            sorted(solr.query(self.colls[1], {
                'q': '*:*',
                'rows': 10000000
            }).docs, key=lambda x: x['id']),
        )

    def test_get_edge_date(self):
        '''
        Checks to make sure _get_edge_date returns correct start and end dates.
        '''
        self._index_docs(50000, self.colls[0])
        solr = SolrClient(test_config['SOLR_SERVER'][0],
                          devel=True,
                          auth=test_config['SOLR_CREDENTIALS'])
        reindexer = Reindexer(source=solr,
                              source_coll='source_coll',
                              dest=solr,
                              dest_coll='dest_coll',
                              date_field='index_date')
        solr_end_date_string = reindexer._get_edge_date('date', 'desc')
        solr_start_date_string = reindexer._get_edge_date('date', 'asc')
        self.assertEqual(
            self._start_date.date(),
            datetime.datetime.strptime(solr_start_date_string,
                                       '%Y-%m-%dT%H:%M:%S.%fZ').date())
        self.assertEqual(
            self._end_date.date(),
            datetime.datetime.strptime(solr_end_date_string,
                                       '%Y-%m-%dT%H:%M:%S.%fZ').date())

    def test_get_date_range_query(self):
        '''
        Checks the date_range_query generation function. Since it's pretty simple, running all the tests as one
        '''
        solr = SolrClient(test_config['SOLR_SERVER'][0],
                          devel=True,
                          auth=test_config['SOLR_CREDENTIALS'])
        reindexer = Reindexer(source=solr,
                              source_coll='source_coll',
                              dest=solr,
                              dest_coll='dest_coll',
                              date_field='index_date')
        self.assertEqual(
            reindexer._get_date_range_query('2015-11-10', '2015-12-11'), {
                'rows': 0,
                'facet.range.end': '2015-12-11',
                'facet': 'true',
                'facet.range': 'index_date',
                'facet.range.start': '2015-11-10',
                'q': '*:*',
                'facet.range.include': 'all',
                'facet.range.gap': '+1DAY'
            })
        self.assertEqual(
            reindexer._get_date_range_query('2015-11-10',
                                            '2015-12-11',
                                            date_field='date123'),
            {
                'rows': 0,
                'facet.range.end': '2015-12-11',
                'facet': 'true',
                'facet.range': 'date123',
                'facet.range.start': '2015-11-10',
                'q': '*:*',
                'facet.range.include': 'all',
                'facet.range.gap': '+1DAY'
            })
        self.assertEqual(
            reindexer._get_date_range_query('2015-11-10',
                                            '2015-12-11',
                                            date_field='date123',
                                            timespan='MONTH'),
            {
                'rows': 0,
                'facet.range.end': '2015-12-11',
                'facet': 'true',
                'facet.range': 'date123',
                'facet.range.start': '2015-11-10',
                'q': '*:*',
                'facet.range.include': 'all',
                'facet.range.gap': '+1MONTH'
            })
        self.assertEqual(
            reindexer._get_date_range_query('2015-11-10',
                                            '2015-12-11',
                                            timespan='MONTH'),
            {
                'rows': 0,
                'facet.range.end': '2015-12-11',
                'facet': 'true',
                'facet.range': 'index_date',
                'facet.range.start': '2015-11-10',
                'q': '*:*',
                'facet.range.include': 'all',
                'facet.range.gap': '+1MONTH'
            })

    def test_get_date_facet_counts(self):
        '''
        Checks _get_date_facet_counts. Makes sure the date ranges returned match what got indexed.
        '''
        self._index_docs(50000, self.colls[0])
        solr = SolrClient(test_config['SOLR_SERVER'][0],
                          devel=True,
                          auth=test_config['SOLR_CREDENTIALS'])
        reindexer = Reindexer(source=solr,
                              source_coll='source_coll',
                              dest=solr,
                              dest_coll='dest_coll',
                              date_field='date')
        #Testing this one
        source_facet, dest_facet = reindexer._get_date_facet_counts(
            'DAY', 'date', start_date=self._start_date.date().isoformat())
        for dt_range in source_facet:
            dt = datetime.datetime.strptime(
                dt_range, '%Y-%m-%dT%H:%M:%SZ').date().isoformat()
            if source_facet[dt_range] != self.date_counts[dt]:
                logging.info("{} - {} - {}".format(dt, source_facet[dt_range],
                                                   self.date_counts[dt]))
            self.assertEqual(source_facet[dt_range], self.date_counts[dt])

    def test_get_date_facet_counts_without_start_date(self):
        '''
        Checks _get_date_facet_counts without an explicit start date.
        '''
        self._index_docs(50000, self.colls[0])
        solr = SolrClient(test_config['SOLR_SERVER'][0],
                          devel=True,
                          auth=test_config['SOLR_CREDENTIALS'])
        reindexer = Reindexer(source=solr,
                              source_coll='source_coll',
                              dest=solr,
                              dest_coll='dest_coll',
                              date_field='date')
        #Testing this one
        source_facet, dest_facet = reindexer._get_date_facet_counts(
            'DAY', 'date')
        for dt_range in source_facet:
            dt = datetime.datetime.strptime(
                dt_range, '%Y-%m-%dT%H:%M:%SZ').date().isoformat()
            if source_facet[dt_range] != self.date_counts[dt]:
                logging.info("{} - {} - {}".format(dt, source_facet[dt_range],
                                                   self.date_counts[dt]))
            self.assertEqual(source_facet[dt_range], self.date_counts[dt])

    def test_get_date_facet_counts_not_day(self):
        '''
        Checks that _get_date_facet_counts rejects timespans other than DAY.
        '''
        self._index_docs(50000, self.colls[0])
        solr = SolrClient(test_config['SOLR_SERVER'][0],
                          devel=True,
                          auth=test_config['SOLR_CREDENTIALS'])
        reindexer = Reindexer(source=solr,
                              source_coll='source_coll',
                              dest=solr,
                              dest_coll='dest_coll',
                              date_field='date')
        #Testing this one
        with self.assertRaises(ValueError):
            source_facet, dest_facet = reindexer._get_date_facet_counts(
                'MONTH', 'date')

    ## These tests are focused on methods related to resuming re-indexing

    def test_solr_to_solr_resume_checkonly(self):
        '''
        Checks that resume(check=True) reports progress without indexing anything.
        '''
        self._index_docs(50000, self.colls[0])
        solr = SolrClient(test_config['SOLR_SERVER'][0],
                          devel=True,
                          auth=test_config['SOLR_CREDENTIALS'])
        reindexer = Reindexer(source=solr,
                              source_coll='source_coll',
                              dest=solr,
                              dest_coll='dest_coll',
                              date_field='date')
        #Make sure only source has data
        self.assertEqual(
            len(
                solr.query(self.colls[0], {
                    'q': '*:*',
                    'rows': 10000000
                }).docs), 50000)
        self.assertEqual(
            len(
                solr.query(self.colls[1], {
                    'q': '*:*',
                    'rows': 10000000
                }).docs), 0)
        reindexer.resume(check=True)
        #Makes sure nothing got indexed
        self.assertEqual(
            len(
                solr.query(self.colls[0], {
                    'q': '*:*',
                    'rows': 10000000
                }).docs), 50000)
        self.assertEqual(
            len(
                solr.query(self.colls[1], {
                    'q': '*:*',
                    'rows': 10000000
                }).docs), 0)

    def test_solr_to_solr_resume_basic(self):
        '''
        Checks that a basic resume() brings the destination collection in sync with the source.
        '''
        self._index_docs(50000, self.colls[0])
        solr = SolrClient(test_config['SOLR_SERVER'][0],
                          auth=test_config['SOLR_CREDENTIALS'])
        reindexer = Reindexer(source=solr,
                              source_coll='source_coll',
                              dest=solr,
                              dest_coll='dest_coll',
                              date_field='date')
        # Make sure only the source has data
        self.assertEqual(
            len(
                solr.query(self.colls[0], {
                    'q': '*:*',
                    'rows': 10000000
                }).docs), 50000)
        self.assertEqual(
            len(
                solr.query(self.colls[1], {
                    'q': '*:*',
                    'rows': 10000000
                }).docs), 0)
        reindexer.resume()
        sleep(10)
        # Make sure counts match up after reindex
        self.assertEqual(
            len(
                solr.query(self.colls[0], {
                    'q': '*:*',
                    'rows': 10000000
                }).docs),
            len(
                solr.query(self.colls[1], {
                    'q': '*:*',
                    'rows': 10000000
                }).docs))

    def test_solr_to_solr_reindex_and_resume(self):
        '''
        Only reindexes half of the collection on the first time. Then goes back and does a resume to make sure it works.
        '''
        self._index_docs(50000, self.colls[0])
        solr = SolrClient(test_config['SOLR_SERVER'][0],
                          auth=test_config['SOLR_CREDENTIALS'])
        reindexer = Reindexer(source=solr,
                              source_coll='source_coll',
                              dest=solr,
                              dest_coll='dest_coll',
                              date_field='date')
        # Make sure only the source has data
        self.assertEqual(
            len(
                solr.query(self.colls[0], {
                    'q': '*:*',
                    'rows': 10000000
                }).docs), 50000)
        self.assertEqual(
            len(
                solr.query(self.colls[1], {
                    'q': '*:*',
                    'rows': 10000000
                }).docs), 0)
        # This gets somewhat of a midpoint date in the range.
        midpoint = (datetime.datetime.now() - datetime.timedelta(days=(
            (self._end_date - self._start_date).days / 2)))
        #Reindex approximately half of the data by restricting FQ
        reindexer.reindex(
            fq=['date:[* TO {}]'.format(midpoint.isoformat() + 'Z')])
        sleep(10)
        #Make sure we have at least 20% of the data.
        dest_count = len(
            solr.query(self.colls[1], {
                'q': '*:*',
                'rows': 10000000
            }).docs)
        s_count = len(
            solr.query(self.colls[0], {
                'q': '*:*',
                'rows': 10000000
            }).docs)
        self.assertTrue(s_count > dest_count > s_count * .20)
        reindexer.resume()
        sleep(10)
        # Make sure counts match up after reindex
        self.assertEqual(
            len(
                solr.query(self.colls[0], {
                    'q': '*:*',
                    'rows': 10000000
                }).docs),
            len(
                solr.query(self.colls[1], {
                    'q': '*:*',
                    'rows': 10000000
                }).docs))

    def test_solr_to_solr_reindex_and_resume_reverse(self):
        '''
        Only reindexes half of the collection on the first time. Then goes back and does a resume to make sure it works.
        '''
        self._index_docs(50000, self.colls[0])
        solr = SolrClient(test_config['SOLR_SERVER'][0],
                          auth=test_config['SOLR_CREDENTIALS'])
        reindexer = Reindexer(source=solr,
                              source_coll='source_coll',
                              dest=solr,
                              dest_coll='dest_coll',
                              date_field='date')
        #Make sure only source has data
        self.assertEqual(
            len(
                solr.query(self.colls[0], {
                    'q': '*:*',
                    'rows': 10000000
                }).docs), 50000)
        self.assertEqual(
            len(
                solr.query(self.colls[1], {
                    'q': '*:*',
                    'rows': 10000000
                }).docs), 0)
        # This gets somewhat of a midpoint date in the range.
        midpoint = (datetime.datetime.now() - datetime.timedelta(days=(
            (self._end_date - self._start_date).days / 2)))
        #Reindex approximately half of the data by restricting FQ
        reindexer.reindex(
            fq=['date:[{} TO *]'.format(midpoint.isoformat() + 'Z')])
        sleep(10)
        #Make sure we have at least 20% of the data.
        dest_count = len(
            solr.query(self.colls[1], {
                'q': '*:*',
                'rows': 10000000
            }).docs)
        s_count = len(
            solr.query(self.colls[0], {
                'q': '*:*',
                'rows': 10000000
            }).docs)
        self.assertTrue(s_count > dest_count > s_count * .20)
        reindexer.resume()
        sleep(10)
        # Make sure counts match up after reindex
        self.assertEqual(
            len(
                solr.query(self.colls[0], {
                    'q': '*:*',
                    'rows': 10000000
                }).docs),
            len(
                solr.query(self.colls[1], {
                    'q': '*:*',
                    'rows': 10000000
                }).docs))

    def test_solr_to_solr_reindexer_per_shard(self):
        self._index_docs(50000, self.colls[0])
        solr = SolrClient(test_config['SOLR_SERVER'][0],
                          auth=test_config['SOLR_CREDENTIALS'])
        #Make sure only source has data
        self.assertEqual(
            len(
                solr.query(self.colls[0], {
                    'q': '*:*',
                    'rows': 10000000
                }).docs), 50000)
        self.assertEqual(
            len(
                solr.query(self.colls[1], {
                    'q': '*:*',
                    'rows': 10000000
                }).docs), 0)

        reindexer = Reindexer(source=solr,
                              source_coll='source_coll_shard1_replica1',
                              dest=solr,
                              dest_coll=self.colls[1],
                              per_shard=True,
                              date_field='date')
        reindexer.reindex()
        reindexer = Reindexer(source=solr,
                              source_coll='source_coll_shard2_replica1',
                              dest=solr,
                              dest_coll=self.colls[1],
                              per_shard=True,
                              date_field='date')
        reindexer.reindex()

        self.solr.commit(self.colls[1], openSearcher=True)
        #sloppy check over here, will improve later
        self.assertEqual(
            len(
                solr.query(self.colls[0], {
                    'q': '*:*',
                    'rows': 10000000
                }).docs),
            len(
                solr.query(self.colls[1], {
                    'q': '*:*',
                    'rows': 10000000
                }).docs))
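
Pulling the pattern out of the tests above: a typical Solr-to-Solr reindex with resume support looks roughly like the sketch below. The server URL and collection names are placeholders, and the Reindexer import path is assumed to be the SolrClient helpers package.

from SolrClient import SolrClient
from SolrClient.helpers import Reindexer

solr = SolrClient('http://localhost:8983/solr')
reindexer = Reindexer(source=solr, source_coll='source_coll',
                      dest=solr, dest_coll='dest_coll',
                      date_field='date')  # a date field enables resume()
reindexer.reindex()                       # full copy of the collection
solr.commit('dest_coll', openSearcher=True)
reindexer.resume()                        # compares per-day counts and fills any gaps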
Code Example #6
File: test_resp.py Project: cyberj0g/SolrClient
class ClientTestQuery(unittest.TestCase):
    @classmethod
    def setUpClass(self):
        self.solr = SolrClient(test_config['SOLR_SERVER'][0],
                               devel=True,
                               auth=test_config['SOLR_CREDENTIALS'])
        self.rand_docs = RandomTestData()
        self.docs = self.rand_docs.get_docs(50)
        self.solr.delete_doc_by_id(test_config['SOLR_COLLECTION'], '*')

        for field in test_config['collections']['copy_fields']:
            try:
                self.solr.collections.delete_copy_field(
                    test_config['SOLR_COLLECTION'], field)
            except Exception:
                # Copy field may not exist yet
                pass
        for field in test_config['collections']['fields']:
            try:
                self.solr.collections.create_field(
                    test_config['SOLR_COLLECTION'], field)
            except Exception:
                # Field may already exist
                pass

        # Index some data
        self.solr.index_json(test_config['SOLR_COLLECTION'],
                             json.dumps(self.docs))
        self.solr.commit(test_config['SOLR_COLLECTION'], openSearcher=True)

    def test_basic_query(self):
        r = self.solr.query(test_config['SOLR_COLLECTION'], {'q': '*:*'})
        self.assertEqual(r.get_num_found(), len(self.docs))

    def test_facet(self):
        r = self.solr.query(test_config['SOLR_COLLECTION'], {
            'q': '*:*',
            'facet': 'true',
            'facet.field': 'facet_test',
        })
        local_facets = {}
        for doc in self.docs:
            try:
                local_facets[doc['facet_test']] += 1
            except KeyError:
                local_facets[doc['facet_test']] = 1
        try:
            self.assertDictEqual(local_facets, r.get_facets()['facet_test'])
        except Exception:
            logging.info("local")
            logging.info(local_facets)
            logging.info("facets")
            logging.info(r.get_facets())
            raise

    def test_facet_with_fq(self):
        r = self.solr.query(test_config['SOLR_COLLECTION'], {
            'q': '*:*',
            'facet': True,
            'facet.field': 'facet_test',
        })
        first_facet_field = list(r.get_facets()['facet_test'].keys())[0]
        first_facet_field_count = r.get_facets(
        )['facet_test'][first_facet_field]
        r = self.solr.query(
            test_config['SOLR_COLLECTION'], {
                'q': '*:*',
                'facet': True,
                'facet.field': 'facet_test',
                'fq': 'facet_test:{}'.format(first_facet_field)
            })
        self.assertEqual(r.get_num_found(), first_facet_field_count)

    def test_facet_range(self):
        res = self.solr.query(
            test_config['SOLR_COLLECTION'], {
                'q': '*:*',
                'facet': True,
                'facet.range': 'price',
                'facet.range.start': 0,
                'facet.range.end': 100,
                'facet.range.gap': 10
            })

        prices = [doc['price'] for doc in self.docs]
        div = lambda x: str(x // 10 * 10)
        out = {}
        for k, g in itertools.groupby(sorted(prices), div):
            out[k] = len(list(g)) or 0
        self.assertDictEqual(out, res.get_facets_ranges()['price'])

    def test_facet_pivot(self):
        res = self.solr.query(
            test_config['SOLR_COLLECTION'], {
                'q': '*:*',
                'facet': True,
                'facet.pivot': ['facet_test,price', 'facet_test,id']
            })
        out = {}
        for doc in self.docs:
            if doc['facet_test'] not in out:
                out[doc['facet_test']] = {}
            if doc['price'] not in out[doc['facet_test']]:
                out[doc['facet_test']][doc['price']] = 1
            else:
                out[doc['facet_test']][doc['price']] += 1
        self.assertDictEqual(out, res.get_facet_pivot()['facet_test,price'])

    def test_get_field_values_as_list(self):
        res = self.solr.query(test_config['SOLR_COLLECTION'], {
            'q': '*:*',
        })
        results = res.get_field_values_as_list('product_name_exact')
        docs = res.docs
        temp = []
        for doc in docs:
            if 'product_name_exact' in doc:
                temp.append(doc['product_name_exact'])
        self.assertEqual(results, temp)

    def test_get_facet_values_as_list(self):
        r = self.solr.query(test_config['SOLR_COLLECTION'], {
            'q': '*:*',
            'facet': 'true',
            'facet.field': 'facet_test',
        })
        self.assertEqual(
            sorted(r.data['facet_counts']['facet_fields']['facet_test'][1::2]),
            sorted(r.get_facet_values_as_list('facet_test')))
Code Example #7
File: test_client.py Project: cyberj0g/SolrClient
class ClientTestIndexing(unittest.TestCase):
    # High-level client tests
    
    @classmethod
    def setUpClass(self):
        self.solr = SolrClient(test_config['SOLR_SERVER'][0], devel=True, auth=test_config['SOLR_CREDENTIALS'])
        self.rand_docs = RandomTestData()
        self.docs = self.rand_docs.get_docs(50)
        
        for field in test_config['collections']['copy_fields']:
            try:
                self.solr.schema.delete_copy_field(test_config['SOLR_COLLECTION'], field)
            except Exception:
                # Copy field may not exist yet
                pass
        for field in test_config['collections']['fields']:
            try:
                self.solr.schema.create_field(test_config['SOLR_COLLECTION'], field)
            except Exception:
                # Field may already exist
                pass
                
    def setUp(self):
        self.delete_docs()
        self.commit()
    
    def delete_docs(self):
        self.solr.delete_doc_by_id(test_config['SOLR_COLLECTION'],'*')
        self.commit()
        
    def commit(self):
        self.solr.commit(test_config['SOLR_COLLECTION'],openSearcher=True)
        sleep(5)
    
    @unittest.skip("Skipping for now")
    def test_access_without_auth(self):
        if not test_config['SOLR_CREDENTIALS'][0]:
            return
        solr = SolrClient(test_config['SOLR_SERVER'],devel=True)
        with self.assertRaises(ConnectionError) as cm:
            solr.query('SolrClient_unittest',{'q':'not_gonna_happen'})
            
    
    def test_indexing_json(self):
        self.docs = self.rand_docs.get_docs(53)
        self.solr.index_json(test_config['SOLR_COLLECTION'],json.dumps(self.docs))
        self.commit()
        sleep(5)
        for doc in self.docs:
            logging.debug("Checking {}".format(doc['id']))
            self.assertEqual(self.solr.query(test_config['SOLR_COLLECTION'],{'q':'id:{}'.format(doc['id'])}).get_num_found(),1)
        self.delete_docs()
        self.commit()
    
    def test_indexing_conn_log(self):
        self.docs = self.rand_docs.get_docs(53)
        self.solr.index_json(test_config['SOLR_COLLECTION'],json.dumps(self.docs))
        self.commit()
        sleep(5)
        for doc in self.docs:
            logging.debug("Checking {}".format(doc['id']))
            self.assertEqual(self.solr.query(test_config['SOLR_COLLECTION'],{'q':'id:{}'.format(doc['id'])}).get_num_found(),1)
        logging.info(self.solr.transport._action_log)
        self.delete_docs()
        self.commit()
    
    def test_index_json_file(self):
        self.docs = self.rand_docs.get_docs(55)
        with open('temp_file.json','w') as f:
            json.dump(self.docs,f)
        r = self.solr.stream_file(test_config['SOLR_COLLECTION'],'temp_file.json')
        self.commit()
        r = self.solr.query(test_config['SOLR_COLLECTION'],{'q':'*:*'})
        self.assertEqual(r.get_num_found(),len(self.docs))
        self.delete_docs()
        self.commit()
        try:
            os.remove('temp_file.json.gz')
            os.remove('temp_file.json')
        except OSError:
            pass
            
    
    def test_stream_file_gzip_file(self):
        self.docs = self.rand_docs.get_docs(60)
        with gzip.open('temp_file.json.gz','wb') as f:
            f.write(json.dumps(self.docs).encode('utf-8'))
        r = self.solr.stream_file(test_config['SOLR_COLLECTION'],'temp_file.json.gz')
        self.commit()
        r = self.solr.query(test_config['SOLR_COLLECTION'],{'q':'*:*'})
        self.assertEqual(r.get_num_found(),len(self.docs))
        self.delete_docs()
        self.commit()
        try:
            os.remove('temp_file.json.gz')
            os.remove('temp_file.json')
        except OSError:
            pass
            
    @unittest.skip("Don't test remote indexing in travis")
    def test_local_index_json_file(self):
        self.docs = self.rand_docs.get_docs(61)
        with open('temp_file.json','w') as f:
            json.dump(self.docs,f)
        r = self.solr.local_index(test_config['SOLR_COLLECTION'],'temp_file.json')
        self.commit()
        r = self.solr.query(test_config['SOLR_COLLECTION'],{'q':'*:*'})
        self.assertEqual(r.get_num_found(),len(self.docs))
        self.delete_docs()
        self.commit()
        try:
            os.remove('temp_file.json.gz')
            os.remove('temp_file.json')
        except OSError:
            pass

    def test_paging_query_with_rows(self):
        self.docs = self.rand_docs.get_docs(1000)
        with gzip.open('temp_file.json.gz','wb') as f:
            f.write(json.dumps(self.docs).encode('utf-8'))
        r = self.solr.stream_file(test_config['SOLR_COLLECTION'],'temp_file.json.gz')
        self.commit()
        queries = 0
        docs = []
        for res in self.solr.paging_query(test_config['SOLR_COLLECTION'],{'q':'*:*'}, rows=50):
            self.assertTrue(len(res.docs) == 50)
            docs.extend(res.docs)
            queries +=1
        self.assertEqual(
            [x['id'] for x in sorted(docs, key= lambda x: x['id'])],
            [x['id'] for x in sorted(self.docs, key= lambda x: x['id'])]
            )
        self.assertEqual(1000 // 50, queries)
        self.delete_docs()
        self.commit()
        try:
            os.remove('temp_file.json.gz')
            os.remove('temp_file.json')
        except OSError:
            pass

    def test_paging_query(self):
        self.docs = self.rand_docs.get_docs(1000)
        with gzip.open('temp_file.json.gz','wb') as f:
            f.write(json.dumps(self.docs).encode('utf-8'))
        r = self.solr.stream_file(test_config['SOLR_COLLECTION'],'temp_file.json.gz')
        self.commit()
        queries = 0
        docs = []
        for res in self.solr.paging_query(test_config['SOLR_COLLECTION'],{'q':'*:*'}):
            self.assertTrue(len(res.docs) == 1000)
            docs.extend(res.docs)
            queries +=1
        self.assertTrue(queries == 1)
        self.assertEqual(
            [x['id'] for x in sorted(docs, key= lambda x: x['id'])],
            [x['id'] for x in sorted(self.docs, key= lambda x: x['id'])]
            )
        self.delete_docs()
        self.commit()
        try:
            os.remove('temp_file.json.gz')
            os.remove('temp_file.json')
        except OSError:
            pass
            
    def test_paging_query_with_max(self):
        self.docs = self.rand_docs.get_docs(1000)
        with gzip.open('temp_file.json.gz','wb') as f:
            f.write(json.dumps(self.docs).encode('utf-8'))
        r = self.solr.stream_file(test_config['SOLR_COLLECTION'],'temp_file.json.gz')
        self.commit()
        queries = 0
        docs = []
        for res in self.solr.paging_query(test_config['SOLR_COLLECTION'], {'q':'*:*'}, rows = 50, max_start = 502):
            self.assertTrue(len(res.docs) == 50)
            queries +=1
            docs.extend(res.docs)
        ids = [x['id'] for x in self.docs]

        # Every returned doc should be one of the docs that were indexed
        for item in docs:
            self.assertTrue(item['id'] in ids)

        self.assertEqual(11, queries)
        self.delete_docs()
        self.commit()
        try:
            os.remove('temp_file.json.gz')
            os.remove('temp_file.json')
        except OSError:
            pass
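
Outside a test class, the paging helper works the same way. A minimal sketch, assuming a local Solr and an existing collection named 'mycollection':

from SolrClient import SolrClient

solr = SolrClient('http://localhost:8983/solr')
docs = []
# Page through results 100 at a time, stopping once start reaches 1000
for res in solr.paging_query('mycollection', {'q': '*:*'}, rows=100, max_start=1000):
    docs.extend(res.docs)
print(len(docs))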
Code Example #8
    def handle(self, *args, **options):

        total = 0
        cycle = 0

        try:
            # Retrieve the Search and Field models from the database
            solr = SolrClient(settings.SOLR_SERVER_URL)
            try:
                self.search_target = Search.objects.get(
                    search_id=options['search'])
                self.solr_core = self.search_target.solr_core_name
                self.all_fields = Field.objects.filter(
                    search_id=self.search_target)
                if options['nothing_to_report']:
                    self.search_fields = Field.objects.filter(
                        search_id=self.search_target,
                        alt_format='ALL') | Field.objects.filter(
                            search_id=self.search_target, alt_format='NTR')
                else:
                    self.search_fields = Field.objects.filter(
                        search_id=self.search_target,
                        alt_format='ALL') | Field.objects.filter(
                            search_id=self.search_target, alt_format='')
                for search_field in self.search_fields:
                    self.csv_fields[search_field.field_id] = search_field

                    codes = Code.objects.filter(field_id=search_field)
                    # Most csv_fields will not have codes, so the queryset will be zero length
                    if len(codes) > 0:
                        code_dict = {}
                        for code in codes:
                            code_dict[code.code_id.lower()] = code
                        self.field_codes[search_field.field_id] = code_dict

            except Search.DoesNotExist as x:
                self.logger.error('Search not found: "{0}"'.format(x))
                exit(-1)
            except Field.DoesNotExist as x1:
                self.logger.error(
                    'Fields not found for search: "{0}"'.format(x1))

            # Process the records in the CSV file one at a time
            with open(options['csv'],
                      'r',
                      encoding='utf-8-sig',
                      errors="ignore") as csv_file:
                csv_reader = csv.DictReader(csv_file, dialect='excel')
                solr_items = []
                for csv_record in csv_reader:

                    # Clear out the Solr core on the first record
                    if total == 0 and not options['nothing_to_report']:
                        solr.delete_doc_by_query(self.solr_core, "*:*")
                        print("Purging all records")
                    elif total == 0 and options['nothing_to_report']:
                        solr.delete_doc_by_query(self.solr_core, "format:NTR")
                        solr.commit(self.solr_core, softCommit=True)
                        print("Purging NTR records")
                    total += 1
                    cycle += 1

                    # Call plugins if they exist for this search type. This is where a developer can introduce
                    # code to customize the data that is loaded into Solr for a particular search.
                    search_type_plugin = 'search.plugins.{0}'.format(
                        options['search'])
                    if search_type_plugin in self.discovered_plugins:
                        include, filtered_record = self.discovered_plugins[
                            search_type_plugin].filter_csv_record(
                                csv_record, self.search_target,
                                self.csv_fields, self.field_codes,
                                'NTR' if options['nothing_to_report'] else '')
                        if not include:
                            continue
                        else:
                            csv_record = filtered_record
                    # Create a dictionary for each record loaded into Solr
                    solr_record = {
                        'format':
                        'NTR' if options['nothing_to_report'] else 'DEFAULT'
                    }
                    for csv_field in csv_reader.fieldnames:
                        # Verify that it is a known field
                        if csv_field not in self.csv_fields and csv_field not in (
                                'owner_org_title', 'owner_org'):
                            self.logger.error(
                                "CSV file contains unknown field: {0}".format(
                                    csv_field))
                            exit(-1)
                        if csv_field == 'owner_org_title':
                            continue

                        # Handle multi-valued fields here
                        if self.csv_fields[csv_field].solr_field_multivalued:
                            solr_record[csv_field] = csv_record[
                                csv_field].split(',')
                            # Copy fields for report cannot use multi-values, so directly populate with the original string
                            if self.csv_fields[csv_field].solr_field_export:
                                for extra_field in self.csv_fields[
                                        csv_field].solr_field_export.split(
                                            ','):
                                    solr_record[extra_field] = csv_record[
                                        csv_field]
                        else:
                            solr_record[csv_field] = csv_record[csv_field]

                        # Automatically expand out dates and numbers for use with Solr export handler
                        if self.csv_fields[
                                csv_field].solr_field_type == 'pdate':
                            try:
                                if csv_record[csv_field]:
                                    csv_date = datetime.strptime(
                                        csv_record[csv_field], '%Y-%m-%d')
                                    solr_record[csv_field +
                                                '_en'] = format_date(
                                                    csv_date, locale='en')
                                    solr_record[csv_field +
                                                '_fr'] = format_date(
                                                    csv_date, locale='fr')
                                    if self.csv_fields[
                                            csv_field].is_default_year:
                                        solr_record['year'] = csv_date.year
                                    if self.csv_fields[
                                            csv_field].is_default_month:
                                        solr_record['month'] = csv_date.month
                                else:
                                    solr_record[csv_field + '_en'] = ''
                                    solr_record[csv_field + '_fr'] = ''
                            except ValueError as x2:
                                self.logger.error(
                                    'Invalid date: "{0}"'.format(x2))
                                solr_record[csv_field] = ''
                                continue
                        elif self.csv_fields[csv_field].solr_field_type in [
                                'pint', 'pfloat'
                        ]:
                            if solr_record[csv_field]:
                                if solr_record[csv_field] == '.':
                                    solr_record[csv_field] = "0"
                                csv_decimal = parse_decimal(
                                    solr_record[csv_field], locale='en_US')
                                if self.csv_fields[
                                        csv_field].solr_field_is_currency:
                                    solr_record[csv_field +
                                                '_en'] = format_currency(
                                                    csv_decimal,
                                                    'CAD',
                                                    locale='en_CA')
                                    solr_record[csv_field +
                                                '_fr'] = format_currency(
                                                    csv_decimal,
                                                    'CAD',
                                                    locale='fr_CA')
                                else:
                                    solr_record[csv_field +
                                                '_en'] = format_decimal(
                                                    csv_decimal,
                                                    locale='en_CA')
                                    solr_record[csv_field +
                                                '_fr'] = format_decimal(
                                                    csv_decimal,
                                                    locale='fr_CA')
                            else:
                                solr_record[csv_field + '_en'] = ''
                                solr_record[csv_field + '_fr'] = ''

                        # Look up the expanded code value from the codes dict of dicts
                        if csv_field in self.field_codes:
                            if csv_record[csv_field]:

                                if self.csv_fields[
                                        csv_field].solr_field_multivalued:
                                    codes_en = []
                                    codes_fr = []
                                    for code_value in csv_record[
                                            csv_field].split(","):
                                        if code_value.lower(
                                        ) in self.field_codes[csv_field]:
                                            codes_en.append(
                                                self.field_codes[csv_field]
                                                [code_value.lower()].label_en)
                                            codes_fr.append(
                                                self.field_codes[csv_field]
                                                [code_value.lower()].label_fr)
                                        else:
                                            self.logger.info(
                                                "Unknown code value: {0} for field: {1}"
                                                .format(code_value, csv_field))
                                    solr_record[csv_field + '_en'] = codes_en
                                    solr_record[csv_field + '_fr'] = codes_fr
                                else:
                                    if csv_record[csv_field].lower(
                                    ) in self.field_codes[csv_field]:
                                        solr_record[csv_field +
                                                    '_en'] = self.field_codes[
                                                        csv_field][csv_record[
                                                            csv_field].lower(
                                                            )].label_en
                                        solr_record[csv_field +
                                                    '_fr'] = self.field_codes[
                                                        csv_field][csv_record[
                                                            csv_field].lower(
                                                            )].label_fr
                                    else:
                                        self.logger.info(
                                            "Unknown code value: {0} for field: {1}"
                                            .format(csv_record[csv_field],
                                                    csv_field))
                    solr_record = self.set_empty_fields(solr_record)
                    # Set the Solr ID field (Nothing To Report records get an org/date-based ID instead)
                    if not options['nothing_to_report']:
                        if self.search_target.id_fields:
                            id_values = []
                            for id_field in self.search_target.id_fields.split(
                                    ","):
                                id_values.append(csv_record[id_field])
                            solr_record['id'] = ",".join(id_values)
                    else:

                        if 'month' in solr_record:
                            solr_record['id'] = "{0}-{1}-{2}".format(
                                solr_record['owner_org'], solr_record['year'],
                                solr_record['month'])
                        elif 'quarter' in solr_record:
                            solr_record['id'] = "{0}-{1}-{2}".format(
                                solr_record['owner_org'], solr_record['year'],
                                solr_record['quarter'])

                    # Call plugins if they exist for this search type. This is where a developer can introduce
                    # code to customize the data that is loaded into Solr for a particular search.
                    if search_type_plugin in self.discovered_plugins:
                        solr_record = self.discovered_plugins[
                            search_type_plugin].load_csv_record(
                                csv_record, solr_record, self.search_target,
                                self.csv_fields, self.field_codes,
                                'NTR' if options['nothing_to_report'] else '')

                    solr_items.append(solr_record)

                    # Write to Solr whenever the cycle threshold is reached
                    if cycle >= self.cycle_on:
                        # try to connect to Solr up to 10 times
                        for countdown in reversed(range(10)):
                            try:
                                solr.index(self.solr_core, solr_items)
                                print("{0} rows processed".format(total))
                                cycle = 0
                                solr_items.clear()
                                break
                            except ConnectionError as cex:
                                if not countdown:
                                    raise
                                print(
                                    "Solr error: {0}. Waiting to try again ... {1}"
                                    .format(cex, countdown))
                                time.sleep((10 - countdown) * 5)

                # Write any remaining records to Solr and commit
                if cycle > 0:
                    # try to connect to Solr up to 10 times
                    for countdown in reversed(range(10)):
                        try:
                            solr.index(self.solr_core, solr_items)
                            total += len(solr_items)
                            print("{0} rows processed".format(cycle))
                            cycle = 0
                            solr_items.clear()
                            break
                        except ConnectionError as cex:
                            if not countdown:
                                raise
                            print(
                                "Solr error: {0}. Waiting to try again ... {1}"
                                .format(cex, countdown))
                            time.sleep((10 - countdown) * 5)

                solr.commit(self.solr_core, softCommit=True, waitSearcher=True)
                print("Total rows processed: {0}".format(total))

        except Exception as x:
            self.logger.error('Unexpected Error "{0}"'.format(x))
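
The _en/_fr companion fields in the loader above come from Babel's locale-aware formatters. A minimal standalone sketch of that localization step, using the babel.dates and babel.numbers functions the loader already relies on (the sample values here are invented):

from datetime import datetime
from babel.dates import format_date
from babel.numbers import format_currency, format_decimal, parse_decimal

csv_date = datetime.strptime('2021-03-31', '%Y-%m-%d')
print(format_date(csv_date, locale='en'))  # e.g. "Mar 31, 2021"
print(format_date(csv_date, locale='fr'))  # e.g. "31 mars 2021"

# parse with the US-English decimal format, then render per locale
amount = parse_decimal('1234.50', locale='en_US')
print(format_currency(amount, 'CAD', locale='en_CA'))  # e.g. "$1,234.50"
print(format_currency(amount, 'CAD', locale='fr_CA'))  # e.g. "1 234,50 $"
print(format_decimal(amount, locale='fr_CA'))          # e.g. "1 234,5"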
Code example #9
import os
import json
import requests

from SolrClient import SolrClient

CC_LINKS_FILES_DIRECTORIES = []
SOLR_INSTANCE_URL = ""
SOLR_CORE = ""

solr_client = SolrClient(SOLR_INSTANCE_URL)


def get_url_content(url):
    user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36'
    headers = {'User-Agent': user_agent}
    resp = requests.get(url, headers=headers)
    return resp.text


for directory in CC_LINKS_FILES_DIRECTORIES:
    # os.path.isfile() needs the full path, not just the bare file name
    files = [f for f in os.listdir(directory)
             if os.path.isfile(os.path.join(directory, f))]
    for file in files:
        docs = []
        with open(os.path.join(directory, file), 'r') as f:
            for line in f:
                json_obj = json.loads(line)  # json.loads() no longer takes an encoding argument on Python 3
                url = json_obj["url"]
                text = get_url_content(url)
                docs.append({"file_name": file, "html": text})
            solr_client.index(SOLR_CORE, docs)
            solr_client.commit(SOLR_CORE, openSearcher=True)
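
The crawl loop above fetches every URL with no timeout and aborts the whole run on the first malformed line. A slightly hardened variant of the per-file work, as a sketch under the same assumptions (solr_client, SOLR_CORE, and get_url_content are the names defined above; a timeout could also be passed to requests.get inside get_url_content):

def index_links_file(directory, file_name):
    docs = []
    with open(os.path.join(directory, file_name), 'r') as f:
        for line in f:
            try:
                json_obj = json.loads(line)
                text = get_url_content(json_obj["url"])
            except (ValueError, requests.RequestException) as e:
                # skip malformed JSON lines and unreachable URLs instead of aborting
                print("Skipping line in {0}: {1}".format(file_name, e))
                continue
            docs.append({"file_name": file_name, "html": text})
    if docs:
        solr_client.index(SOLR_CORE, docs)
        solr_client.commit(SOLR_CORE, openSearcher=True)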
Code example #10
class SOLRDocumentManager(IDocumentManager):
    def __init__(self, server_address: str, index_name: str) -> None:
        self.client = SolrClient(server_address)
        self.index = index_name
        self._serializer = json_serializer()

    def add(self, document: IndexDocument) -> BaseResponse:
        response = BaseResponse()
        try:
            document.id = document.unique_id
            doc_body = self._serializer.serialize([document])
            solr_response = self.client.index_json(self.index, doc_body)
            if not solr_response:
                return response.set_error(
                    Error("IntegrationError", 500,
                          "Failed to add document to the index!"))
            self.client.commit(self.index,
                               openSearcher=True,
                               waitSearcher=False)
            response = BaseResponse(True)
        except BasicException as e:
            response.set_error(Error("InternalServerError", 500, e.message))
        except Exception as e:
            response.set_error(
                Error("InternalServerError", 500, 'Unknown error occurred!'))
        return response

    def delete(self, unique_id: str) -> BaseResponse:
        response = BaseResponse()
        try:
            solr_response = self.client.delete_doc_by_id(self.index, unique_id)
            if not solr_response:
                return response.set_error(
                    Error("IntegrationError", 500,
                          "Failed to delete document from the index!"))
            self.client.commit(self.index,
                               openSearcher=True,
                               waitSearcher=False)
            response = BaseResponse(True)
        except BasicException as e:
            response.set_error(Error("InternalServerError", 500, e.message))
        except Exception as e:
            print(e)
            response.set_error(
                Error("InternalServerError", 500, 'Unknown error occurred!'))
        return response

    def get(self, unique_id: str) -> DocumentResponse:
        pass

    def search(self, query: SearchQuery) -> SearchResult:
        solr_query = ""
        solr_field_query = ""
        solr_range_query = []
        for criteria in query.searchCriteria:
            solr_field_query += criteria.field + '^' + str(
                criteria.weight) + " "
            words = criteria.term.split(" ")
            for word in words:
                word = word.lower()
                solr_query += " " + word
        for range_criteria in query.rangeCriteria:
            solr_range_query.append(range_criteria.field + ":[" +
                                    str(range_criteria.minimum) + " TO " +
                                    str(range_criteria.maximum) + "]")
        data = {
            "q": solr_query.strip(),
            "offset": query.page * query.items,
            "limit": query.items,
            "filter": solr_range_query,
            "defType": "edismax",
            "qf": solr_field_query
        }
        result = SearchResult(0, False)
        try:
            response = self.client.query_raw(self.index, data)
            result = SearchResult(response['response']['numFound'], True)
            for document in response['response']['docs']:
                # index_object_type is assumed to be set elsewhere on this
                # class; it is not defined in the snippet shown here
                result.add_result(
                    self._serializer.deserialize(document,
                                                 self.index_object_type))
        except Exception as e:
            result.set_error(
                Error("InternalServerError", 500, 'Unknown error occurred!'))
        return result
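
A usage sketch makes the query construction in search() easier to follow. The criteria classes below are hypothetical stand-ins; the snippet only shows that searchCriteria entries carry field/weight/term and rangeCriteria entries carry field/minimum/maximum:

from dataclasses import dataclass
from typing import List

@dataclass
class SearchCriteria:  # hypothetical stand-in for the real criteria type
    field: str
    weight: float
    term: str

@dataclass
class RangeCriteria:  # hypothetical stand-in
    field: str
    minimum: int
    maximum: int

@dataclass
class SearchQuery:  # hypothetical stand-in
    searchCriteria: List[SearchCriteria]
    rangeCriteria: List[RangeCriteria]
    page: int = 0
    items: int = 10

manager = SOLRDocumentManager("http://localhost:8983/solr", "products")  # placeholder server/core
query = SearchQuery(
    searchCriteria=[SearchCriteria("title", 2.0, "Solar Panel")],
    rangeCriteria=[RangeCriteria("price", 10, 500)])
# search() builds: q="solar panel", qf="title^2.0 ", filter=["price:[10 TO 500]"]
result = manager.search(query)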
Code example #11
File: test_resp.py Project: cyberj0g/SolrClient
class ClientTestQuery(unittest.TestCase):
    
    @classmethod
    def setUpClass(self):
        self.solr = SolrClient(test_config['SOLR_SERVER'][0],devel=True,auth=test_config['SOLR_CREDENTIALS'])
        self.rand_docs = RandomTestData()
        self.docs = self.rand_docs.get_docs(50)
        self.solr.delete_doc_by_id(test_config['SOLR_COLLECTION'],'*')
        
        for field in test_config['collections']['copy_fields']:
            try:
                self.solr.collections.delete_copy_field(test_config['SOLR_COLLECTION'],field)
            except:
                pass
        for field in test_config['collections']['fields']:
            try:
                self.solr.collections.create_field(test_config['SOLR_COLLECTION'],field)
            except:
                pass
        
        # Index some data
        self.solr.index_json(test_config['SOLR_COLLECTION'],json.dumps(self.docs))
        self.solr.commit(test_config['SOLR_COLLECTION'],openSearcher=True)
    
    def test_basic_query(self):
        r = self.solr.query(test_config['SOLR_COLLECTION'],{'q':'*:*'})
        self.assertEqual(r.get_num_found(),len(self.docs))
        
    def test_facet(self):
        r = self.solr.query(test_config['SOLR_COLLECTION'],{
            'q':'*:*',
            'facet':'true',
            'facet.field':'facet_test',
        })
        local_facets = {}
        for doc in self.docs:
            try:
                local_facets[doc['facet_test']] +=1
            except:
                local_facets[doc['facet_test']] = 1
        try:
            self.assertDictEqual(local_facets,r.get_facets()['facet_test'])
        except Exception as e:
            logging.info("local")
            logging.info(local_facets)
            logging.info("facets")
            logging.info(r.get_facets())
            raise
    
    def test_facet_with_fq(self):
        r = self.solr.query(test_config['SOLR_COLLECTION'],{
            'q':'*:*',
            'facet':True,
            'facet.field':'facet_test',
        })
        first_facet_field = list(r.get_facets()['facet_test'].keys())[0]
        first_facet_field_count = r.get_facets()['facet_test'][first_facet_field]
        r = self.solr.query(test_config['SOLR_COLLECTION'],{
            'q':'*:*',
            'facet':True,
            'facet.field':'facet_test',
            'fq':'facet_test:{}'.format(first_facet_field)
        })
        self.assertEqual(r.get_num_found(),first_facet_field_count)
        
    def test_facet_range(self):
        res = self.solr.query(test_config['SOLR_COLLECTION'],{
            'q':'*:*',
            'facet':True,
            'facet.range':'price',
            'facet.range.start':0,
            'facet.range.end':100,
            'facet.range.gap':10
            })
        
        prices = [doc['price'] for doc in self.docs]
        div = lambda x: str(x//10 * 10)
        out = {}
        for k,g in itertools.groupby(sorted(prices),div):
            out[k] = len(list(g)) or 0
        self.assertDictEqual(out,res.get_facets_ranges()['price'])
    
    def test_facet_pivot(self):
        res = self.solr.query(test_config['SOLR_COLLECTION'],{
            'q':'*:*',
            'facet':True,
            'facet.pivot':['facet_test,price','facet_test,id']
        })
        out = {}
        for doc in self.docs:
            if doc['facet_test'] not in out:
                out[doc['facet_test']] = {}
            if doc['price'] not in out[doc['facet_test']]:
                out[doc['facet_test']][doc['price']]=1
            else:
                out[doc['facet_test']][doc['price']]+=1
        self.assertDictEqual(out,res.get_facet_pivot()['facet_test,price'])
        
    def test_get_field_values_as_list(self):
        res = self.solr.query(test_config['SOLR_COLLECTION'],{
            'q':'*:*',
            })
        results = res.get_field_values_as_list('product_name_exact')
        docs = res.docs
        temp = []
        for doc in docs:
            if 'product_name_exact' in doc:
                temp.append(doc['product_name_exact'])
        self.assertEqual(results,temp)
        
    def test_get_facet_values_as_list(self):
        r = self.solr.query(test_config['SOLR_COLLECTION'],{
            'q':'*:*',
            'facet':'true',
            'facet.field':'facet_test',
        })
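
For reference, test_facet_pivot above relies on get_facet_pivot() flattening Solr's pivot response into nested dicts. A minimal illustration, assuming a SolrClient instance named solr and the same test collection (the counts in the comment are invented):

r = solr.query('SolrClient_unittest', {
    'q': '*:*',
    'facet': True,
    'facet.pivot': 'facet_test,price',
})
# get_facet_pivot() returns {'facet_test,price': {facet_value: {price: count}}}
pivots = r.get_facet_pivot()['facet_test,price']
for facet_value, price_counts in pivots.items():
    print(facet_value, sum(price_counts.values()))  # e.g. "red 17"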
Code example #12
File: test_client.py Project: tdhaval/SolrClient
class ClientTestIndexing(unittest.TestCase):
    @classmethod
    def setUpClass(self):

        self.solr = SolrClient(test_config['SOLR_SERVER'][0],
                               devel=True,
                               auth=test_config['SOLR_CREDENTIALS'])
        self.rand_docs = RandomTestData()
        self.docs = self.rand_docs.get_docs(50)

        for field in test_config['collections']['copy_fields']:
            try:
                self.solr.schema.delete_copy_field(
                    test_config['SOLR_COLLECTION'], field)
            except Exception as e:
                pass

        for field in test_config['collections']['fields']:
            try:
                self.solr.schema.create_field(test_config['SOLR_COLLECTION'],
                                              field)
            except Exception as e:
                pass

    def setUp(self):
        self.delete_docs()
        self.commit()

    def delete_docs(self):
        self.solr.delete_doc_by_id(test_config['SOLR_COLLECTION'], '*')
        self.commit()

    def commit(self):
        # softCommit because we don't care about data on disk
        self.solr.commit(test_config['SOLR_COLLECTION'],
                         openSearcher=True,
                         softCommit=True)

    def test_down_solr_exception(self):
        # connect to a "down" Solr host; the URL is masked in the source and
        # the assertion below is a reconstruction based on the sibling test
        s = SolrClient('http://*****:*****')
        with self.assertRaises(ConnectionError):
            s.query('SolrClient_unittest', {'q': 'not_gonna_happen'})

    @unittest.skip("Skipping for now")
    def test_access_without_auth(self):
        if not test_config['SOLR_CREDENTIALS'][0]:
            return
        solr = SolrClient(test_config['SOLR_SERVER'], devel=True)
        with self.assertRaises(ConnectionError) as cm:
            solr.query('SolrClient_unittest', {'q': 'not_gonna_happen'})

    def test_indexing_json(self):
        self.docs = self.rand_docs.get_docs(53)
        self.solr.index_json(test_config['SOLR_COLLECTION'],
                             json.dumps(self.docs))
        self.commit()
        for doc in self.docs:
            logging.debug("Checking {}".format(doc['id']))
            self.assertEqual(
                self.solr.query(test_config['SOLR_COLLECTION'], {
                    'q': 'id:{}'.format(doc['id'])
                }).get_num_found(), 1)
        self.delete_docs()
        self.commit()

    def test_get(self):
        doc_id = '1'
        self.solr.index_json(test_config['SOLR_COLLECTION'],
                             json.dumps([{
                                 'id': doc_id
                             }]))
        # this should return the doc without raising
        self.solr.get(test_config['SOLR_COLLECTION'], doc_id)
        with self.assertRaises(NotFoundError):
            self.solr.get(test_config['SOLR_COLLECTION'], '5')

    def test_mget(self):
        self.solr.index_json(test_config['SOLR_COLLECTION'],
                             json.dumps([{
                                 'id': '1'
                             }]))
        self.solr.index_json(test_config['SOLR_COLLECTION'],
                             json.dumps([{
                                 'id': '5'
                             }]))
        docs = self.solr.mget(test_config['SOLR_COLLECTION'], ('5', '1'))
        self.assertEqual(len(docs), 2)

    def test_indexing_conn_log(self):
        self.docs = self.rand_docs.get_docs(53)
        self.solr.index_json(test_config['SOLR_COLLECTION'],
                             json.dumps(self.docs))
        self.commit()
        for doc in self.docs:
            logging.debug("Checking {}".format(doc['id']))
            self.assertEqual(
                self.solr.query(test_config['SOLR_COLLECTION'], {
                    'q': 'id:{}'.format(doc['id'])
                }).get_num_found(), 1)
        logging.info(self.solr.transport._action_log)
        self.delete_docs()
        self.commit()

    def test_index_json_file(self):
        self.docs = self.rand_docs.get_docs(55)
        with open('temp_file.json', 'w') as f:
            json.dump(self.docs, f)
        r = self.solr.stream_file(test_config['SOLR_COLLECTION'],
                                  'temp_file.json')
        self.commit()
        r = self.solr.query(test_config['SOLR_COLLECTION'], {'q': '*:*'})
        self.assertEqual(r.get_num_found(), len(self.docs))
        self.delete_docs()
        self.commit()
        try:
            os.remove('temp_file.json.gz')
            os.remove('temp_file.json')
        except:
            pass

    def test_stream_file_gzip_file(self):
        self.docs = self.rand_docs.get_docs(60)
        with gzip.open('temp_file.json.gz', 'wb') as f:
            f.write(json.dumps(self.docs).encode('utf-8'))
        r = self.solr.stream_file(test_config['SOLR_COLLECTION'],
                                  'temp_file.json.gz')
        self.commit()
        r = self.solr.query(test_config['SOLR_COLLECTION'], {'q': '*:*'})
        self.assertEqual(r.get_num_found(), len(self.docs))
        self.delete_docs()
        self.commit()
        try:
            os.remove('temp_file.json.gz')
            os.remove('temp_file.json')
        except:
            pass

    @unittest.skip("Don't test remote indexing in travis")
    def test_local_index_json_file(self):  # distinct name so it does not shadow test_index_json_file above
        self.docs = self.rand_docs.get_docs(61)
        with open('temp_file.json', 'w') as f:
            json.dump(self.docs, f)
        r = self.solr.local_index(test_config['SOLR_COLLECTION'],
                                  'temp_file.json')
        self.commit()
        r = self.solr.query(test_config['SOLR_COLLECTION'], {'q': '*:*'})
        self.assertEqual(r.get_num_found(), len(self.docs))
        self.delete_docs()
        self.commit()
        try:
            os.remove('temp_file.json.gz')
            os.remove('temp_file.json')
        except:
            pass

    def test_paging_query_with_rows(self):
        self.docs = self.rand_docs.get_docs(1000)
        with gzip.open('temp_file.json.gz', 'wb') as f:
            f.write(json.dumps(self.docs).encode('utf-8'))
        r = self.solr.stream_file(test_config['SOLR_COLLECTION'],
                                  'temp_file.json.gz')
        self.commit()
        queries = 0
        docs = []
        for res in self.solr.paging_query(test_config['SOLR_COLLECTION'],
                                          {'q': '*:*'},
                                          rows=50):
            self.assertTrue(len(res.docs) == 50)
            docs.extend(res.docs)
            queries += 1
        self.assertEqual(
            [x['id'] for x in sorted(docs, key=lambda x: x['id'])],
            [x['id'] for x in sorted(self.docs, key=lambda x: x['id'])])
        self.assertTrue(1000 / 50 == queries)
        self.delete_docs()
        self.commit()
        try:
            os.remove('temp_file.json.gz')
            os.remove('temp_file.json')
        except:
            pass

    def test_paging_query(self):
        self.docs = self.rand_docs.get_docs(1000)
        with gzip.open('temp_file.json.gz', 'wb') as f:
            f.write(json.dumps(self.docs).encode('utf-8'))
        r = self.solr.stream_file(test_config['SOLR_COLLECTION'],
                                  'temp_file.json.gz')
        self.commit()
        queries = 0
        docs = []
        for res in self.solr.paging_query(test_config['SOLR_COLLECTION'],
                                          {'q': '*:*'}):
            self.assertTrue(len(res.docs) == 1000)
            docs.extend(res.docs)
            queries += 1
        self.assertTrue(queries == 1)
        self.assertEqual(
            [x['id'] for x in sorted(docs, key=lambda x: x['id'])],
            [x['id'] for x in sorted(self.docs, key=lambda x: x['id'])])
        self.delete_docs()
        self.commit()
        try:
            os.remove('temp_file.json.gz')
            os.remove('temp_file.json')
        except:
            pass

    def test_paging_query_with_max(self):
        self.docs = self.rand_docs.get_docs(1000)
        with gzip.open('temp_file.json.gz', 'wb') as f:
            f.write(json.dumps(self.docs).encode('utf-8'))
        r = self.solr.stream_file(test_config['SOLR_COLLECTION'],
                                  'temp_file.json.gz')
        self.commit()
        queries = 0
        docs = []
        for res in self.solr.paging_query(test_config['SOLR_COLLECTION'],
                                          {'q': '*:*'},
                                          rows=50,
                                          max_start=502):
            self.assertTrue(len(res.docs) == 50)
            queries += 1
            docs.extend(res.docs)
        ids = [x['id'] for x in docs]

        for item in docs:
            self.assertTrue(item['id'] in ids)

        self.assertEqual(11, queries)
        self.delete_docs()
        self.commit()
        try:
            os.remove('temp_file.json.gz')
            os.remove('temp_file.json')
        except:
            pass

    def test_cursor_query(self):
        self.docs = self.rand_docs.get_docs(2000)
        with gzip.open('temp_file.json.gz', 'wb') as f:
            f.write(json.dumps(self.docs).encode('utf-8'))
        r = self.solr.stream_file(test_config['SOLR_COLLECTION'],
                                  'temp_file.json.gz')
        self.commit()
        queries = 0
        docs = []

        for res in self.solr.cursor_query(test_config['SOLR_COLLECTION'], {
                'q': '*:*',
                'rows': 100
        }):
            self.assertTrue(len(res.docs) == 100)
            queries += 1
            docs.extend(res.docs)

        ids = [x['id'] for x in docs]

        for item in docs:
            self.assertTrue(item['id'] in ids)

        self.delete_docs()
        self.commit()
        try:
            os.remove('temp_file.json.gz')
            os.remove('temp_file.json')
        except:
            pass
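
Outside the test harness, the difference the paging and cursor tests above exercise is simply this: paging_query walks results with start/rows (optionally capped by max_start), while cursor_query streams them with Solr's cursorMark, which holds up better for deep result sets. A bare-bones sketch, with placeholder server URL and collection name:

from SolrClient import SolrClient

solr = SolrClient('http://localhost:8983/solr')  # placeholder server

# start/rows pagination, stopping after the first 1000 rows
for res in solr.paging_query('SolrClient_unittest', {'q': '*:*'},
                             rows=100, max_start=1000):
    for doc in res.docs:
        pass  # process each document

# cursorMark pagination for deep result sets
for res in solr.cursor_query('SolrClient_unittest', {'q': '*:*', 'rows': 100}):
    for doc in res.docs:
        pass  # process each document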