# Assumed imports for this snippet (not shown in the original)
from SolrClient import SolrClient


def search(query_dict):
    # Instantiate the Solr connection
    solr = SolrClient('http://localhost:8983/solr')

    # Generic match-all search if no query input is given
    if len(query_dict) == 0:
        query_string = '*:*'
    else:
        # Build one field:"value" clause per non-empty field and OR them
        # together. Multi-word lyrics queries also get one field:word clause
        # per word, so partial matches are still returned.
        clauses = []
        for key, value in query_dict.items():
            if len(value) > 0:
                clauses.append('%s:"%s"' % (key, value))
                if key == 'lyrics' and len(value.split()) > 1:
                    for word in value.split():
                        clauses.append('%s:%s' % (key, word))
        # Fall back to match-all if every value was empty
        query_string = ' OR '.join(clauses) or '*:*'

    # Page through the results, 200 rows per request, up to start=1000
    docs_list = []
    for res in solr.paging_query('lyrics', {'q': query_string},
                                 rows=200, start=0, max_start=1000):
        for doc in res.data['response']['docs']:
            docs_list.append(doc)
    return docs_list
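
For illustration, a minimal usage sketch; the 'artist' field name here is an assumption, while 'lyrics' comes from the code above:

docs = search({'artist': 'Adele', 'lyrics': 'hello from the other side'})
print(len(docs), 'matching documents')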
Example 2
# Assumed imports and Solr connection for this snippet (not shown in the
# original); json_file_out and json_file_name are defined elsewhere, and the
# analyzer import is VADER's (assumption).
import json
from SolrClient import SolrClient
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

solr = SolrClient('http://localhost:8983/solr')

# Open the output file
f = open(json_file_out, 'w')

analyzer = SentimentIntensityAnalyzer()

with open(json_file_name) as json_data:
    # Load the JSON file
    file_jsons = json.load(json_data)
    for item in file_jsons:
        # Extract the food item name
        food_name = item['foodItem'].lower()
        data = '"' + food_name + '"'
        text_data = '_text_:%s' % data
        # Query the Solr index
        for res in solr.paging_query('review_core', {'q': text_data},
                                     rows=10000):
            # Get all the values of the review record
            if res.get_results_count() > 0:
                print(food_name, res.get_results_count())

                json_doc = json.loads(res.get_json())
                output = json_doc['response']['docs']
                for review_recd in output:
                    review_text = review_recd['text'][0]
                    business_id = review_recd['business_id'][0]
                    review_id = review_recd['review_id'][0]
                    stars = review_recd['stars'][0]
                    timestamp_str = review_recd['date'][0]
                    useful = review_recd['useful'][0]
                    funny = review_recd['funny'][0]
                    try:
                        # The original snippet is truncated here. A plausible
                        # completion (assumption): score the review with VADER
                        # and write one JSON record per review to the output.
                        scores = analyzer.polarity_scores(review_text)
                        f.write(json.dumps({
                            'food_name': food_name,
                            'business_id': business_id,
                            'review_id': review_id,
                            'stars': stars,
                            'date': timestamp_str,
                            'useful': useful,
                            'funny': funny,
                            'sentiment': scores['compound'],
                        }) + '\n')
                    except Exception:
                        # skip reviews that fail to score or serialize
                        continue

f.close()
Example 3
# Assumed imports for this snippet (not shown in the original); the exception
# and helper import paths are assumptions.
import gzip
import json
import logging
import os
import unittest
from time import sleep

from SolrClient import SolrClient
from SolrClient.exceptions import ConnectionError, NotFoundError
from .test_config import test_config          # test-suite helper (assumed path)
from .RandomTestData import RandomTestData    # test-suite helper (assumed path)


class ClientTestIndexing(unittest.TestCase):
    # High-level client tests

    @classmethod
    def setUpClass(cls):
        cls.solr = SolrClient(test_config['SOLR_SERVER'][0], devel=True, auth=test_config['SOLR_CREDENTIALS'])
        cls.rand_docs = RandomTestData()
        cls.docs = cls.rand_docs.get_docs(50)

        # Reset the collection schema, ignoring fields that are already
        # deleted or already present
        for field in test_config['collections']['copy_fields']:
            try:
                cls.solr.schema.delete_copy_field(test_config['SOLR_COLLECTION'], field)
            except Exception:
                pass
        for field in test_config['collections']['fields']:
            try:
                cls.solr.schema.create_field(test_config['SOLR_COLLECTION'], field)
            except Exception:
                pass
                
    def setUp(self):
        self.delete_docs()
        self.commit()
    
    def delete_docs(self):
        self.solr.delete_doc_by_id(test_config['SOLR_COLLECTION'],'*')
        self.commit()
        
    def commit(self):
        self.solr.commit(test_config['SOLR_COLLECTION'],openSearcher=True)
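        # give Solr a moment to finish opening the new searcher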
        sleep(5)
    
    @unittest.skip("Skipping for now")
    def test_access_without_auth(self):
        if not test_config['SOLR_CREDENTIALS'][0]:
            return
        solr = SolrClient(test_config['SOLR_SERVER'],devel=True)
        with self.assertRaises(ConnectionError):
            solr.query('SolrClient_unittest',{'q':'not_gonna_happen'})
            
    
    def test_indexing_json(self):
        self.docs = self.rand_docs.get_docs(53)
        self.solr.index_json(test_config['SOLR_COLLECTION'],json.dumps(self.docs))
        self.commit()
        sleep(5)
        for doc in self.docs:
            logging.debug("Checking {}".format(doc['id']))
            self.assertEqual(self.solr.query(test_config['SOLR_COLLECTION'],{'q':'id:{}'.format(doc['id'])}).get_num_found(),1)
        self.delete_docs()
        self.commit()
    
    def test_indexing_conn_log(self):
        self.docs = self.rand_docs.get_docs(53)
        self.solr.index_json(test_config['SOLR_COLLECTION'],json.dumps(self.docs))
        self.commit()
        sleep(5)
        for doc in self.docs:
            logging.debug("Checking {}".format(doc['id']))
            self.assertEqual(self.solr.query(test_config['SOLR_COLLECTION'],{'q':'id:{}'.format(doc['id'])}).get_num_found(),1)
        logging.info(self.solr.transport._action_log)
        self.delete_docs()
        self.commit()
    
    def test_index_json_file(self):
        self.docs = self.rand_docs.get_docs(55)
        with open('temp_file.json','w') as f:
            json.dump(self.docs,f)
        r = self.solr.stream_file(test_config['SOLR_COLLECTION'],'temp_file.json')
        self.commit()
        r = self.solr.query(test_config['SOLR_COLLECTION'],{'q':'*:*'})
        self.assertEqual(r.get_num_found(),len(self.docs))
        self.delete_docs()
        self.commit()
        try:
            os.remove('temp_file.json.gz')
            os.remove('temp_file.json')
        except OSError:
            pass
            
    
    def test_stream_file_gzip_file(self):
        self.docs = self.rand_docs.get_docs(60)
        with gzip.open('temp_file.json.gz','wb') as f:
            f.write(json.dumps(self.docs).encode('utf-8'))
        r = self.solr.stream_file(test_config['SOLR_COLLECTION'],'temp_file.json.gz')
        self.commit()
        r = self.solr.query(test_config['SOLR_COLLECTION'],{'q':'*:*'})
        self.assertEqual(r.get_num_found(),len(self.docs))
        self.delete_docs()
        self.commit()
        try:
            os.remove('temp_file.json.gz')
            os.remove('temp_file.json')
        except OSError:
            pass
            
    @unittest.skip("Don't test remote indexing in travis")
    def test_index_json_file(self):
        self.docs = self.rand_docs.get_docs(61)
        with open('temp_file.json','w') as f:
            json.dump(self.docs,f)
        r = self.solr.local_index(test_config['SOLR_COLLECTION'],'temp_file.json')
        self.commit()
        r = self.solr.query(test_config['SOLR_COLLECTION'],{'q':'*:*'})
        self.assertEqual(r.get_num_found(),len(self.docs))
        self.delete_docs()
        self.commit()
        try:
            os.remove('temp_file.json.gz')
            os.remove('temp_file.json')
        except OSError:
            pass

    def test_paging_query_with_rows(self):
        self.docs = self.rand_docs.get_docs(1000)
        with gzip.open('temp_file.json.gz','wb') as f:
            f.write(json.dumps(self.docs).encode('utf-8'))
        r = self.solr.stream_file(test_config['SOLR_COLLECTION'],'temp_file.json.gz')
        self.commit()
        queries = 0
        docs = []
        for res in self.solr.paging_query(test_config['SOLR_COLLECTION'],{'q':'*:*'}, rows=50):
            self.assertTrue(len(res.docs) == 50)
            docs.extend(res.docs)
            queries +=1
        self.assertEqual(
            [x['id'] for x in sorted(docs, key= lambda x: x['id'])],
            [x['id'] for x in sorted(self.docs, key= lambda x: x['id'])]
            )
        self.assertEqual(1000 // 50, queries)
        self.delete_docs()
        self.commit()
        try:
            os.remove('temp_file.json.gz')
            os.remove('temp_file.json')
        except OSError:
            pass

    def test_paging_query(self):
        self.docs = self.rand_docs.get_docs(1000)
        with gzip.open('temp_file.json.gz','wb') as f:
            f.write(json.dumps(self.docs).encode('utf-8'))
        r = self.solr.stream_file(test_config['SOLR_COLLECTION'],'temp_file.json.gz')
        self.commit()
        queries = 0
        docs = []
        for res in self.solr.paging_query(test_config['SOLR_COLLECTION'],{'q':'*:*'}):
            self.assertTrue(len(res.docs) == 1000)
            docs.extend(res.docs)
            queries +=1
        self.assertTrue(queries == 1)
        self.assertEqual(
            [x['id'] for x in sorted(docs, key= lambda x: x['id'])],
            [x['id'] for x in sorted(self.docs, key= lambda x: x['id'])]
            )
        self.delete_docs()
        self.commit()
        try:
            os.remove('temp_file.json.gz')
            os.remove('temp_file.json')
        except:
            pass              
            
    def test_paging_query_with_max(self):
        self.docs = self.rand_docs.get_docs(1000)
        with gzip.open('temp_file.json.gz','wb') as f:
            f.write(json.dumps(self.docs).encode('utf-8'))
        r = self.solr.stream_file(test_config['SOLR_COLLECTION'],'temp_file.json.gz')
        self.commit()
        queries = 0
        docs = []
        for res in self.solr.paging_query(test_config['SOLR_COLLECTION'], {'q':'*:*'}, rows=50, max_start=502):
            self.assertTrue(len(res.docs) == 50)
            queries +=1
            docs.extend(res.docs)
        ids = [x['id'] for x in docs]

        for item in docs:
            self.assertTrue(item['id'] in ids)

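        # rows=50 with max_start=502 pages through start=0,50,...,500: 11 requests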
        self.assertEqual(11, queries)
        self.delete_docs()
        self.commit()
        try:
            os.remove('temp_file.json.gz')
            os.remove('temp_file.json')
        except OSError:
            pass
Example 4
# (Same assumed imports as in Example 3: gzip, json, logging, os, unittest,
# SolrClient, its exceptions, and the test-suite helpers.)
class ClientTestIndexing(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.solr = SolrClient(test_config['SOLR_SERVER'][0],
                              devel=True,
                              auth=test_config['SOLR_CREDENTIALS'])
        cls.rand_docs = RandomTestData()
        cls.docs = cls.rand_docs.get_docs(50)

        # Reset the collection schema, ignoring fields that are already
        # deleted or already present
        for field in test_config['collections']['copy_fields']:
            try:
                cls.solr.schema.delete_copy_field(
                    test_config['SOLR_COLLECTION'], field)
            except Exception:
                pass

        for field in test_config['collections']['fields']:
            try:
                cls.solr.schema.create_field(test_config['SOLR_COLLECTION'],
                                              field)
            except Exception:
                pass

    def setUp(self):
        self.delete_docs()
        self.commit()

    def delete_docs(self):
        self.solr.delete_doc_by_id(test_config['SOLR_COLLECTION'], '*')
        self.commit()

    def commit(self):
        # softCommit because we don't care about data on disk
        self.solr.commit(test_config['SOLR_COLLECTION'],
                         openSearcher=True,
                         softCommit=True)

    def test_down_solr_exception(self):
        # Connect to a "down" Solr host. The URL was redacted in the original
        # snippet, and the assertion below is a plausible reconstruction
        # (assumption): querying a dead host should raise a ConnectionError.
        s = SolrClient('http://*****:*****')
        with self.assertRaises(ConnectionError):
            s.query(test_config['SOLR_COLLECTION'], {'q': '*:*'})

    @unittest.skip("Skipping for now")
    def test_access_without_auth(self):
        if not test_config['SOLR_CREDENTIALS'][0]:
            return
        solr = SolrClient(test_config['SOLR_SERVER'], devel=True)
        with self.assertRaises(ConnectionError):
            solr.query('SolrClient_unittest', {'q': 'not_gonna_happen'})

    def test_indexing_json(self):
        self.docs = self.rand_docs.get_docs(53)
        self.solr.index_json(test_config['SOLR_COLLECTION'],
                             json.dumps(self.docs))
        self.commit()
        for doc in self.docs:
            logging.debug("Checking {}".format(doc['id']))
            self.assertEqual(
                self.solr.query(test_config['SOLR_COLLECTION'], {
                    'q': 'id:{}'.format(doc['id'])
                }).get_num_found(), 1)
        self.delete_docs()
        self.commit()

    def test_get(self):
        doc_id = '1'
        self.solr.index_json(test_config['SOLR_COLLECTION'],
                             json.dumps([{
                                 'id': doc_id
                             }]))
        # this returns the doc!
        self.solr.get(test_config['SOLR_COLLECTION'], doc_id)
        with self.assertRaises(NotFoundError):
            self.solr.get(test_config['SOLR_COLLECTION'], '5')

    def test_mget(self):
        self.solr.index_json(test_config['SOLR_COLLECTION'],
                             json.dumps([{
                                 'id': '1'
                             }]))
        self.solr.index_json(test_config['SOLR_COLLECTION'],
                             json.dumps([{
                                 'id': '5'
                             }]))
        docs = self.solr.mget(test_config['SOLR_COLLECTION'], ('5', '1'))
        self.assertEqual(len(docs), 2)

    def test_indexing_conn_log(self):
        self.docs = self.rand_docs.get_docs(53)
        self.solr.index_json(test_config['SOLR_COLLECTION'],
                             json.dumps(self.docs))
        self.commit()
        for doc in self.docs:
            logging.debug("Checking {}".format(doc['id']))
            self.assertEqual(
                self.solr.query(test_config['SOLR_COLLECTION'], {
                    'q': 'id:{}'.format(doc['id'])
                }).get_num_found(), 1)
        logging.info(self.solr.transport._action_log)
        self.delete_docs()
        self.commit()

    def test_index_json_file(self):
        self.docs = self.rand_docs.get_docs(55)
        with open('temp_file.json', 'w') as f:
            json.dump(self.docs, f)
        r = self.solr.stream_file(test_config['SOLR_COLLECTION'],
                                  'temp_file.json')
        self.commit()
        r = self.solr.query(test_config['SOLR_COLLECTION'], {'q': '*:*'})
        self.assertEqual(r.get_num_found(), len(self.docs))
        self.delete_docs()
        self.commit()
        try:
            os.remove('temp_file.json.gz')
            os.remove('temp_file.json')
        except OSError:
            pass

    def test_stream_file_gzip_file(self):
        self.docs = self.rand_docs.get_docs(60)
        with gzip.open('temp_file.json.gz', 'wb') as f:
            f.write(json.dumps(self.docs).encode('utf-8'))
        r = self.solr.stream_file(test_config['SOLR_COLLECTION'],
                                  'temp_file.json.gz')
        self.commit()
        r = self.solr.query(test_config['SOLR_COLLECTION'], {'q': '*:*'})
        self.assertEqual(r.get_num_found(), len(self.docs))
        self.delete_docs()
        self.commit()
        try:
            os.remove('temp_file.json.gz')
            os.remove('temp_file.json')
        except OSError:
            pass

    @unittest.skip("Don't test remote indexing in travis")
    def test_index_json_file(self):
        self.docs = self.rand_docs.get_docs(61)
        with open('temp_file.json', 'w') as f:
            json.dump(self.docs, f)
        r = self.solr.local_index(test_config['SOLR_COLLECTION'],
                                  'temp_file.json')
        self.commit()
        r = self.solr.query(test_config['SOLR_COLLECTION'], {'q': '*:*'})
        self.assertEqual(r.get_num_found(), len(self.docs))
        self.delete_docs()
        self.commit()
        try:
            os.remove('temp_file.json.gz')
            os.remove('temp_file.json')
        except OSError:
            pass

    def test_paging_query_with_rows(self):
        self.docs = self.rand_docs.get_docs(1000)
        with gzip.open('temp_file.json.gz', 'wb') as f:
            f.write(json.dumps(self.docs).encode('utf-8'))
        r = self.solr.stream_file(test_config['SOLR_COLLECTION'],
                                  'temp_file.json.gz')
        self.commit()
        queries = 0
        docs = []
        for res in self.solr.paging_query(test_config['SOLR_COLLECTION'],
                                          {'q': '*:*'},
                                          rows=50):
            self.assertTrue(len(res.docs) == 50)
            docs.extend(res.docs)
            queries += 1
        self.assertEqual(
            [x['id'] for x in sorted(docs, key=lambda x: x['id'])],
            [x['id'] for x in sorted(self.docs, key=lambda x: x['id'])])
        self.assertEqual(1000 // 50, queries)
        self.delete_docs()
        self.commit()
        try:
            os.remove('temp_file.json.gz')
            os.remove('temp_file.json')
        except OSError:
            pass

    def test_paging_query(self):
        self.docs = self.rand_docs.get_docs(1000)
        with gzip.open('temp_file.json.gz', 'wb') as f:
            f.write(json.dumps(self.docs).encode('utf-8'))
        r = self.solr.stream_file(test_config['SOLR_COLLECTION'],
                                  'temp_file.json.gz')
        self.commit()
        queries = 0
        docs = []
        for res in self.solr.paging_query(test_config['SOLR_COLLECTION'],
                                          {'q': '*:*'}):
            self.assertTrue(len(res.docs) == 1000)
            docs.extend(res.docs)
            queries += 1
        self.assertTrue(queries == 1)
        self.assertEqual(
            [x['id'] for x in sorted(docs, key=lambda x: x['id'])],
            [x['id'] for x in sorted(self.docs, key=lambda x: x['id'])])
        self.delete_docs()
        self.commit()
        try:
            os.remove('temp_file.json.gz')
            os.remove('temp_file.json')
        except OSError:
            pass

    def test_paging_query_with_max(self):
        self.docs = self.rand_docs.get_docs(1000)
        with gzip.open('temp_file.json.gz', 'wb') as f:
            f.write(json.dumps(self.docs).encode('utf-8'))
        r = self.solr.stream_file(test_config['SOLR_COLLECTION'],
                                  'temp_file.json.gz')
        self.commit()
        queries = 0
        docs = []
        for res in self.solr.paging_query(test_config['SOLR_COLLECTION'],
                                          {'q': '*:*'},
                                          rows=50,
                                          max_start=502):
            self.assertTrue(len(res.docs) == 50)
            queries += 1
            docs.extend(res.docs)
        ids = [x['id'] for x in docs]

        for item in docs:
            self.assertTrue(item['id'] in ids)

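        # rows=50 with max_start=502 pages through start=0,50,...,500: 11 requests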
        self.assertEqual(11, queries)
        self.delete_docs()
        self.commit()
        try:
            os.remove('temp_file.json.gz')
            os.remove('temp_file.json')
        except OSError:
            pass

    def test_cursor_query(self):
        self.docs = self.rand_docs.get_docs(2000)
        with gzip.open('temp_file.json.gz', 'wb') as f:
            f.write(json.dumps(self.docs).encode('utf-8'))
        r = self.solr.stream_file(test_config['SOLR_COLLECTION'],
                                  'temp_file.json.gz')
        self.commit()
        queries = 0
        docs = []

        for res in self.solr.cursor_query(test_config['SOLR_COLLECTION'], {
                'q': '*:*',
                'rows': 100
        }):
            self.assertTrue(len(res.docs) == 100)
            queries += 1
            docs.extend(res.docs)

        ids = [x['id'] for x in docs]

        for item in docs:
            self.assertTrue(item['id'] in ids)

        self.delete_docs()
        self.commit()
        try:
            os.remove('temp_file.json.gz')
            os.remove('temp_file.json')
        except OSError:
            pass
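
A minimal runner, assuming either test class above is saved as a module alongside the SolrClient test-suite helpers:

if __name__ == '__main__':
    logging.basicConfig(level=logging.DEBUG)
    unittest.main()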