class LogIndexer(object):
    """Index log documents into a Solr core.

    datetime values are converted to Solr's UTC string form before
    indexing; naive datetimes are assumed to be Europe/Paris local time
    (TODO confirm this matches the log producer's timezone).
    """

    def __init__(self, solrAddr):
        """Open a connection to the Solr core at *solrAddr*."""
        self.solr = Solr(solrAddr)

    def index(self, data):
        """Convert datetime fields of *data* in place, then index it.

        Best-effort: an indexing failure is reported but not raised, and
        a commit is issued regardless (preserving the original behaviour).
        """
        for key, value in data.items():
            if isinstance(value, datetime.datetime):
                try:
                    value = solr.core.utc_to_string(value)
                except Exception:
                    # Conversion failed — presumably a naive datetime.
                    # Localize to Europe/Paris and retry.
                    pst = tz.gettz('Europe/Paris')
                    value = value.replace(tzinfo=pst)
                    value = solr.core.utc_to_string(value)
                data[key] = value
        try:
            self.solr.update([data])
        except Exception as exc:
            # Report the failure instead of swallowing it silently.
            print("Erreur Index request: %s" % exc)
        self.solr.commit()
        print("data indexed")
class SolrUtils:
    """Thin convenience wrapper around a mysolr ``Solr`` connection."""

    def __init__(self, url):
        """Remember *url* and open a Solr connection to it."""
        self.url = url
        self.conn = Solr(url)

    def addJSONDoc(self, doc):
        """Send *doc* to Solr encoded as JSON, without committing."""
        self.conn.update(doc, 'json', commit=False)

    def commit(self):
        """Flush all pending updates to the index."""
        self.conn.commit()
class QueryResultTestCase(unittest.TestCase):
    """Integration tests for mysolr against a local Solr core.

    Assumes the core at ``http://localhost:8983/solr`` is reachable and
    holds exactly four documents before the suite runs.
    """

    def setUp(self):
        self.solr = Solr('http://localhost:8983/solr')

    def _assert_ok(self, response):
        """Shared check: the Solr response reports HTTP 200."""
        self.assertEqual(response.status, 200)

    def test_search(self):
        response = self.solr.search(q='*:*')
        self._assert_ok(response)
        self.assertEqual(response.total_results, 4)
        self.assertEqual(len(response.documents), 4)

    def test_search_cursor(self):
        # Fetching one row at a time should yield one response per document.
        cursor = self.solr.search_cursor(q='*:*')
        pages = 0
        for response in cursor.fetch(1):
            self._assert_ok(response)
            pages += 1
        self.assertEqual(pages, 4)
        # Fetching four at a time should yield a single response.
        cursor = self.solr.search_cursor(q='*:*')
        pages = 0
        for response in cursor.fetch(4):
            self._assert_ok(response)
            pages += 1
        self.assertEqual(pages, 1)

    def test_commit(self):
        self._assert_ok(self.solr.commit())

    def test_optimize(self):
        self._assert_ok(self.solr.optimize())

    def test_ping(self):
        self._assert_ok(self.solr.ping())

    def test_is_up(self):
        self.assertEqual(self.solr.is_up(), True)

    def test_update_delete(self):
        # Record how many documents exist before we add any.
        response = self.solr.search(q='*:*')
        self._assert_ok(response)
        total_results = response.total_results

        # Post one document using json.
        response = self.solr.update([{'id' : 1}], input_type='json')
        self._assert_ok(response)
        # Post another document using xml.
        response = self.solr.update([{'id' : 2}], input_type='xml')
        self._assert_ok(response)

        # The index should have grown by exactly two documents.
        response = self.solr.search(q='*:*')
        self._assert_ok(response)
        self.assertEqual(response.total_results, total_results + 2)

        # Now delete both documents posted above, once by query and
        # once by key.
        response = self.solr.delete_by_query('id:1')
        self._assert_ok(response)
        response = self.solr.delete_by_key(2)
        self._assert_ok(response)
        response = self.solr.search(q='*:*')
        self._assert_ok(response)
        self.assertEqual(response.total_results, total_results)

    def tearDown(self):
        pass

    def test_query(self):
        pass
class QueryResultTestCase(unittest.TestCase):
    """Integration tests for mysolr; the target core comes from $SOLR_URL.

    The suite assumes the core already contains exactly four documents.
    """

    def setUp(self):
        # Core under test is configured through the environment.
        self.solr = Solr(os.getenv('SOLR_URL'))

    def test_search(self):
        response = self.solr.search(q='*:*')
        self.assertEqual(response.status, 200)
        self.assertEqual(response.total_results, 4)
        self.assertEqual(len(response.documents), 4)

    def test_search_cursor(self):
        # One document per page -> four pages.
        cursor = self.solr.search_cursor(q='*:*')
        seen = 0
        for response in cursor.fetch(1):
            self.assertEqual(response.status, 200)
            seen += 1
        self.assertEqual(seen, 4)
        # Four documents per page -> a single page.
        cursor = self.solr.search_cursor(q='*:*')
        seen = 0
        for response in cursor.fetch(4):
            self.assertEqual(response.status, 200)
            seen += 1
        self.assertEqual(seen, 1)

    def test_commit(self):
        self.assertEqual(self.solr.commit().status, 200)

    def test_optimize(self):
        self.assertEqual(self.solr.optimize().status, 200)

    def test_ping(self):
        self.assertEqual(self.solr.ping().status, 200)

    def test_is_up(self):
        self.assertEqual(self.solr.is_up(), True)

    def test_update_delete(self):
        # Baseline document count.
        response = self.solr.search(q='*:*')
        self.assertEqual(response.status, 200)
        total_results = response.total_results

        # Index one document through each supported input type.
        response = self.solr.update([{'id': 1}], input_type='json')
        self.assertEqual(response.status, 200)
        response = self.solr.update([{'id': 2}], input_type='xml')
        self.assertEqual(response.status, 200)

        # Both documents should now be visible.
        response = self.solr.search(q='*:*')
        self.assertEqual(response.status, 200)
        self.assertEqual(response.total_results, total_results + 2)

        # Remove them again, once by query and once by key.
        response = self.solr.delete_by_query('id:1')
        self.assertEqual(response.status, 200)
        response = self.solr.delete_by_key(2)
        self.assertEqual(response.status, 200)

        response = self.solr.search(q='*:*')
        self.assertEqual(response.status, 200)
        self.assertEqual(response.total_results, total_results)

    def tearDown(self):
        pass

    def test_query(self):
        pass
}, { 'q' : 'foo:bar' } ] # using 10 threads responses = solr.async_search(queries, size=10) # See installation section for further information about how to install this feature. # Indexing documents from mysolr import Solr solr = Solr() # Create documents documents = [ {'id' : 1, 'field1' : 'foo' }, {'id' : 2, 'field2' : 'bar' } ] # Index using json is faster! solr.update(documents, 'json', commit=False) # Manual commit solr.commit()
def del_all_index(server):
    """Delete every document from the Solr core at *server*, then commit."""
    conn = Solr(server)
    conn.delete_by_query('*:*')
    conn.commit()
def _row_to_doc(row, sites):
    """Build a flat Solr document from one Mongo crawl row.

    Mandatory fields (id, url, site, ip) are always copied; every other
    field is copied only when present on *row*. *sites* maps a URL's
    domain to its site label.
    """
    doc = {
        'id': str(row['_id']),
        'url': row['curi:url'],
        'site': sites[get_url_domain(row['curi:url'])],
        'ip': row['curi:ip'],
    }
    # (mongo key, solr field) pairs that are copied only when present.
    optional = (
        ('curi:processed_at', 'processed_at'),
        ('content_type', 'content_type'),
        ('content_length', 'content_length'),
        ('class_key', 'class_key'),
        ('host', 'host'),
        ('curi:request', 'request'),
        ('content:headers', 'headers'),
        ('text', 'text'),
        ('title', 'title'),
        ('parse:keywords', 'keywords'),
        ('parse:content-encoding', 'content_encoding'),
        ('content:raw_data', 'raw_data'),
    )
    for src, dst in optional:
        if src in row:
            doc[dst] = row[src]
    return doc


def index():
    """Incrementally index crawled pages from MongoDB into Solr.

    Resumes after the highest ObjectId already present in Solr and
    commits in batches of 100 documents, flushing any final partial
    batch at the end.
    """
    client = MongoClient(host=HOST, port=PORT)
    coll = client['crawl']['web']
    server = Solr(SERVER)

    max_indexed_id = get_max_indexed_id(server)
    if not max_indexed_id:
        # 12-character string acts as a 12-byte sentinel ObjectId under
        # Python 2 (smaller than any real id). NOTE(review): under
        # Python 3 ObjectId would require a 24-char hex string here.
        max_indexed_id = ObjectId('000000000000')
    else:
        max_indexed_id = ObjectId(max_indexed_id)

    sites = get_host_name()
    step = 100
    count = 0
    jdocs = []
    start = end = None
    for row in coll.find({'_id': {'$gt': max_indexed_id}}).sort([('_id', 1)]):
        if not jdocs:
            # Remember where this batch starts, for the progress message.
            start = row['_id']
        jdocs.append(_row_to_doc(row, sites))
        count += 1
        if len(jdocs) >= step:
            end = row['_id']
            response = server.update(jdocs)
            server.commit()
            print(response)
            jdocs = []
            print('commit %d documents. %s to %s' % (count, start, end))
    if jdocs:
        # Flush the final partial batch.
        server.update(jdocs)
        server.commit()
'_id': True, 'year': True, 'court': True, 'court_level': True, 'url': True, 'name': True, 'content': True, 'tags': True, 'subjects': True }): if count % 100 == 0: print count # don't know how else to get solr to take IDs... doc['_id'] = str(doc['_id']) # include subject tag in list of strings if weight greater than or equal to 0.05 if 'subjects' in doc: sub_tmp = [k for k, v in doc['subjects'].items() if v >= 0.05] doc['subjects'] = sub_tmp count += 1 documents.append(doc) # json indexing supposed to be faster # at least with mysolr, doing them as a big list is much faster for 18300 docs # 3 minutes vs 1 min 53 sec print "updating..." solr.update(documents, 'json', commit=False) print "committing..." solr.commit() print "done..."
class QueryResultTestCase(unittest.TestCase):
    """End-to-end checks of the mysolr client against a live core.

    The core named by $SOLR_URL must be up and seeded with exactly four
    documents before the suite runs.
    """

    def setUp(self):
        self.solr = Solr(os.getenv("SOLR_URL"))

    def _match_all(self):
        """Run a match-all query, verify it succeeded, return the response."""
        response = self.solr.search(q="*:*")
        self.assertEqual(response.status, 200)
        return response

    def test_search(self):
        response = self._match_all()
        self.assertEqual(response.total_results, 4)
        self.assertEqual(len(response.documents), 4)

    def test_search_cursor(self):
        # Page size 1 -> four pages; page size 4 -> one page.
        for page_size, expected_pages in ((1, 4), (4, 1)):
            cursor = self.solr.search_cursor(q="*:*")
            pages = 0
            for response in cursor.fetch(page_size):
                self.assertEqual(response.status, 200)
                pages += 1
            self.assertEqual(pages, expected_pages)

    def test_commit(self):
        self.assertEqual(self.solr.commit().status, 200)

    def test_optimize(self):
        self.assertEqual(self.solr.optimize().status, 200)

    def test_ping(self):
        self.assertEqual(self.solr.ping().status, 200)

    def test_is_up(self):
        self.assertEqual(self.solr.is_up(), True)

    def test_update_delete(self):
        baseline = self._match_all().total_results

        # Index one document through each supported input type.
        self.assertEqual(
            self.solr.update([{"id": 1}], input_type="json").status, 200)
        self.assertEqual(
            self.solr.update([{"id": 2}], input_type="xml").status, 200)
        self.assertEqual(self._match_all().total_results, baseline + 2)

        # Delete the documents again, once by query and once by key.
        self.assertEqual(self.solr.delete_by_query("id:1").status, 200)
        self.assertEqual(self.solr.delete_by_key(2).status, 200)
        self.assertEqual(self._match_all().total_results, baseline)

    def tearDown(self):
        pass

    def test_query(self):
        pass