def __init__(self):
    """Read the crawler settings and prepare the output targets.

    Creates the local output folder when it does not exist. When the
    ``MARKLOGIC_UPLOAD`` setting is truthy, the MarkLogic connection settings
    are read as well, a connection is built, and the configured database is
    created on the first listed host if a lookup does not find it.
    """
    self.name = settings['BOT_NAME']
    self.OUTFolder = settings['OUT_FOLDER']
    log.msg("---{0} OUT_FOLDER={1}".format(self.name, self.OUTFolder), level=log.DEBUG)
    if not os.path.exists(self.OUTFolder):
        os.makedirs(self.OUTFolder)

    self.fmt = settings['MARKLOGIC_FORMAT']
    self.DO_MarkLogic = settings['MARKLOGIC_UPLOAD']
    if self.DO_MarkLogic:
        self.KeepOrig = settings['MARKLOGIC_KEEP_ORIGINAL_FILE']
        self.ML_Host = settings['MARKLOGIC_HOSTNAME']
        log.msg("---{0} MARKLOGIC_HOSTNAME={1}".format(self.name, self.ML_Host), level=log.DEBUG)
        self.admin = settings['MARKLOGIC_ADMIN']
        self.password = settings['MARKLOGIC_PASSWORD']
        self.db_name = settings['MARKLOGIC_DB']
        log.msg("---{0}: MarkLogic Database initialization: {1}".format(self.name, self.db_name), level=log.DEBUG)
        self.conn = Connection(self.ML_Host, HTTPDigestAuth(self.admin, self.password))
        self.db = Database.lookup(self.conn, self.db_name)
        # Identity test: "no database found" is signalled by None, and "=="
        # would defer to whatever equality the Database class implements.
        if self.db is None:
            log.msg("---{0}: MarkLogic Database {1} does not exist, creating on host: {2}.".format(self.name, self.db_name, self.ML_Host), level=log.DEBUG)
            hosts = Host.list(self.conn)
            self.db = Database(self.db_name, hosts[0])
            self.db.create(self.conn)
        else:
            log.msg("---{0}: MarkLogic Database {1} already exists on host: {2}.".format(self.name, self.db_name, self.ML_Host), level=log.DEBUG)
def test_create_simple_forests(self):
    """
    Test the following scenario:

    The database is given the names of two forests.
    It should then create the two named forests.
    """
    conn = Connection(tc.hostname, HTTPDigestAuth(tc.admin, tc.password))
    hosts = Host.list_hosts(conn)
    db = Database("simple-forest-create-test-db", hosts[0].host_name())
    db.set_forests(["simple-forest-create-forest1", "simple-forest-create-forest2"])
    db.create(conn)

    try:
        created = Database.lookup("simple-forest-create-test-db", conn)
        self.assertIsNotNone(created)
        forests = created.forests()
        self.assertEqual(2, len(forests))
        self.assertIn("simple-forest-create-forest1", forests)
        self.assertIn("simple-forest-create-forest2", forests)
    finally:
        # Clean up through the object that created the database: if the
        # lookup returned None, removing via the lookup result would raise
        # AttributeError, hide the real failure and leave the database behind.
        db.remove(conn)
def test_simple_create(self):
    """
    TODO: The hostname should come from the server's hostname

    Test the basic create function. Creates a database and then checks
    that it exists by getting the database configuration from the server.
    It then destroys the database and checks that it can no longer be found.

    :return: None
    """
    conn = Connection(tc.hostname, HTTPDigestAuth(tc.admin, tc.password))
    hosts = Host.list_hosts(conn)
    db = Database("test-db", hosts[0].host_name())
    db.create(conn)

    validate_db = Database.lookup("test-db", conn)
    try:
        self.assertIsNotNone(validate_db)
        self.assertEqual('test-db', validate_db.database_name())
    finally:
        # Guarded cleanup: calling remove() on a None lookup result would
        # raise AttributeError and replace the assertion failure above.
        if validate_db is not None:
            validate_db.remove(conn)

    validate_db = Database.lookup("test-db", conn)
    self.assertIsNone(validate_db)
def test_create_paths(self):
    """Adding a path namespace records it on the database and returns the database."""
    prefix = "inv"
    uri = 'http://foo.bar.com/invoice'

    database = Database(u'testdb')
    # A fresh database configuration has no path namespaces yet.
    self.assertNotIn('path-namespaces', database._config)

    result = database.add_path_namespace(PathNamespace(prefix, uri))

    stored = database.path_namespaces()
    self.assertEqual(1, len(stored))
    first = stored[0]
    self.assertEqual(prefix, first.prefix())
    self.assertEqual(uri, first.namespace_uri())
    self.assertEqual(database, result)
def test_create_paths(self):
    """Adding a path namespace records it on the database and returns the database."""
    prefix = "inv"
    uri = 'http://foo.bar.com/invoice'

    database = Database(u'testdb')
    # A fresh database configuration has no path namespaces yet.
    assert 'path-namespaces' not in database._config

    result = database.add_path_namespace(PathNamespace(prefix, uri))

    stored = database.path_namespaces()
    assert 1 == len(stored)
    first = stored[0]
    assert prefix == first.prefix()
    assert uri == first.namespace_uri()
    assert database == result
def test_create_paths(self):
    """A path namespace added to a database can be read back unchanged."""
    expected_prefix = "inv"
    expected_uri = 'http://foo.bar.com/invoice'
    namespace = PathNamespace(expected_prefix, expected_uri)

    testdb = Database(u'testdb')
    assert 'path-namespaces' not in testdb._config

    # add_path_namespace() is expected to hand back the database itself.
    returned = testdb.add_path_namespace(namespace)

    all_namespaces = testdb.path_namespaces()
    assert 1 == len(all_namespaces)
    only = all_namespaces[0]
    assert expected_prefix == only.prefix()
    assert expected_uri == only.namespace_uri()
    assert testdb == returned
def test_create_paths(self):
    """Adding a path namespace by prefix and URI stores it as a config entry."""
    prefix = "inv"
    uri = 'http://foo.bar.com/invoice'

    database = Database(u'testdb')
    # A fresh database configuration has no path namespaces yet.
    self.assertNotIn(u'path-namespaces', database.config)

    result = database.add_path_namespace(prefix, uri)

    stored = database.path_namespaces()
    self.assertEqual(1, len(stored))
    entry = stored[0]
    self.assertEqual(prefix, entry[u'prefix'])
    self.assertEqual(uri, entry[u'namespace-uri'])
    self.assertEqual(database, result)
def test_create_paths(self):
    """A path namespace added to a database can be read back unchanged."""
    expected_prefix = "inv"
    expected_uri = 'http://foo.bar.com/invoice'
    namespace = PathNamespace(expected_prefix, expected_uri)

    testdb = Database('testdb')
    self.assertNotIn('path-namespaces', testdb._config)

    # add_path_namespace() is expected to hand back the database itself.
    returned = testdb.add_path_namespace(namespace)

    all_namespaces = testdb.path_namespaces()
    self.assertEqual(1, len(all_namespaces))
    only = all_namespaces[0]
    self.assertEqual(expected_prefix, only.prefix())
    self.assertEqual(expected_uri, only.namespace_uri())
    self.assertEqual(testdb, returned)
def test_create_single_detailed_forest(self):
    """
    Test the following scenario:

    A forest is described with a large data directory and the database is
    given that forest's name. After the database is created, looking the
    forest up should return the same name and large data directory.
    """
    forest_name = "detailed-forest-create-forest1"

    conn = Connection(tc.hostname, HTTPDigestAuth(tc.admin, tc.password))
    host = Host.list(conn)[0]

    db = Database("detailed-forest-create-test-db", host)
    described = Forest(forest_name, host=host,
                       large_data_directory=ds.large_data_directory)
    db.set_forest_names([described.forest_name()])
    db.create(conn)

    found = Forest.lookup(conn, forest_name)
    try:
        self.assertEqual(forest_name, found.forest_name())
        self.assertEqual(ds.large_data_directory, found.large_data_directory())
    finally:
        # Always drop the database, whether or not the assertions passed.
        db.delete(conn)
def test_create_single_detailed_forest(self):
    """
    Test the following scenario:

    A forest is described (with an empty large data directory) and the
    database is given that forest's name. After the database is created,
    looking the forest up should return a forest with the same name.
    """
    forest_name = "detailed-forest-create-forest1"

    host = Host.list(self.connection)[0]
    db = Database("detailed-forest-create-test-db", host)

    described = Forest(forest_name, host=host)
    described.set_large_data_directory("")
    db.set_forest_names([described.forest_name()])
    db.create(self.connection)

    found = Forest.lookup(self.connection, forest_name)
    try:
        assert forest_name == found.forest_name()
        # The large data directory is not asserted: per the original note it
        # does not show up in the properties of the looked-up forest.
    finally:
        # Always drop the database, whether or not the assertion passed.
        db.delete(connection=self.connection)
def test_list_databases(self):
    """Listing databases returns more than four names, including the stock ones."""
    names = Database.list(self.connection)

    assert len(names) > 4
    for expected in ("Modules", "Documents"):
        assert expected in names
def test_list_databases(self):
    """Listing databases returns more than four names, including "Modules" and "Documents"."""
    conn = Connection(tc.hostname, HTTPDigestAuth(tc.admin, tc.password))
    db_names = Database.list(conn)

    self.assertGreater(len(db_names), 4)
    # assertIn reports the missing member and the container on failure,
    # where assertTrue(x in y) only reports "False is not true".
    self.assertIn("Modules", db_names)
    self.assertIn("Documents", db_names)
def test_create_single_detailed_forest(self):
    """
    Test the following scenario:

    The database is given a forest object that carries a large data
    directory. After the database is created, looking the forest up by name
    should return a forest with the same name and large data directory.
    """
    forest_name = "detailed-forest-create-forest1"

    conn = Connection(tc.hostname, HTTPDigestAuth(tc.admin, tc.password))
    host_name = Host.list_hosts(conn)[0].host_name()

    db = Database("detailed-forest-create-test-db", host_name)
    described = Forest(forest_name, host=host_name,
                       large_data_directory=ds.large_data_directory)
    db.set_forests([described])
    db.create(conn)

    found = Forest.lookup(forest_name, conn)
    try:
        self.assertEqual(forest_name, found.name())
        self.assertEqual(ds.large_data_directory, found.large_data_directory())
    finally:
        # Always drop the database, whether or not the assertions passed.
        db.remove(conn)
def test_create_single_detailed_forest(self):
    """
    Test the following scenario:

    A forest is described (with an empty large data directory) and the
    database is given that forest's name. After the database is created,
    looking the forest up should return a forest with the same name.
    """
    forest_name = "detailed-forest-create-forest1"
    connection = self.connection

    first_host = Host.list(connection)[0]
    database = Database("detailed-forest-create-test-db", first_host)

    detailed = Forest(forest_name, host=first_host)
    detailed.set_large_data_directory("")
    database.set_forest_names([detailed.forest_name()])
    database.create(connection)

    looked_up = Forest.lookup(connection, forest_name)
    try:
        assert forest_name == looked_up.forest_name()
        # The large data directory is not asserted: per the original note it
        # does not show up in the properties of the looked-up forest.
    finally:
        # Always drop the database, whether or not the assertion passed.
        database.delete(connection=connection)
def test_list_databases(self):
    """Listing databases returns more than four databases, including "Modules" and "Documents"."""
    conn = Connection(tc.hostname, HTTPDigestAuth(tc.admin, tc.password))
    databases = Database.list_databases(conn)

    self.assertGreater(len(databases), 4)

    db_names = [db.database_name() for db in databases]
    # assertIn reports the missing member and the container on failure,
    # where assertTrue(x in y) only reports "False is not true".
    self.assertIn("Modules", db_names)
    self.assertIn("Documents", db_names)
def test_exclude_references(self):
    """A field built with one excluded reference exposes that reference.

    Checks both the list form, ``excludes()``, and the indexed form,
    ``excludes(0)``. No database is involved, so the unused ``Database``
    local from the earlier version was dropped.
    """
    field = Field(
        "invoice-id",
        excludes=[FieldReference("http://foo.bar.com/invoice", "id")])

    self.assertEqual(1, len(field.excludes()))

    excluded = field.excludes(0)
    self.assertEqual("http://foo.bar.com/invoice", excluded.namespace_uri())
    self.assertEqual("id", excluded.localname())
def test_simple_create(self):
    """
    TODO: The hostname should come from the server's hostname

    Test the basic create function. Creates a database and then checks
    that it exists by getting the database configuration from the server.
    It then destroys the database and checks that it can no longer be found.

    :return: None
    """
    conn = Connection(tc.hostname, HTTPDigestAuth(tc.admin, tc.password))
    hosts = Host.list(conn)
    db = Database("test-db", hosts[0])
    db.create(conn)

    validate_db = Database.lookup(conn, "test-db")
    try:
        self.assertIsNotNone(validate_db)
        self.assertEqual('test-db', validate_db.database_name())
    finally:
        # Guarded cleanup: calling delete() on a None lookup result would
        # raise AttributeError and replace the assertion failure above.
        if validate_db is not None:
            validate_db.delete(conn)

    validate_db = Database.lookup(conn, "test-db")
    self.assertIsNone(validate_db)
def test_create_simple_forests(self):
    """
    Test the following scenario:

    The database is given the names of two forests.
    It should then create the two named forests.
    """
    hosts = Host.list(self.connection)
    db = Database("simple-forest-create-test-db", hosts[0], connection=self.connection)
    db.set_forest_names(["simple-forest-create-forest1", "simple-forest-create-forest2"])
    db.create()

    try:
        created = Database.lookup(self.connection, "simple-forest-create-test-db")
        assert created is not None
        forest_names = created.forest_names()
        assert 2 == len(forest_names)
        assert "simple-forest-create-forest1" in forest_names
        assert "simple-forest-create-forest2" in forest_names
    finally:
        # Clean up through the object that created the database: if the
        # lookup returned None, deleting via the lookup result would raise
        # AttributeError, hide the real failure and leave the database behind.
        db.delete(connection=self.connection)
def test_create_field(self):
    """Adding a field stores it under the ``field`` config key with its paths.

    The precondition checks the same ``field`` key that the later
    assertions read. The earlier version checked ``fields``, a key the test
    never reads back, so the precondition could not fail.
    """
    db = Database("testdb")
    self.assertNotIn(u'field', db.config)

    field = Field("invoice-id")
    field.add_path("bill:invoice-id", 1)
    field.add_path("inv:id", 1)

    result = db.add_field(field)
    self.assertIn(u'field', db.config)
    self.assertEqual(result, db)
    self.assertEqual(1, len(db.config[u'field']))

    field = db.fields(0)
    self.assertEqual("invoice-id", field.name())
    self.assertEqual(2, len(field.paths()))
    # Both accessor forms are exercised: indexed paths(0) and list paths()[0].
    self.assertEqual("bill:invoice-id", field.paths(0)[u'path'])
    self.assertEqual(1, field.paths()[0][u'weight'])
def test_create_field_range(self):
    """A field range index added to a database can be read back by position and as a list."""
    field_name = "invoice-id"
    scalar_type = "int"

    database = Database("foo")
    database.add_field(Field(field_name))
    database.add_index(FieldRange(field_name, scalar_type))

    # Indexed form: field_range_index(0) returns the first index.
    first_index = database.field_range_index(0)
    self.assertEqual(field_name, first_index.name())
    self.assertEqual(scalar_type, first_index.type())

    # List form: field_range_index() returns every index.
    all_indexes = database.field_range_index()
    self.assertEqual(1, len(all_indexes))
def test_apply_config(self):
    """Apply one JSON and one XML CMA configuration and check both databases.

    Each configuration names one database (``CMA_Check1`` from JSON,
    ``CMA_Check2`` from XML). Both must be found after applying, and both
    must be gone after being deleted.
    """
    cma = CMA(self.connection)
    config1 = {
        "config": [
            {
                "database": [
                    {
                        "database-name": "CMA_Check1"
                    }
                ]
            }
        ]
    }
    config2 = (
        "<configuration xmlns=\"http://marklogic.com/manage/config\"><configs><config><databases><database>"
        "<database-name>CMA_Check2</database-name></database></databases></config></configs></configuration>"
    )

    cma.apply_config(json.dumps(config1), "application/json")
    cma.apply_config(config2, "application/xml")

    names = ("CMA_Check1", "CMA_Check2")
    found = {}
    try:
        # Look both databases up before asserting anything, so that a
        # failure on the first one still lets the second one be cleaned up.
        for name in names:
            found[name] = Database.lookup(self.connection, name)
        for name in names:
            assert found[name] is not None
            assert name == found[name].database_name()
    finally:
        # Only delete what was actually found: calling delete() on a None
        # lookup result would raise and hide the assertion failure.
        for database in found.values():
            if database is not None:
                database.delete(connection=self.connection)

    for name in names:
        assert Database.lookup(self.connection, name) is None
def test_simple_create(self):
    """TODO: The hostname should come from the server's hostname

    Test the basic create function. Creates a database and then checks
    that it exists by getting the database configuration from the server.
    It then destroys the database and checks that it can no longer be found.

    :return: None
    """
    hosts = Host.list(self.connection)
    db = Database("test-db", hosts[0])
    db.create(self.connection)

    validate_db = Database.lookup(self.connection, "test-db")
    try:
        assert validate_db is not None
        assert "test-db" == validate_db.database_name()
    finally:
        # Guarded cleanup: calling delete() on a None lookup result would
        # raise AttributeError and replace the assertion failure above.
        if validate_db is not None:
            validate_db.delete(connection=self.connection)

    validate_db = Database.lookup(self.connection, "test-db")
    assert validate_db is None
def test_simple_create(self):
    """TODO: The hostname should come from the server's hostname

    Test the basic create function. Creates a database and then checks
    that it exists by getting the database configuration from the server.
    It then destroys the database and checks that it can no longer be found.

    :return: None
    """
    hosts = Host.list(self.connection)
    db = Database("test-db", hosts[0])
    db.create(self.connection)

    validate_db = Database.lookup(self.connection, "test-db")
    try:
        assert validate_db is not None
        assert 'test-db' == validate_db.database_name()
    finally:
        # Guarded cleanup: calling delete() on a None lookup result would
        # raise AttributeError and replace the assertion failure above.
        if validate_db is not None:
            validate_db.delete(connection=self.connection)

    validate_db = Database.lookup(self.connection, "test-db")
    assert validate_db is None
def test_create_simple_forests(self):
    """
    Test the following scenario:

    The database is given the names of two forests.
    It should then create the two named forests.
    """
    conn = Connection(tc.hostname, HTTPDigestAuth(tc.admin, tc.password))
    hosts = Host.list(conn)
    db = Database("simple-forest-create-test-db", hosts[0])
    db.set_forest_names(
        ["simple-forest-create-forest1", "simple-forest-create-forest2"])
    db.create(conn)

    try:
        created = Database.lookup(conn, "simple-forest-create-test-db")
        self.assertIsNotNone(created)
        forest_names = created.forest_names()
        self.assertEqual(2, len(forest_names))
        self.assertIn("simple-forest-create-forest1", forest_names)
        self.assertIn("simple-forest-create-forest2", forest_names)
    finally:
        # Clean up through the object that created the database: if the
        # lookup returned None, deleting via the lookup result would raise
        # AttributeError, hide the real failure and leave the database behind.
        db.delete(conn)
def test_create_simple_forests(self):
    """
    Test the following scenario:

    The database is given the names of two forests.
    It should then create the two named forests.
    """
    hosts = Host.list(self.connection)
    db = Database("simple-forest-create-test-db", hosts[0],
                  connection=self.connection)
    db.set_forest_names(
        ["simple-forest-create-forest1", "simple-forest-create-forest2"])
    db.create()

    try:
        created = Database.lookup(self.connection, "simple-forest-create-test-db")
        assert created is not None
        forest_names = created.forest_names()
        assert 2 == len(forest_names)
        assert "simple-forest-create-forest1" in forest_names
        assert "simple-forest-create-forest2" in forest_names
    finally:
        # Clean up through the object that created the database: if the
        # lookup returned None, deleting via the lookup result would raise
        # AttributeError, hide the real failure and leave the database behind.
        db.delete(connection=self.connection)
def test_no_database_found(self):
    """Looking up a database name that does not exist yields None."""
    missing_name = "No-Such-Database"
    credentials = HTTPDigestAuth(tc.admin, tc.password)
    connection = Connection(tc.hostname, credentials)

    result = Database.lookup(connection, missing_name)

    self.assertIsNone(result)
#
# Copyright 2015 MarkLogic Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Example: create a database that has a field with one included reference
# and an "int" range index on that field.

from marklogic.models import Connection, Database, FieldRange, Field, FieldReference
from requests.auth import HTTPDigestAuth

# Connection details are hard-coded example values.
auth = HTTPDigestAuth("admin", "admin")
conn = Connection("192.168.57.141", auth)

# Describe the field first: it includes a single namespaced reference.
field = Field(
    "test-field",
    includes=[FieldReference("http://foo.bar.com/invoice", "id")])

# Assemble the database locally, then create it on the server in one call.
db = Database("range-field-test", "localhost.localdomain")
db.add_field(field)
db.add_index(FieldRange("test-field", "int"))
db.create(conn)
def test_no_database_found(self):
    """Looking up a database name that does not exist yields None."""
    missing_name = "No-Such-Database"
    result = Database.lookup(self.connection, missing_name)
    assert result is None
class ProcessingPipeline(object):
    """Item pipeline that writes each crawled item to a file and optionally to MarkLogic.

    Every item is serialised as XML or JSON (chosen by the
    ``MARKLOGIC_FORMAT`` setting) into ``OUT_FOLDER``, in a file named after
    the item's ``urlSHA1``. When ``MARKLOGIC_UPLOAD`` is truthy the file is
    also loaded into the configured MarkLogic database.
    """

    def __init__(self):
        """Read the settings, create the output folder and, if enabled, the MarkLogic database."""
        self.name = settings['BOT_NAME']
        self.OUTFolder = settings['OUT_FOLDER']
        log.msg("---{0} OUT_FOLDER={1}".format(self.name, self.OUTFolder), level=log.DEBUG)
        if not os.path.exists(self.OUTFolder):
            os.makedirs(self.OUTFolder)

        self.fmt = settings['MARKLOGIC_FORMAT']
        self.DO_MarkLogic = settings['MARKLOGIC_UPLOAD']
        if self.DO_MarkLogic:
            self.KeepOrig = settings['MARKLOGIC_KEEP_ORIGINAL_FILE']
            self.ML_Host = settings['MARKLOGIC_HOSTNAME']
            log.msg("---{0} MARKLOGIC_HOSTNAME={1}".format(self.name, self.ML_Host), level=log.DEBUG)
            self.admin = settings['MARKLOGIC_ADMIN']
            self.password = settings['MARKLOGIC_PASSWORD']
            self.db_name = settings['MARKLOGIC_DB']
            log.msg("---{0}: MarkLogic Database initialization: {1}".format(self.name, self.db_name), level=log.DEBUG)
            self.conn = Connection(self.ML_Host, HTTPDigestAuth(self.admin, self.password))
            self.db = Database.lookup(self.conn, self.db_name)
            # Identity test: "no database found" is signalled by None.
            if self.db is None:
                log.msg("---{0}: MarkLogic Database {1} does not exist, creating on host: {2}.".format(self.name, self.db_name, self.ML_Host), level=log.DEBUG)
                hosts = Host.list(self.conn)
                self.db = Database(self.db_name, hosts[0])
                self.db.create(self.conn)
            else:
                log.msg("---{0}: MarkLogic Database {1} already exists on host: {2}.".format(self.name, self.db_name, self.ML_Host), level=log.DEBUG)

    def process_item(self, item, spider):
        """Write *item* to disk (once per ``urlSHA1``) and optionally load it into MarkLogic.

        :param item: mapping-like crawl result; must provide ``urlSHA1`` and
            the fields read by the chosen serialiser.
        :param spider: the spider that produced the item; only passed on to
            the log calls.
        :return: the unchanged item.
        :raises DropItem: when iteration yields a falsy entry or the
            configured format is neither ``xml`` nor ``json``.
        """
        for data in item:
            # NOTE(review): for a mapping-like item this iterates the field
            # names, so only a falsy *key* is rejected; checking item[data]
            # was presumably intended -- confirm before changing behaviour.
            if not data:
                raise DropItem("Missing {0}!".format(data))

        file_path = os.path.join(self.OUTFolder, item['urlSHA1']) + '.' + self.fmt
        if os.path.isfile(file_path):
            log.msg("---{0} Crawl duplicate url results found, ignoring!".format(self.name), level=log.DEBUG, spider=spider)
            return item

        # Serialise BEFORE opening the file: opening first left an empty file
        # behind for an invalid format, which the isfile() check above would
        # then treat as an already-processed duplicate.
        fmt = self.fmt.lower()
        if fmt == 'xml':
            out_str = self.Item2XML(item)
        elif fmt == 'json':
            out_str = self.Item2Json(item)
        else:
            # Two placeholders need two arguments; passing only self.fmt
            # raised IndexError instead of the intended DropItem.
            raise DropItem("---{0}: invalid MarkLogic data format {1}!".format(self.name, self.fmt))

        with open(file_path, 'w') as f:
            f.write(out_str)
        log.msg("---{0} Crawl Results were written to {1} file!".format(self.name, self.fmt), level=log.DEBUG, spider=spider)

        if self.DO_MarkLogic:
            ML_DocURI = "/{0}CrawlingResults/{1}.{2}".format(self.name, item['urlSHA1'], self.fmt)
            log.msg("---{0} Crawl MarkLogic Pipeline. Trying to ingest {1}".format(self.name, ML_DocURI), level=log.DEBUG, spider=spider)
            coll_name = "{0}CrawlingResults".format(self.name)
            self.db.load_file(self.conn, file_path, ML_DocURI, [coll_name])
            log.msg("---{0} Crawl Results were written to MarkLogic database!".format(self.name), level=log.DEBUG, spider=spider)
            # KeepOrig is only set when the MarkLogic upload is enabled.
            if not self.KeepOrig:
                os.remove(file_path)
        return item

    def Item2XML(self, item):
        """Return *item* as a pretty-printed XML document string.

        The root element is named ``<BOT_NAME>CrawlingResult`` and gets one
        child per crawl field, in the fixed order listed below.
        """
        root = ET.Element("{0}CrawlingResult".format(self.name))
        # Child order determines the element order in the output document.
        for tag in ('uuid', 'crawledTime', 'url', 'title', 'fullText',
                    'titleSHA1', 'urlSHA1', 'fullTextSHA1'):
            ET.SubElement(root, tag).text = item[tag]

        out_str = ET.tostring(root, 'utf-8')
        reparsed = minidom.parseString(out_str)
        out_str = reparsed.toprettyxml(indent=" ")
        if not isinstance(out_str, str):
            out_str = out_str.decode("UTF-8")
        return out_str

    def Item2Json(self, item):
        """Return *item* serialised as a JSON object string."""
        return json.dumps(dict(item))
def test_no_database_found(self):
    """Looking up a database name that does not exist yields None."""
    missing_name = "No-Such-Database"
    credentials = HTTPDigestAuth(tc.admin, tc.password)
    connection = Connection(tc.hostname, credentials)

    result = Database.lookup(missing_name, connection)

    self.assertIsNone(result)
__author__ = 'phoehne'

# Example: create a content and a modules database, add two range indexes,
# attach an HTTP server to them and load documents.

from requests.auth import HTTPDigestAuth
from marklogic.models import Database, Connection, Host, HttpServer, ElementRange, ElementAttributeRange

# Connection details are hard-coded example values.
conn = Connection("192.168.57.141", HTTPDigestAuth("admin", "admin"))

# Host name of the first listed host; both databases are created on it.
# (The earlier chained assignment also bound this string to a misleading,
# unused name "hosts"; that alias was dropped.)
server_hostname = Host.list_hosts(conn)[0].host_name()

# create() is chained here, so the first document is loaded right away.
db = Database("test-one", server_hostname)
db.create(conn).load_file(conn, "example_doc.json", "/test/document.json",
                          ["example", "collection"])

modules = Database("test-one-modules", server_hostname)
modules.create(conn)

# Look the database up again, add the range indexes and save them.
db = Database.lookup("test-one", conn)
db.add_index(ElementRange("order-id", u'int'))
db.add_index(
    ElementAttributeRange("customer", "id", scalar_type=u'int',
                          element_namespace="http://foo.bar.com"))
db.save(conn)

# HTTP server on port 8400 using the two databases created above.
srvr = HttpServer("test-one-http", 8400)
srvr.set_content_database(db.config[u'database-name']).set_modules_database(
    modules.config[u'database-name'])
srvr.create(conn)

db.load_file(conn, "example_doc.json", "/example/file.json", ["test"])
__author__ = 'phoehne'

# Example: create a content and a modules database, add two range indexes,
# attach an HTTP server to them and load single files and directories.

from requests.auth import HTTPDigestAuth
from marklogic.models import Database, Connection, Host, HttpServer, ElementRange, ElementAttributeRange

# Connection details are hard-coded example values.
conn = Connection("192.168.57.141", HTTPDigestAuth("admin", "admin"))

# Host name of the first listed host; both databases are created on it.
# (The earlier chained assignment also bound this string to a misleading,
# unused name "hosts"; that alias was dropped.)
server_hostname = Host.list_hosts(conn)[0].host_name()

# create() is chained here, so the first document is loaded right away.
db = Database("test-one", server_hostname)
db.create(conn).load_file(conn, "example_doc.json", "/test/document.json",
                          ["example", "collection"])

modules = Database("test-one-modules", server_hostname)
modules.create(conn)

# Look the database up again, add the range indexes and save them.
db = Database.lookup("test-one", conn)
db.add_index(ElementRange("order-id", u'int'))
db.add_index(ElementAttributeRange("customer", "id", scalar_type=u'int',
                                   element_namespace="http://foo.bar.com"))
db.save(conn)

# HTTP server on port 8400 using the two databases created above.
srvr = HttpServer("test-one-http", 8400)
srvr.set_content_database(db.config[u'database-name']).set_modules_database(
    modules.config[u'database-name'])
srvr.create(conn)

# Load one file, then the "data" directory in two different ways.
db.load_file(conn, "example_doc.json", "/example/file.json", ["test"])
db.load_directory_files(conn, "data", "/test/data/", ["test2"])
db.load_directory(conn, "data", collections=["this", "that"])