def test_create_simple_forests(self):
    """Create a database from two forest names and check both forests exist.

    Scenario: the database is given the names of two forests before it is
    created. After creation, looking the database up again should report
    exactly those two forests.

    The database is removed in the ``finally`` clause even when the lookup
    returns ``None`` or an assertion fails, so a failed run does not leave
    the database behind on the server.
    """
    conn = Connection(tc.hostname, HTTPDigestAuth(tc.admin, tc.password))
    hosts = Host.list(conn)

    created = Database("simple-forest-create-test-db", hosts[0])
    created.set_forest_names(
        ["simple-forest-create-forest1", "simple-forest-create-forest2"])
    created.create(conn)

    db = Database.lookup(conn, "simple-forest-create-test-db")
    try:
        # Fail with a clear assertion rather than an AttributeError on None.
        self.assertIsNotNone(db)
        self.assertEqual(2, len(db.forest_names()))
        self.assertIn("simple-forest-create-forest1", db.forest_names())
        self.assertIn("simple-forest-create-forest2", db.forest_names())
    finally:
        # Use the looked-up handle when available (as before); otherwise
        # fall back to the handle that created the database so cleanup
        # still happens.
        cleanup_target = db if db is not None else created
        cleanup_target.delete(conn)
def test_create_single_detailed_forest(self):
    """Create a database backed by a single, explicitly described forest.

    Scenario: a forest object carrying a large data directory is built and
    its name is handed to the database. After the database is created, the
    forest looked up by that name should report the same name and the same
    large data directory as the detailed forest.
    """
    credentials = HTTPDigestAuth(tc.admin, tc.password)
    conn = Connection(tc.hostname, credentials)
    first_host = Host.list(conn)[0]

    db = Database("detailed-forest-create-test-db", first_host)
    detailed = Forest(
        "detailed-forest-create-forest1",
        host=first_host,
        large_data_directory=ds.large_data_directory,
    )
    db.set_forest_names([detailed.forest_name()])
    db.create(conn)

    found = Forest.lookup(conn, "detailed-forest-create-forest1")
    try:
        self.assertEqual("detailed-forest-create-forest1",
                         found.forest_name())
        self.assertEqual(ds.large_data_directory,
                         found.large_data_directory())
    finally:
        # Always remove the database created for this test.
        db.delete(conn)
def test_simple_create(self):
    """Exercise the basic create / lookup / delete cycle of a database.

    TODO: The hostname should come from the server's hostname

    Creates a database, checks that it exists by looking it up on the
    server, then deletes it and checks that a second lookup returns
    ``None``.

    :return: None
    """
    conn = Connection(tc.hostname, HTTPDigestAuth(tc.admin, tc.password))
    hosts = Host.list(conn)
    db = Database("test-db", hosts[0])
    db.create(conn)

    validate_db = Database.lookup(conn, "test-db")
    try:
        self.assertIsNotNone(validate_db)
        self.assertEqual('test-db', validate_db.database_name())
    finally:
        # If the lookup returned None, deleting through it would raise
        # AttributeError, hide the assertion failure above and leave the
        # database on the server. Fall back to the creating handle.
        cleanup_target = validate_db if validate_db is not None else db
        cleanup_target.delete(conn)
        validate_db = Database.lookup(conn, "test-db")
        self.assertIsNone(validate_db)
def test_create_simple_forests(self):
    """Create a database from two forest names and check both forests exist.

    Scenario: the database is given the names of two forests before it is
    created. After creation, looking the database up again should report
    exactly those two forests.

    The database is removed in the ``finally`` clause even when the lookup
    returns ``None`` or an assertion fails, so a failed run does not leave
    the database behind on the server.
    """
    conn = Connection(tc.hostname, HTTPDigestAuth(tc.admin, tc.password))
    hosts = Host.list_hosts(conn)

    created = Database("simple-forest-create-test-db", hosts[0].host_name())
    created.set_forests(
        ["simple-forest-create-forest1", "simple-forest-create-forest2"])
    created.create(conn)

    db = Database.lookup("simple-forest-create-test-db", conn)
    try:
        # Fail with a clear assertion rather than an AttributeError on None.
        self.assertIsNotNone(db)
        self.assertEqual(2, len(db.forests()))
        self.assertIn("simple-forest-create-forest1", db.forests())
        self.assertIn("simple-forest-create-forest2", db.forests())
    finally:
        # Use the looked-up handle when available (as before); otherwise
        # fall back to the handle that created the database so cleanup
        # still happens.
        cleanup_target = db if db is not None else created
        cleanup_target.remove(conn)
def test_create_single_detailed_forest(self):
    """Create a database backed by a single, explicitly described forest.

    Scenario: a forest object carrying a large data directory is handed to
    the database. After the database is created, the forest looked up by
    name should report the same name and the same large data directory as
    the detailed forest.
    """
    credentials = HTTPDigestAuth(tc.admin, tc.password)
    conn = Connection(tc.hostname, credentials)
    host_list = Host.list_hosts(conn)

    db = Database("detailed-forest-create-test-db", host_list[0].host_name())
    detailed = Forest(
        "detailed-forest-create-forest1",
        host=host_list[0].host_name(),
        large_data_directory=ds.large_data_directory,
    )
    db.set_forests([detailed])
    db.create(conn)

    found = Forest.lookup("detailed-forest-create-forest1", conn)
    try:
        self.assertEqual("detailed-forest-create-forest1", found.name())
        self.assertEqual(ds.large_data_directory,
                         found.large_data_directory())
    finally:
        # Always remove the database created for this test.
        db.remove(conn)
def test_simple_create(self):
    """Exercise the basic create / lookup / remove cycle of a database.

    TODO: The hostname should come from the server's hostname

    Creates a database, checks that it exists by looking it up on the
    server, then removes it and checks that a second lookup returns
    ``None``.

    :return: None
    """
    conn = Connection(tc.hostname, HTTPDigestAuth(tc.admin, tc.password))
    hosts = Host.list_hosts(conn)
    db = Database("test-db", hosts[0].host_name())
    db.create(conn)

    validate_db = Database.lookup("test-db", conn)
    try:
        self.assertIsNotNone(validate_db)
        self.assertEqual('test-db', validate_db.database_name())
    finally:
        # If the lookup returned None, removing through it would raise
        # AttributeError, hide the assertion failure above and leave the
        # database on the server. Fall back to the creating handle.
        cleanup_target = validate_db if validate_db is not None else db
        cleanup_target.remove(conn)
        validate_db = Database.lookup("test-db", conn)
        self.assertIsNone(validate_db)
def test_create_single_detailed_forest(self):
    """Create a database backed by a single, explicitly described forest.

    Scenario: a forest object is built (with an empty large data directory)
    and its name is handed to the database. After the database is created,
    the forest looked up by that name should report the expected name.
    """
    first_host = Host.list(self.connection)[0]

    db = Database("detailed-forest-create-test-db", first_host)
    detailed = Forest("detailed-forest-create-forest1", host=first_host)
    detailed.set_large_data_directory("")
    db.set_forest_names([detailed.forest_name()])
    db.create(self.connection)

    found = Forest.lookup(self.connection, "detailed-forest-create-forest1")
    try:
        assert "detailed-forest-create-forest1" == found.forest_name()
        # The large data directory is deliberately not compared: it is,
        # oddly, not present in the looked-up forest's properties.
    finally:
        # Always remove the database created for this test.
        db.delete(connection=self.connection)
def test_create_single_detailed_forest(self):
    """Create a database backed by a single, explicitly described forest.

    Scenario: a forest object is built (with an empty large data directory)
    and its name is handed to the database. After the database is created,
    the forest looked up by that name should report the expected name.
    """
    available_hosts = Host.list(self.connection)
    target_host = available_hosts[0]

    db = Database("detailed-forest-create-test-db", target_host)
    requested = Forest("detailed-forest-create-forest1", host=target_host)
    requested.set_large_data_directory("")
    forest_names = [requested.forest_name()]
    db.set_forest_names(forest_names)
    db.create(self.connection)

    stored = Forest.lookup(self.connection, "detailed-forest-create-forest1")
    try:
        assert "detailed-forest-create-forest1" == stored.forest_name()
        # The large data directory is deliberately not compared: it is,
        # oddly, not present in the looked-up forest's properties.
    finally:
        # Always remove the database created for this test.
        db.delete(connection=self.connection)
def test_create_simple_forests(self):
    """Create a database from two forest names and check both forests exist.

    Scenario: the database is given the names of two forests before it is
    created. After creation, looking the database up again should report
    exactly those two forests.

    The database is removed in the ``finally`` clause even when the lookup
    returns ``None`` or an assertion fails, so a failed run does not leave
    the database behind on the server.
    """
    hosts = Host.list(self.connection)

    created = Database("simple-forest-create-test-db", hosts[0],
                       connection=self.connection)
    created.set_forest_names(
        ["simple-forest-create-forest1", "simple-forest-create-forest2"])
    created.create()

    db = Database.lookup(self.connection, "simple-forest-create-test-db")
    try:
        # Fail with a clear assertion rather than an AttributeError on None.
        assert db is not None
        assert 2 == len(db.forest_names())
        assert "simple-forest-create-forest1" in db.forest_names()
        assert "simple-forest-create-forest2" in db.forest_names()
    finally:
        # Use the looked-up handle when available (as before); otherwise
        # fall back to the handle that created the database so cleanup
        # still happens.
        cleanup_target = db if db is not None else created
        cleanup_target.delete(connection=self.connection)
def test_create_simple_forests(self):
    """Create a database from two forest names and check both forests exist.

    Scenario: the database is given the names of two forests before it is
    created. After creation, looking the database up again should report
    exactly those two forests.

    The database is removed in the ``finally`` clause even when the lookup
    returns ``None`` or an assertion fails, so a failed run does not leave
    the database behind on the server.
    """
    hosts = Host.list(self.connection)

    created = Database("simple-forest-create-test-db", hosts[0],
                       connection=self.connection)
    created.set_forest_names(["simple-forest-create-forest1",
                              "simple-forest-create-forest2"])
    created.create()

    db = Database.lookup(self.connection, "simple-forest-create-test-db")
    try:
        # Fail with a clear assertion rather than an AttributeError on None.
        assert db is not None
        assert 2 == len(db.forest_names())
        assert "simple-forest-create-forest1" in db.forest_names()
        assert "simple-forest-create-forest2" in db.forest_names()
    finally:
        # Use the looked-up handle when available (as before); otherwise
        # fall back to the handle that created the database so cleanup
        # still happens.
        cleanup_target = db if db is not None else created
        cleanup_target.delete(connection=self.connection)
def test_simple_create(self):
    """Exercise the basic create / lookup / delete cycle of a database.

    TODO: The hostname should come from the server's hostname

    Creates a database, checks that it exists by looking it up on the
    server, then deletes it and checks that a second lookup returns
    ``None``.

    :return: None
    """
    hosts = Host.list(self.connection)
    db = Database("test-db", hosts[0])
    db.create(self.connection)

    validate_db = Database.lookup(self.connection, "test-db")
    try:
        assert validate_db is not None
        assert "test-db" == validate_db.database_name()
    finally:
        # If the lookup returned None, deleting through it would raise
        # AttributeError, hide the assertion failure above and leave the
        # database on the server. Fall back to the creating handle.
        cleanup_target = validate_db if validate_db is not None else db
        cleanup_target.delete(connection=self.connection)
        validate_db = Database.lookup(self.connection, "test-db")
        assert validate_db is None
def test_simple_create(self):
    """Exercise the basic create / lookup / delete cycle of a database.

    TODO: The hostname should come from the server's hostname

    Creates a database, checks that it exists by looking it up on the
    server, then deletes it and checks that a second lookup returns
    ``None``.

    :return: None
    """
    hosts = Host.list(self.connection)
    db = Database("test-db", hosts[0])
    db.create(self.connection)

    validate_db = Database.lookup(self.connection, "test-db")
    try:
        assert validate_db is not None
        assert 'test-db' == validate_db.database_name()
    finally:
        # If the lookup returned None, deleting through it would raise
        # AttributeError, hide the assertion failure above and leave the
        # database on the server. Fall back to the creating handle.
        cleanup_target = validate_db if validate_db is not None else db
        cleanup_target.delete(connection=self.connection)
        validate_db = Database.lookup(self.connection, "test-db")
        assert validate_db is None
#
# Copyright 2015 MarkLogic Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from marklogic.models import Connection, Database, FieldRange, Field, FieldReference
from requests.auth import HTTPDigestAuth

# Example script: define a database with one field and a range index on
# that field, then create the database on the server.

# Connection to the target server, using HTTP digest authentication.
auth = HTTPDigestAuth("admin", "admin")
conn = Connection("192.168.57.141", auth)

# Local description of the database; nothing is sent until create().
db = Database("range-field-test", "localhost.localdomain")

# The field includes a single reference (namespace URI + local name).
field = Field(
    "test-field",
    includes=[FieldReference("http://foo.bar.com/invoice", "id")],
)
db.add_field(field)

# Range index of scalar type "int" over the field declared above.
db.add_index(FieldRange("test-field", "int"))

db.create(conn)
__author__ = 'phoehne'

from requests.auth import HTTPDigestAuth
from marklogic.models import Database, Connection, Host, HttpServer, ElementRange, ElementAttributeRange

# Example script: create a content database and a modules database, add
# range indexes to the content database, expose both through an HTTP
# server and load a sample document.

conn = Connection("192.168.57.141", HTTPDigestAuth("admin", "admin"))

# Host name of the first host reported by the server. ``hosts`` is bound to
# the same string, as in the original chained assignment.
first_host = Host.list_hosts(conn)[0]
server_hostname = hosts = first_host.host_name()

# Content database: create it, then load one document through the object
# returned by create().
db = Database("test-one", server_hostname)
created_db = db.create(conn)
created_db.load_file(
    conn,
    "example_doc.json",
    "/test/document.json",
    ["example", "collection"],
)

# Modules database.
modules = Database("test-one-modules", server_hostname)
modules.create(conn)

# Look the content database up again, add two range indexes and save.
db = Database.lookup("test-one", conn)
order_id_index = ElementRange("order-id", u'int')
db.add_index(order_id_index)
customer_id_index = ElementAttributeRange(
    "customer",
    "id",
    scalar_type=u'int',
    element_namespace="http://foo.bar.com",
)
db.add_index(customer_id_index)
db.save(conn)

# HTTP server on port 8400 wired to the content and modules databases.
srvr = HttpServer("test-one-http", 8400)
with_content = srvr.set_content_database(db.config[u'database-name'])
with_content.set_modules_database(modules.config[u'database-name'])
srvr.create(conn)

db.load_file(conn, "example_doc.json", "/example/file.json", ["test"])
__author__ = 'phoehne'

from requests.auth import HTTPDigestAuth
from marklogic.models import Database, Connection, Host, HttpServer, ElementRange, ElementAttributeRange

# Example script: create a content database and a modules database, add
# range indexes to the content database, expose both through an HTTP
# server, then load a sample document and the contents of a directory.

conn = Connection("192.168.57.141", HTTPDigestAuth("admin", "admin"))

# Host name of the first host reported by the server. ``hosts`` is bound to
# the same string, as in the original chained assignment.
first_host = Host.list_hosts(conn)[0]
server_hostname = hosts = first_host.host_name()

# Content database: create it, then load one document through the object
# returned by create().
db = Database("test-one", server_hostname)
created_db = db.create(conn)
created_db.load_file(
    conn,
    "example_doc.json",
    "/test/document.json",
    ["example", "collection"],
)

# Modules database.
modules = Database("test-one-modules", server_hostname)
modules.create(conn)

# Look the content database up again, add two range indexes and save.
db = Database.lookup("test-one", conn)
order_id_index = ElementRange("order-id", u'int')
db.add_index(order_id_index)
customer_id_index = ElementAttributeRange(
    "customer",
    "id",
    scalar_type=u'int',
    element_namespace="http://foo.bar.com",
)
db.add_index(customer_id_index)
db.save(conn)

# HTTP server on port 8400 wired to the content and modules databases.
srvr = HttpServer("test-one-http", 8400)
with_content = srvr.set_content_database(db.config[u'database-name'])
with_content.set_modules_database(modules.config[u'database-name'])
srvr.create(conn)

# Single-file load followed by the two directory-based loads.
db.load_file(conn, "example_doc.json", "/example/file.json", ["test"])
db.load_directory_files(conn, "data", "/test/data/", ["test2"])
db.load_directory(conn, "data", collections=["this", "that"])
class ProcessingPipeline(object):
    """Item pipeline that writes crawl results to disk and, optionally, MarkLogic.

    Each item is serialized as XML or JSON (per ``MARKLOGIC_FORMAT``) into
    ``OUT_FOLDER`` under the name ``<urlSHA1>.<format>``. When
    ``MARKLOGIC_UPLOAD`` is truthy the written file is also loaded into the
    configured MarkLogic database, and removed afterwards unless
    ``MARKLOGIC_KEEP_ORIGINAL_FILE`` is set.
    """

    # Child elements emitted by Item2XML, in document order.
    _XML_FIELDS = ('uuid', 'crawledTime', 'url', 'title', 'fullText',
                   'titleSHA1', 'urlSHA1', 'fullTextSHA1')

    def __init__(self):
        """Read the settings, make sure the output folder exists and, when
        uploads are enabled, connect to MarkLogic and create the target
        database if it does not exist yet."""
        self.name = settings['BOT_NAME']
        self.OUTFolder = settings['OUT_FOLDER']
        log.msg("---{0} OUT_FOLDER={1}".format(self.name, self.OUTFolder), level=log.DEBUG)
        if not os.path.exists(self.OUTFolder):
            os.makedirs(self.OUTFolder)
        self.fmt = settings['MARKLOGIC_FORMAT']
        self.DO_MarkLogic = settings['MARKLOGIC_UPLOAD']
        if self.DO_MarkLogic:
            self.KeepOrig = settings['MARKLOGIC_KEEP_ORIGINAL_FILE']
            self.ML_Host = settings['MARKLOGIC_HOSTNAME']
            log.msg("---{0} MARKLOGIC_HOSTNAME={1}".format(self.name, self.ML_Host), level=log.DEBUG)
            self.admin = settings['MARKLOGIC_ADMIN']
            self.password = settings['MARKLOGIC_PASSWORD']
            self.db_name = settings['MARKLOGIC_DB']
            log.msg("---{0}: MarkLogic Database initialization: {1}".format(self.name, self.db_name), level=log.DEBUG)
            self.conn = Connection(self.ML_Host, HTTPDigestAuth(self.admin, self.password))
            self.db = Database.lookup(self.conn, self.db_name)
            # Identity check: a missing database is reported as None.
            if self.db is None:
                log.msg("---{0}: MarkLogic Database {1} does not exist, creating on host: {2}.".format(self.name, self.db_name, self.ML_Host), level=log.DEBUG)
                hosts = Host.list(self.conn)
                self.db = Database(self.db_name, hosts[0])
                self.db.create(self.conn)
            else:
                log.msg("---{0}: MarkLogic Database {1} already exists on host: {2}.".format(self.name, self.db_name, self.ML_Host), level=log.DEBUG)

    @staticmethod
    def _first_missing_field(item):
        """Return the name of the first field whose value is falsy, else None."""
        for field_name in item:
            # Check the field's value; the field name itself is never empty.
            if not item[field_name]:
                return field_name
        return None

    def _serialize(self, item):
        """Render ``item`` in the configured format; None if it is unsupported."""
        fmt = self.fmt.lower()
        if fmt == 'xml':
            return self.Item2XML(item)
        if fmt == 'json':
            return self.Item2Json(item)
        return None

    def process_item(self, item, spider):
        """Persist one crawled item and return it.

        :param item: mapping-like crawl result; must provide ``urlSHA1``.
        :param spider: the spider that produced the item (used for logging).
        :return: ``item`` unchanged.
        :raises DropItem: when a field has an empty value or the configured
            output format is neither ``xml`` nor ``json``.
        """
        missing = self._first_missing_field(item)
        if missing is not None:
            raise DropItem("Missing {0}!".format(missing))

        file_path = os.path.join(self.OUTFolder, item['urlSHA1']) + '.' + self.fmt
        if os.path.isfile(file_path):
            # A file for this URL hash already exists: treat as duplicate.
            log.msg("---{0} Crawl duplicate url results found, ignoring!".format(self.name), level=log.DEBUG, spider=spider)
            return item

        # Serialize before opening the file so that an unsupported format
        # does not leave an empty file behind (which would later be taken
        # for a duplicate).
        out_str = self._serialize(item)
        if out_str is None:
            raise DropItem("---{0}: invalid MarkLogic data format {1}!".format(self.name, self.fmt))
        with open(file_path, 'w') as f:
            f.write(out_str)
        log.msg("---{0} Crawl Results were written to {1} file!".format(self.name, self.fmt), level=log.DEBUG, spider=spider)

        if self.DO_MarkLogic:
            ML_DocURI = "/{0}CrawlingResults/{1}.{2}".format(self.name, item['urlSHA1'], self.fmt)
            log.msg("---{0} Crawl MarkLogic Pipeline. Trying to ingest {1}".format(self.name, ML_DocURI), level=log.DEBUG, spider=spider)
            coll_name = "{0}CrawlingResults".format(self.name)
            self.db.load_file(self.conn, file_path, ML_DocURI, [coll_name])
            log.msg("---{0} Crawl Results were written to MarkLogic database!".format(self.name), level=log.DEBUG, spider=spider)
            if not self.KeepOrig:
                os.remove(file_path)
        return item

    def Item2XML(self, item):
        """Return ``item`` as a pretty-printed XML document string.

        The root element is named ``<BOT_NAME>CrawlingResult`` and contains
        one child element per entry of ``_XML_FIELDS``.
        """
        root_name = "{0}CrawlingResult".format(self.name)
        my_xml = ET.Element(root_name)
        for field_name in self._XML_FIELDS:
            ET.SubElement(my_xml, field_name).text = item[field_name]
        out_str = ET.tostring(my_xml, 'utf-8')
        reparsed = minidom.parseString(out_str)
        out_str = reparsed.toprettyxml(indent=" ")
        if not isinstance(out_str, str):
            out_str = out_str.decode("UTF-8")
        return out_str

    def Item2Json(self, item):
        """Return ``item`` serialized as a JSON object string."""
        return json.dumps(dict(item))