Esempio n. 1
0
    def test_create_simple_forests(self):
        """
        Test the following scenario:

        The database is given the names of two forests.
        It should then create the two named forests.

        The database is created on the first entry returned by
        ``Host.list``, looked up again by name, and deleted in all cases.
        """
        conn = Connection(tc.hostname, HTTPDigestAuth(tc.admin, tc.password))

        hosts = Host.list(conn)
        db = Database("simple-forest-create-test-db", hosts[0])

        db.set_forest_names(
            ["simple-forest-create-forest1", "simple-forest-create-forest2"])

        db.create(conn)

        # Keep the created handle in `db`; the lookup result gets its own
        # name so that cleanup still has something to call if lookup fails.
        found = Database.lookup(conn, "simple-forest-create-test-db")
        try:
            self.assertIsNotNone(found)

            forest_names = found.forest_names()
            self.assertEqual(2, len(forest_names))

            self.assertIn("simple-forest-create-forest1", forest_names)
            self.assertIn("simple-forest-create-forest2", forest_names)

        finally:
            # Prefer the looked-up handle (the normal path); fall back to
            # the handle used for creation when the lookup returned None, so
            # the real failure is not masked by an AttributeError here.
            cleanup_target = found if found is not None else db
            cleanup_target.delete(conn)
Esempio n. 2
0
    def test_create_single_detailed_forest(self):
        """
        Test the following scenario:

        A forest object carrying a large data directory is built, and its
        name is handed to the database.  After the database is created, the
        forest is looked up by name and must report the same name and large
        data directory as the detailed forest.

        """
        forest_name = "detailed-forest-create-forest1"

        conn = Connection(tc.hostname, HTTPDigestAuth(tc.admin, tc.password))
        hosts = Host.list(conn)

        db = Database("detailed-forest-create-test-db", hosts[0])
        detailed = Forest(forest_name,
                          host=hosts[0],
                          large_data_directory=ds.large_data_directory)

        # Only the forest's name is registered with the database.
        db.set_forest_names([detailed.forest_name()])
        db.create(conn)

        stored = Forest.lookup(conn, forest_name)

        try:
            self.assertEqual("detailed-forest-create-forest1",
                             stored.forest_name())
            self.assertEqual(ds.large_data_directory,
                             stored.large_data_directory())
        finally:
            # Always drop the database, whatever the assertions did.
            db.delete(conn)
Esempio n. 3
0
    def test_simple_create(self):
        """
        TODO: The hostname should come from the server's hostname

        Test the basic create function.  Creates a database and then checks
        that it exists by looking it up on the server.  It then destroys the
        database and checks that a further lookup returns nothing.

        :return: None
        """
        conn = Connection(tc.hostname, HTTPDigestAuth(tc.admin, tc.password))
        hosts = Host.list(conn)
        db = Database("test-db", hosts[0])

        db.create(conn)

        validate_db = Database.lookup(conn, "test-db")
        try:
            self.assertIsNotNone(validate_db)
            self.assertEqual('test-db', validate_db.database_name())

        finally:
            # If the lookup returned None, the assertion above has already
            # failed; fall back to the creating handle so cleanup neither
            # crashes with AttributeError nor leaves the database behind.
            cleanup_target = validate_db if validate_db is not None else db
            cleanup_target.delete(conn)
            validate_db = Database.lookup(conn, "test-db")
            self.assertIsNone(validate_db)
Esempio n. 4
0
    def test_create_simple_forests(self):
        """
        Test the following scenario:

        The database is given the names of two forests.
        It should then create the two named forests.

        The database is created on the host name of the first entry returned
        by ``Host.list_hosts``, looked up again by name, and removed in all
        cases.
        """
        conn = Connection(tc.hostname, HTTPDigestAuth(tc.admin, tc.password))

        hosts = Host.list_hosts(conn)
        db = Database("simple-forest-create-test-db", hosts[0].host_name())

        db.set_forests(["simple-forest-create-forest1", "simple-forest-create-forest2"])

        db.create(conn)

        # Keep the created handle in `db`; the lookup result gets its own
        # name so that cleanup still has something to call if lookup fails.
        found = Database.lookup("simple-forest-create-test-db", conn)
        try:
            self.assertIsNotNone(found)

            forests = found.forests()
            self.assertEqual(2, len(forests))

            self.assertIn("simple-forest-create-forest1", forests)
            self.assertIn("simple-forest-create-forest2", forests)

        finally:
            # Prefer the looked-up handle (the normal path); fall back to
            # the handle used for creation when the lookup returned None, so
            # the real failure is not masked by an AttributeError here.
            cleanup_target = found if found is not None else db
            cleanup_target.remove(conn)
Esempio n. 5
0
    def test_create_single_detailed_forest(self):
        """
        Test the following scenario:

        A forest object carrying a large data directory is handed to the
        database.  After the database is created, the forest is looked up by
        name and must report the same name and large data directory as the
        detailed forest.

        """
        forest_name = "detailed-forest-create-forest1"

        conn = Connection(tc.hostname, HTTPDigestAuth(tc.admin, tc.password))
        hosts = Host.list_hosts(conn)

        db = Database("detailed-forest-create-test-db", hosts[0].host_name())
        detailed = Forest(forest_name,
                          host=hosts[0].host_name(),
                          large_data_directory=ds.large_data_directory)

        # The forest object itself (not just its name) is given to the database.
        db.set_forests([detailed])
        db.create(conn)

        stored = Forest.lookup(forest_name, conn)

        try:
            self.assertEqual("detailed-forest-create-forest1", stored.name())
            self.assertEqual(ds.large_data_directory, stored.large_data_directory())
        finally:
            # Always drop the database, whatever the assertions did.
            db.remove(conn)
Esempio n. 6
0
    def test_simple_create(self):
        """
        TODO: The hostname should come from the server's hostname

        Test the basic create function.  Creates a database and then checks
        that it exists by looking it up on the server.  It then removes the
        database and checks that a further lookup returns nothing.

        :return: None
        """
        conn = Connection(tc.hostname, HTTPDigestAuth(tc.admin, tc.password))
        hosts = Host.list_hosts(conn)
        db = Database("test-db", hosts[0].host_name())

        db.create(conn)

        validate_db = Database.lookup("test-db", conn)
        try:
            self.assertIsNotNone(validate_db)
            self.assertEqual('test-db', validate_db.database_name())

        finally:
            # If the lookup returned None, the assertion above has already
            # failed; fall back to the creating handle so cleanup neither
            # crashes with AttributeError nor leaves the database behind.
            cleanup_target = validate_db if validate_db is not None else db
            cleanup_target.remove(conn)
            validate_db = Database.lookup("test-db", conn)
            self.assertIsNone(validate_db)
Esempio n. 7
0
    def test_create_single_detailed_forest(self):
        """
        Test the following scenario:

        A forest object is built and its name is handed to the database.
        After the database is created, a forest with that name must be
        retrievable by lookup.

        """
        forest_name = "detailed-forest-create-forest1"
        hosts = Host.list(self.connection)

        db = Database("detailed-forest-create-test-db", hosts[0])

        detailed = Forest(forest_name, host=hosts[0])
        detailed.set_large_data_directory("")

        # Only the forest's name is registered with the database.
        db.set_forest_names([detailed.forest_name()])
        db.create(self.connection)

        stored = Forest.lookup(self.connection, forest_name)

        try:
            assert "detailed-forest-create-forest1" == stored.forest_name()
            # NOTE: the large data directory is not asserted here; per the
            # original author it is, oddly, not present in the properties.
        finally:
            # Always drop the database, whatever the assertion did.
            db.delete(connection=self.connection)
    def test_create_single_detailed_forest(self):
        """
        Test the following scenario:

        A forest object is built and its name is handed to the database.
        After the database is created, a forest with that name must be
        retrievable by lookup.

        """
        forest_name = "detailed-forest-create-forest1"
        hosts = Host.list(self.connection)

        db = Database("detailed-forest-create-test-db", hosts[0])

        detailed = Forest(forest_name, host=hosts[0])
        detailed.set_large_data_directory("")

        # Only the forest's name is registered with the database.
        db.set_forest_names([detailed.forest_name()])
        db.create(self.connection)

        stored = Forest.lookup(self.connection, forest_name)

        try:
            assert "detailed-forest-create-forest1" == stored.forest_name()
            # NOTE: the large data directory is not asserted here; per the
            # original author it is, oddly, not present in the properties.
        finally:
            # Always drop the database, whatever the assertion did.
            db.delete(connection=self.connection)
    def test_create_simple_forests(self):
        """
        Test the following scenario:

        The database is given the names of two forests.
        It should then create the two named forests.

        The database is created on the first entry returned by
        ``Host.list``, looked up again by name, and deleted in all cases.
        """
        hosts = Host.list(self.connection)
        db = Database("simple-forest-create-test-db",
                      hosts[0],
                      connection=self.connection)

        db.set_forest_names(
            ["simple-forest-create-forest1", "simple-forest-create-forest2"])

        db.create()

        # Keep the created handle in `db`; the lookup result gets its own
        # name so that cleanup still has something to call if lookup fails.
        found = Database.lookup(self.connection, "simple-forest-create-test-db")
        try:
            assert found is not None
            forest_names = found.forest_names()
            assert 2 == len(forest_names)
            assert "simple-forest-create-forest1" in forest_names
            assert "simple-forest-create-forest2" in forest_names
        finally:
            # Prefer the looked-up handle (the normal path); fall back to
            # the handle used for creation when the lookup returned None, so
            # the real failure is not masked by an AttributeError here.
            cleanup_target = found if found is not None else db
            cleanup_target.delete(connection=self.connection)
Esempio n. 10
0
    def test_create_simple_forests(self):
        """
        Test the following scenario:

        The database is given the names of two forests.
        It should then create the two named forests.

        The database is created on the first entry returned by
        ``Host.list``, looked up again by name, and deleted in all cases.
        """
        hosts = Host.list(self.connection)
        db = Database("simple-forest-create-test-db", hosts[0], connection=self.connection)

        db.set_forest_names(["simple-forest-create-forest1", "simple-forest-create-forest2"])

        db.create()

        # Keep the created handle in `db`; the lookup result gets its own
        # name so that cleanup still has something to call if lookup fails.
        found = Database.lookup(self.connection, "simple-forest-create-test-db")
        try:
            assert found is not None
            forest_names = found.forest_names()
            assert 2 == len(forest_names)
            assert "simple-forest-create-forest1" in forest_names
            assert "simple-forest-create-forest2" in forest_names
        finally:
            # Prefer the looked-up handle (the normal path); fall back to
            # the handle used for creation when the lookup returned None, so
            # the real failure is not masked by an AttributeError here.
            cleanup_target = found if found is not None else db
            cleanup_target.delete(connection=self.connection)
Esempio n. 11
0
    def test_simple_create(self):
        """TODO: The hostname should come from the server's hostname

        Test the basic create function. Creates a database and then
        checks that it exists by looking it up on the server. It then
        destroys the database and checks that a further lookup returns
        nothing.

        :return: None

        """
        hosts = Host.list(self.connection)
        db = Database("test-db", hosts[0])

        db.create(self.connection)

        validate_db = Database.lookup(self.connection, "test-db")
        try:
            assert validate_db is not None
            assert "test-db" == validate_db.database_name()

        finally:
            # If the lookup returned None, the assertion above has already
            # failed; fall back to the creating handle so cleanup neither
            # crashes with AttributeError nor leaves the database behind.
            cleanup_target = validate_db if validate_db is not None else db
            cleanup_target.delete(connection=self.connection)
            validate_db = Database.lookup(self.connection, "test-db")
            assert validate_db is None
    def test_simple_create(self):
        """TODO: The hostname should come from the server's hostname

        Test the basic create function. Creates a database and then
        checks that it exists by looking it up on the server. It then
        destroys the database and checks that a further lookup returns
        nothing.

        :return: None

        """
        hosts = Host.list(self.connection)
        db = Database("test-db", hosts[0])

        db.create(self.connection)

        validate_db = Database.lookup(self.connection, "test-db")
        try:
            assert validate_db is not None
            assert 'test-db' == validate_db.database_name()

        finally:
            # If the lookup returned None, the assertion above has already
            # failed; fall back to the creating handle so cleanup neither
            # crashes with AttributeError nor leaves the database behind.
            cleanup_target = validate_db if validate_db is not None else db
            cleanup_target.delete(connection=self.connection)
            validate_db = Database.lookup(self.connection, "test-db")
            assert validate_db is None
Esempio n. 13
0
#
# Copyright 2015 MarkLogic Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0#
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from marklogic.models import Connection, Database, FieldRange, Field, FieldReference
from requests.auth import HTTPDigestAuth

# Example script: define a database with one field and a range index on
# that field, then create it on the server.
# NOTE: the server address and the admin credentials are hard-coded.
auth = HTTPDigestAuth("admin", "admin")
conn = Connection("192.168.57.141", auth)

# The field includes a single reference: "id" in the invoice namespace.
invoice_id_reference = FieldReference("http://foo.bar.com/invoice", "id")
field = Field("test-field", includes=[invoice_id_reference])

# Range index of scalar type "int" over the field defined above.
field_range_index = FieldRange("test-field", "int")

db = Database("range-field-test", "localhost.localdomain")
db.add_field(field)
db.add_index(field_range_index)

db.create(conn)
Esempio n. 14
0
__author__ = 'phoehne'

from requests.auth import HTTPDigestAuth
from marklogic.models import Database, Connection, Host, HttpServer, ElementRange, ElementAttributeRange

# Example script: create a content database and a modules database, add two
# range indexes to the content database, expose both through an HTTP server
# and load a sample document.
# NOTE: the server address and the admin credentials are hard-coded.
conn = Connection("192.168.57.141", HTTPDigestAuth("admin", "admin"))

# `hosts` is bound to the same host-name string as `server_hostname`;
# nothing below reads it.
server_hostname = hosts = Host.list_hosts(conn)[0].host_name()

# Content database: create it, then load a first document through the
# object that create() returns.
db = Database("test-one", server_hostname)
created_db = db.create(conn)
created_db.load_file(conn, "example_doc.json", "/test/document.json",
                     ["example", "collection"])

modules = Database("test-one-modules", server_hostname)
modules.create(conn)

# Re-read the content database from the server before adding the indexes.
db = Database.lookup("test-one", conn)

order_id_index = ElementRange("order-id", u'int')
db.add_index(order_id_index)

customer_id_index = ElementAttributeRange("customer",
                                          "id",
                                          scalar_type=u'int',
                                          element_namespace="http://foo.bar.com")
db.add_index(customer_id_index)
db.save(conn)

# HTTP server on port 8400 wired to the content and modules databases.
srvr = HttpServer("test-one-http", 8400)
content_name = db.config[u'database-name']
modules_name = modules.config[u'database-name']
srvr.set_content_database(content_name).set_modules_database(modules_name)
srvr.create(conn)

db.load_file(conn, "example_doc.json", "/example/file.json", ["test"])
Esempio n. 15
0
__author__ = 'phoehne'

from requests.auth import HTTPDigestAuth
from marklogic.models import Database, Connection, Host, HttpServer, ElementRange, ElementAttributeRange

# Example script: create a content database and a modules database, add two
# range indexes to the content database, expose both through an HTTP server,
# then load a single document and the contents of the "data" directory.
# NOTE: the server address and the admin credentials are hard-coded.
conn = Connection("192.168.57.141", HTTPDigestAuth("admin", "admin"))

# `hosts` is bound to the same host-name string as `server_hostname`;
# nothing below reads it.
server_hostname = hosts = Host.list_hosts(conn)[0].host_name()

# Content database: create it, then load a first document through the
# object that create() returns.
db = Database("test-one", server_hostname)
created_db = db.create(conn)
created_db.load_file(conn, "example_doc.json", "/test/document.json", ["example", "collection"])

modules = Database("test-one-modules", server_hostname)
modules.create(conn)

# Re-read the content database from the server before adding the indexes.
db = Database.lookup("test-one", conn)

order_id_index = ElementRange("order-id", u'int')
db.add_index(order_id_index)

customer_id_index = ElementAttributeRange("customer", "id", scalar_type=u'int', element_namespace="http://foo.bar.com")
db.add_index(customer_id_index)
db.save(conn)

# HTTP server on port 8400 wired to the content and modules databases.
srvr = HttpServer("test-one-http", 8400)
content_name = db.config[u'database-name']
modules_name = modules.config[u'database-name']
srvr.set_content_database(content_name).set_modules_database(modules_name)
srvr.create(conn)

# Single file first, then the two directory-loading variants.
db.load_file(conn, "example_doc.json", "/example/file.json", ["test"])
db.load_directory_files(conn, "data", "/test/data/", ["test2"])
db.load_directory(conn, "data", collections=["this", "that"])
class ProcessingPipeline(object):
    """Item pipeline that writes crawl results to disk and optionally to MarkLogic.

    The shape (``process_item(item, spider)``, ``DropItem``, ``settings``)
    follows the Scrapy item-pipeline convention.  Each item is serialized to
    XML or JSON, written to ``<OUT_FOLDER>/<urlSHA1>.<format>`` and, when
    ``MARKLOGIC_UPLOAD`` is enabled, loaded into the configured database.
    """

    def __init__(self):
        """Read configuration from ``settings`` and prepare the outputs.

        Creates the output folder if it is missing.  When MarkLogic upload
        is enabled, opens a connection and creates the target database on
        the first listed host if a lookup does not find it.
        """
        self.name = settings['BOT_NAME']
        self.OUTFolder = settings['OUT_FOLDER']
        log.msg("---{0} OUT_FOLDER={1}".format(self.name, self.OUTFolder), level=log.DEBUG)
        if not os.path.exists(self.OUTFolder):
            os.makedirs(self.OUTFolder)

        self.fmt = settings['MARKLOGIC_FORMAT']
        self.DO_MarkLogic = settings['MARKLOGIC_UPLOAD']
        if self.DO_MarkLogic:
            # These attributes exist only when uploading is enabled; they
            # are read only behind the same DO_MarkLogic check.
            self.KeepOrig = settings['MARKLOGIC_KEEP_ORIGINAL_FILE']
            self.ML_Host = settings['MARKLOGIC_HOSTNAME']
            log.msg("---{0} MARKLOGIC_HOSTNAME={1}".format(self.name, self.ML_Host), level=log.DEBUG)
            self.admin = settings['MARKLOGIC_ADMIN']
            self.password = settings['MARKLOGIC_PASSWORD']
            self.db_name = settings['MARKLOGIC_DB']
            log.msg("---{0}: MarkLogic Database initialization: {1}".format(self.name, self.db_name), level=log.DEBUG)
            self.conn = Connection(self.ML_Host, HTTPDigestAuth(self.admin, self.password))
            self.db = Database.lookup(self.conn, self.db_name)
            if self.db is None:
                log.msg("---{0}: MarkLogic Database {1} does not exist, creating on host: {2}.".format(self.name, self.db_name, self.ML_Host), level=log.DEBUG)
                hosts = Host.list(self.conn)
                self.db = Database(self.db_name, hosts[0])
                self.db.create(self.conn)
            else:
                log.msg("---{0}: MarkLogic Database {1} already exists on host: {2}.".format(self.name, self.db_name, self.ML_Host), level=log.DEBUG)

    def process_item(self, item, spider):
        """Validate, persist and optionally upload one crawled item.

        :param item: mapping of field name to value; must contain ``urlSHA1``
            plus every field the chosen serializer reads.
        :param spider: the spider that produced the item (used for logging).
        :return: the unchanged ``item``.
        :raises DropItem: if any field value is empty, or if the configured
            output format is neither ``xml`` nor ``json``.
        """
        # Iterating the item yields field names, so the value has to be
        # looked up explicitly; testing the name alone never rejects anything.
        for field in item:
            if not item[field]:
                raise DropItem("Missing {0}!".format(field))

        file_path = os.path.join(self.OUTFolder, item['urlSHA1']) + '.' + self.fmt
        if os.path.isfile(file_path):
            # A file for this URL hash already exists: treat as duplicate.
            log.msg("---{0} Crawl duplicate url results found, ignoring!".format(self.name), level=log.DEBUG, spider=spider)
            return item

        # Serialize before opening the file so that a rejected format does
        # not leave an empty file behind (which would later look like a
        # duplicate).
        out_str = self._serialize(item)
        with open(file_path, 'w') as f:
            f.write(out_str)
        log.msg("---{0} Crawl Results were written to {1} file!".format(self.name, self.fmt), level=log.DEBUG, spider=spider)

        if self.DO_MarkLogic:
            ML_DocURI = "/{0}CrawlingResults/{1}.{2}".format(self.name, item['urlSHA1'], self.fmt)
            log.msg("---{0} Crawl MarkLogic Pipeline. Trying to ingest {1}".format(self.name, ML_DocURI), level=log.DEBUG, spider=spider)
            coll_name = "{0}CrawlingResults".format(self.name)
            self.db.load_file(self.conn, file_path, ML_DocURI, [coll_name])
            log.msg("---{0} Crawl Results were written to MarkLogic database!".format(self.name), level=log.DEBUG, spider=spider)
            if not self.KeepOrig:
                os.remove(file_path)

        return item

    def _serialize(self, item):
        """Render *item* in the configured format (case-insensitive xml/json).

        :raises DropItem: if ``self.fmt`` is not a supported format.
        """
        fmt = self.fmt.lower()
        if fmt == 'xml':
            return self.Item2XML(item)
        if fmt == 'json':
            return self.Item2Json(item)
        # Both placeholders need a value; supplying only one raised
        # IndexError instead of the intended DropItem.
        raise DropItem("---{0}: invalid MarkLogic data format {1}!".format(self.name, self.fmt))

    def Item2XML(self, item):
        """Return *item* as a pretty-printed XML string.

        The root element is ``<BOT_NAME>CrawlingResult`` and holds one child
        per crawl field, in a fixed order.

        :raises KeyError: if *item* lacks one of the expected fields.
        """
        root = ET.Element("{0}CrawlingResult".format(self.name))
        # Child order is part of the output; keep it fixed.
        for tag in ('uuid', 'crawledTime', 'url', 'title', 'fullText',
                    'titleSHA1', 'urlSHA1', 'fullTextSHA1'):
            ET.SubElement(root, tag).text = item[tag]

        # Round-trip through minidom purely to obtain indented output.
        raw = ET.tostring(root, 'utf-8')
        out_str = minidom.parseString(raw).toprettyxml(indent="  ")
        if not isinstance(out_str, str):
            out_str = out_str.decode("UTF-8")
        return out_str

    def Item2Json(self, item):
        """Return *item* serialized as a JSON object string."""
        return json.dumps(dict(item))