Beispiel #1
0
def setup_tables(filename, table_query, insert_query):
    """Create the test database/table and populate it from a CSV file.

    :param filename: path to a ';'-delimited CSV file whose first row is a
        header; the last two columns of every row (last updated / harvest
        date) are discarded.
    :param table_query: CREATE TABLE statement for the test table.
    :param insert_query: parameterized INSERT statement matching the
        remaining CSV columns.
    """
    # load testconfig
    credentials = dict(get_config("MARIADBX"))
    # set up the database and point the connection at it
    connector = MariaDb(credentials)
    connector.create_db(TESTDB)
    connector.connector.database = TESTDB
    connector.createTable("test dblp table", table_query)

    # import records from csv
    with open(filename, newline='', encoding='utf-8') as csvfile:
        reader = csv.reader(csvfile, delimiter=';', quotechar='"')
        # skip the header line
        next(reader, None)
        for record in reader:
            # drop the trailing "last updated" and "harvest date" columns
            del record[-2:]
            # empty CSV fields become SQL NULLs
            values = tuple(field if field != "" else None for field in record)
            connector.execute_ex(insert_query, values)
    connector.close_connection()
def setup():
    """Create the analysis database and the `Authors` table.

    Builds DB_NAME, creates the `Authors` table with the configured storage
    engine, and adds one FULLTEXT index per name column.  Index creation
    failures (typically "index already exists" on a re-run) are reported
    but not fatal.
    """
    NORMAL_TITLES = ("CREATE TABLE `Authors` ("
                     " `Id` INT NOT NULL,"
                     " `main_name` TEXT,"
                     " `normal_name` TEXT,"
                     " `metaphone_name` TEXT,"
                     "  PRIMARY KEY (`Id`)"
                     ") ENGINE= {} CHARSET=utf8mb4")
    connector = MariaDb()
    storage_engine = get_config("MISC")["storage_engine"]
    connector.create_database(DB_NAME)
    connector.createTable("dvfds", NORMAL_TITLES.format(storage_engine))
    try:
        connector.execute_ex(
            "CREATE FULLTEXT INDEX main_name_idx  ON Authors (main_name)", ())
        connector.execute_ex(
            "CREATE FULLTEXT INDEX normal_name_idx  ON Authors (normal_name)",
            ())
        connector.execute_ex(
            "CREATE FULLTEXT INDEX metaphone_name_idx  ON Authors (metaphone_name)",
            ())
    except Exception:
        # a bare except would also swallow KeyboardInterrupt/SystemExit;
        # Exception is broad enough for the expected "duplicate index" error
        print("Index already exists")

    connector.close_connection()
Beispiel #3
0
 def test_create_table(self):
     """createTable should succeed once the database exists."""
     connector = MariaDb()
     connector.create_database(DB_NAME)
     ddl = ("CREATE TABLE `muhz` ("
            "  `dblp_key` varchar(100) NOT NULL,"
            "  PRIMARY KEY (`dblp_key`)"
            ") ENGINE={} CHARSET=utf8mb4")
     connector.createTable("test", ddl)
     connector.close_connection()
Beispiel #4
0
def setup(TABLE_NAME):
    """Create the database and the given table.

    :param TABLE_NAME: name of the table to create; it is substituted into
        the TITLES template together with the configured storage engine.
    :return: None
    """
    db = MariaDb()
    engine = get_config("MISC")["storage_engine"]
    # database first, then the table inside it
    db.create_database(DB_NAME)
    db.createTable(TABLE_NAME, TITLES.format(TABLE_NAME, engine))
    db.close_connection()
Beispiel #5
0
 def test_execute_ex(self):
     """execute_ex should return the AUTO_INCREMENT id of the inserted row.

     Fix: the parameter sequence must be a real tuple.  ('mi') is just the
     string 'mi' — the connector would treat it as a sequence of two
     characters instead of one parameter; ('mi',) is a one-element tuple.
     """
     x = MariaDb()
     x.create_database(DB_NAME)
     x.createTable("test", ("CREATE TABLE `muhz` ("
                            "  `ID` int NOT NULL AUTO_INCREMENT,"
                            "  `dblp_key` varchar(100) NOT NULL,"
                            "  PRIMARY KEY (`ID`)"
                            ") ENGINE={} CHARSET=utf8mb4"))
     idx = x.execute_ex("INSERT INTO muhz (dblp_key) VALUES (%s)", ('mi',))
     self.assertEqual(idx, 1)
     x.close_connection()
Beispiel #6
0
class TestIngesterMulti2(TransactionTestCase):
    """Integration test: ingest the same publication from arxiv first and
    then dblp, and verify both records merge into a single cluster and
    publication with a correct difference tree."""

    fixtures = [os.path.join(ingester_path, "fixtures", "initial_data.json")]

    @classmethod
    def setUpClass(cls):
        # FULLTEXT indexes are required by the ingester's cluster/author
        # name matching; created once for the whole test class
        connector = MariaDb(db="test_storage")
        connector.execute_ex((
            "CREATE FULLTEXT INDEX cluster_ft_idx  ON test_storage.ingester_cluster (name)"
        ), ())
        connector.execute_ex(
            "CREATE FULLTEXT INDEX authors_model_ft_idx ON test_storage.ingester_authors_model (block_name)",
            ())
        connector.close_connection()

    @classmethod
    def tearDownClass(cls):
        # drop the indexes created in setUpClass
        connector = MariaDb(db="test_storage")
        connector.execute_ex(
            "ALTER TABLE test_storage.ingester_cluster DROP INDEX cluster_ft_idx",
            ())
        connector.execute_ex(
            "ALTER TABLE test_storage.ingester_authors_model DROP INDEX authors_model_ft_idx",
            ())
        connector.close_connection()

    def setUp(self):
        # create the harvester source tables and insert one record per
        # source, both describing the same publication
        self.connector = MariaDb(db="test_storage")
        storage_engine = get_config("MISC")["storage_engine"]
        # create tables for both sources arxiv and dblp
        self.connector.createTable("dblparticle",
                                   DBLP_ARTICLE.format(storage_engine))
        self.connector.createTable("arxivarticle",
                                   ARXIV_ARTICLE.format(storage_engine))
        # insert data
        dblp_article = (
            "dblpkey",  # key
            "2011-11-11",  # mdate
            "Andreas Anders;Bertha Theresa Balte;",  # authors
            "The Ultimate Title",  # title
            "10-14",  # pages
            datetime.date(2005, 1, 1),  # pub year
            "2",  # volume
            "journal of stuff",  # journal
            "3",  # journal number
            "http://google.de",  # doi
            "http://unused.com",  # unused url
            None,  # cite
            None,  # crossref
            None,  # booktitle
            None,  # school
            None,  # address
            None,  # publisher
            None,  # isbn
            None,  # series
            "article"  # type
        )

        arxiv_article = (
            "arxivkey",  # identifier
            "2007-07-07",  # created
            "2008-08-08",  # updated
            "Andreas Anders;Bertha Theresa Balte;Carim Chass Jr.;",  # authors
            "The Ultimate Title!",  # title
            None,  # mscclass
            None,  # acmclass
            None,  # reportno
            None,  # journalref
            None,  # comments
            "this is a test",  # description
            "category",  # categories
            "http://google.com",  # doi
            "2009-09-09"  # mdate
        )

        self.connector.execute_ex(ADD_DBLP_ARTICLE, dblp_article)
        self.connector.execute_ex(ADD_ARXIV, arxiv_article)

    def tearDown(self):
        # NOTE(review): execute_ex is called without a params argument here,
        # unlike elsewhere — presumably it has a default; confirm
        self.connector.execute_ex("DROP TABLE test_storage.arxiv_articles")
        self.connector.execute_ex("DROP TABLE test_storage.dblp_article")
        self.connector.close_connection()

    def test_success_reversed(self):
        """Ingest arxiv before dblp; the merge result must be order-independent."""
        dblpingester = DblpIngester("dblp.ingester",
                                    harvesterdb="test_storage")
        arxivingester = ArxivIngester("arxiv.ingester",
                                      harvester_db="test_storage")

        # arxiv first then dblp
        result2 = ingest_data(arxivingester)
        self.assertEqual(result2, 1)
        result = ingest_data(dblpingester)
        self.assertEqual(result, 1)

        # check all tables
        self.assertEqual(cluster.objects.count(), 1)
        self.assertEqual(publication.objects.count(), 1)
        self.assertEqual(local_url.objects.count(), 3)
        self.assertEqual(global_url.objects.count(), 4)
        self.assertEqual(limbo_authors.objects.count(), 0)
        self.assertEqual(limbo_pub.objects.count(), 0)
        self.assertEqual(pub_medium.objects.count(), 1)
        # check local url
        dblp_url = local_url.objects.get(id=3)
        pub_url = local_url.objects.get(id=2)
        arxiv_url = local_url.objects.get(id=1)
        self.assertEqual(dblp_url.test(), [
            3, "dblpkey", 1,
            publication_type.objects.get(name="article").id, None
        ])
        self.assertEqual(arxiv_url.test(), [
            4, "arxivkey", None,
            publication_type.objects.get(name="misc").id, None
        ])
        self.assertEqual(pub_url.test(), [
            1, "TODO PLATZHALTER", 1,
            publication_type.objects.get(name="misc").id, None
        ])
        # check authors
        self.assertEqual(authors_model.objects.count(), 3)
        self.assertEqual(author_aliases.objects.count(), 3)
        self.assertEqual(author_alias_source.objects.count(), 5)
        # publication authors
        self.assertEqual(publication_author.objects.count(), 8)
        # check publication: merged fields come from whichever source won
        publ = publication.objects.first()
        self.assertEqual(publ.title, "The Ultimate Title!")  # from Arxiv
        self.assertEqual(publ.pages, "10-14")  # DBLP
        self.assertEqual(publ.note, None)
        self.assertEqual(publ.doi, "http://google.com")  # Arxiv
        self.assertEqual(publ.abstract, "this is a test")  # arxiv
        self.assertEqual(publ.copyright, None)
        self.assertEqual(publ.date_added, None)
        self.assertEqual(publ.date_published, datetime.date(2007, 1,
                                                            1))  # DBLP
        self.assertEqual(publ.volume, "2")  # DBLP
        self.assertEqual(publ.number, "3")  # DBLP
        # check diff tree: conflicting values from both sources are kept
        # with per-source bitvectors
        diff = deserialize_diff_store(publ.differences)
        self.assertEqual(diff["url_id"], [1, 3])
        self.assertEqual(diff["doi"], [{
            "bitvector": 1,
            "votes": 0,
            "value": "http://google.com"
        }, {
            "bitvector": 2,
            "votes": 0,
            "value": "http://google.de"
        }])
        self.assertEqual(diff["copyright"], [])
        self.assertEqual(diff["type_ids"], [{
            "bitvector": 1,
            "votes": 0,
            "value": 2
        }, {
            "bitvector": 2,
            "votes": 0,
            "value": 1
        }])
        self.assertEqual(diff["pages"], [{
            "bitvector": 2,
            "votes": 0,
            "value": "10-14"
        }])

        self.assertEqual(OpenReferences.objects.first().test(),
                         [1, 'arxivkey', None])
Beispiel #7
0
class TestIngester(TransactionTestCase):
    """Integration tests for the dblp ingester: success paths, limits,
    complete records, and the limbo handling for ambiguous clusters,
    ambiguous publications and author aliases."""

    fixtures = [os.path.join(ingester_path, "fixtures", "initial_data.json")]

    @classmethod
    def setUpClass(cls):
        # FULLTEXT indexes required by the ingester's matching queries
        connector = MariaDb(db="test_storage")
        connector.execute_ex("CREATE FULLTEXT INDEX cluster_ft_idx  ON test_storage.ingester_cluster (name)", ())
        connector.execute_ex(
            "CREATE FULLTEXT INDEX authors_model_ft_idx ON test_storage.ingester_authors_model (block_name)", ())
        connector.close_connection()

    @classmethod
    def tearDownClass(cls):
        # drop the indexes created in setUpClass
        connector = MariaDb(db="test_storage")
        connector.execute_ex("ALTER TABLE test_storage.ingester_cluster DROP INDEX cluster_ft_idx", ())
        connector.execute_ex("ALTER TABLE test_storage.ingester_authors_model DROP INDEX authors_model_ft_idx", ())
        connector.close_connection()

    def setUp(self):
        # create the dblp source table; each test fills it via setup_tables
        self.connector = MariaDb(db="test_storage")
        storage_engine = get_config("MISC")["storage_engine"]
        self.connector.createTable("dblparticle", DBLP_ARTICLE.format(storage_engine))

    def tearDown(self):
        # NOTE(review): execute_ex called without a params argument here,
        # unlike elsewhere — presumably it has a default; confirm
        self.connector.execute_ex("DROP TABLE test_storage.dblp_article")
        self.connector.close_connection()

    def test_invalid_ingester(self):
        """Passing a non-ingester object to ingest_data must raise."""
        setup_tables(os.path.join(test_path, "dblp_test1.csv"), DBLP_ARTICLE, ADD_DBLP_ARTICLE)
        self.assertRaises(IIngester_Exception, ingest_data, datetime.datetime(1990,1,1,1,1,1))

    def test_success(self):
        """Full ingest of two records: verify every produced table row."""
        setup_tables(os.path.join(test_path, "dblp_test1.csv"), DBLP_ARTICLE, ADD_DBLP_ARTICLE)
        ingester = DblpIngester("dblp.ingester", harvesterdb="test_storage")
        self.assertEqual(ingester.get_global_url().id, 3)
        result = ingest_data(ingester)
        self.assertEqual(result, 2)
        # check local url
        self.assertEqual(local_url.objects.get(id=1).test(), [3, 'journals/acta/AkyildizB89', 1, 1,None])
        self.assertEqual(local_url.objects.get(id=2).test(), [1, 'TODO PLATZHALTER', 1, 1,None])
        self.assertEqual(local_url.objects.get(id=3).test(), [3, 'journals/acta/VoglerS014', 1, 1,None])
        self.assertEqual(local_url.objects.get(id=4).test(), [1, 'TODO PLATZHALTER', 1, 1,None])
        # check authors_model
        self.assertEqual(authors_model.objects.get(id=1).test(),["Ian F. Akyildiz", "ian f akyildiz"])
        self.assertEqual(authors_model.objects.get(id=2).test(), ["Horst von Brand", "horst von brand"])
        self.assertEqual(authors_model.objects.get(id=3).test(), ["Walter Vogler", "walter vogler"])
        self.assertEqual(authors_model.objects.get(id=4).test(), ["Christian Stahl", "christian stahl"])
        self.assertEqual(authors_model.objects.get(id=5).test(), ["Richard Müller", "richard muller"])
        # check author alias (id=5 has two aliases: with and without the
        # dblp "0001" disambiguation suffix)
        self.assertEqual(author_aliases.objects.get(id=1).test(), [1, "Ian F. Akyildiz"])
        self.assertEqual(author_aliases.objects.get(id=2).test(), [2, "Horst von Brand"])
        self.assertEqual(author_aliases.objects.get(id=3).test(), [3, "Walter Vogler"])
        self.assertEqual(author_aliases.objects.get(id=4).test(), [4, "Christian Stahl"])
        self.assertEqual(author_aliases.objects.get(id=5).test(), [5, "Richard Müller 0001"])
        self.assertEqual(author_aliases.objects.get(id=6).test(), [5, "Richard Müller"])
        # cluster
        self.assertEqual(cluster.objects.get(id=1).name, "bla bla bla")
        self.assertEqual(cluster.objects.get(id=2).name, "kam kim kum")
        # author alias source
        self.assertEqual(author_alias_source.objects.get(id=1).test(), [1, 1])
        self.assertEqual(author_alias_source.objects.get(id=2).test(), [2, 1])
        self.assertEqual(author_alias_source.objects.get(id=3).test(), [3, 3])
        self.assertEqual(author_alias_source.objects.get(id=4).test(), [4, 3])
        self.assertEqual(author_alias_source.objects.get(id=5).test(), [5, 3])
        self.assertEqual(author_alias_source.objects.get(id=6).test(), [6, 3])
        # publication authors
        self.assertEqual(publication_author.objects.get(id=1).test(), [1, 1, 0])
        self.assertEqual(publication_author.objects.get(id=2).test(), [1, 2, 1])
        self.assertEqual(publication_author.objects.get(id=3).test(), [2, 1, 0])
        self.assertEqual(publication_author.objects.get(id=4).test(), [2, 2, 1])
        self.assertEqual(publication_author.objects.get(id=5).test(), [3, 3, 0])
        self.assertEqual(publication_author.objects.get(id=6).test(), [3, 4, 1])
        self.assertEqual(publication_author.objects.get(id=7).test(), [3, 5, 2])
        self.assertEqual(publication_author.objects.get(id=8).test(), [4, 3, 0])
        self.assertEqual(publication_author.objects.get(id=9).test(), [4, 4, 1])
        self.assertEqual(publication_author.objects.get(id=10).test(), [4, 5, 2])

        # limbo
        self.assertEqual(limbo_authors.objects.count(),0)
        self.assertEqual(limbo_pub.objects.count(),0)

        # publication
        self.assertEqual(publication.objects.get(id=1).test(), [1, "Bla Bla Bla"])
        self.assertEqual(publication.objects.get(id=2).test(), [2, "Kam? Kim! Kum."])
        # check if last harvested is set
        tmp = list(get_table_data("dblp_article", null_dates=False))
        self.assertEqual(tmp[0][-1].strftime("%Y-%m-%d"), datetime.datetime.now().strftime("%Y-%m-%d"))

        # check open references
        self.assertEqual(OpenReferences.objects.count(), 0)

    def test_success_limit(self):
        """set_limit(1) must cap the ingest at one record."""
        setup_tables(os.path.join(test_path, "dblp_test1.csv"), DBLP_ARTICLE, ADD_DBLP_ARTICLE)
        ingester = DblpIngester("dblp.ingester", harvesterdb="test_storage")
        ingester.set_limit(1)
        result = ingest_data(ingester)
        self.assertEqual(result, 1)
        # check open references
        self.assertEqual(OpenReferences.objects.count(), 0)

    def test_complete_publication(self):
        # for this test a dataset with ALL ROWS filled, will be created to check if all values are
        # successfully transferred
        setup_tables(os.path.join(test_path, "dblp_test2.csv"), DBLP_ARTICLE, ADD_DBLP_ARTICLE)
        ingester = DblpIngester("dblp.ingester", harvesterdb="test_storage")
        ingest_data(ingester)
        publ = publication.objects.first()
        self.assertEqual(publ.title,"title")
        self.assertEqual(publ.pages, "1-5")
        self.assertEqual(publ.doi, "doi")
        self.assertEqual(publ.abstract, None)
        self.assertEqual(publ.copyright, None)
        self.assertEqual(publ.volume, "1")
        self.assertEqual(publ.number, "2")
        self.assertEqual(publ.note, None)
        self.assertEqual(publ.date_added, None)
        self.assertEqual(publ.date_published, datetime.date(1990,1,1))
        # check open references
        self.assertEqual(OpenReferences.objects.count(), 0)

    def test_limbo_multi_cluster(self):
        """Two clusters with the same name: the record must go to limbo."""
        setup_tables(os.path.join(test_path, "dblp_test2.csv"), DBLP_ARTICLE, ADD_DBLP_ARTICLE)
        cluster.objects.bulk_create([
            cluster(id=1, name="title"),
            cluster(id=2, name="title"),
        ])
        ingester = DblpIngester("dblp.ingester", harvesterdb="test_storage")
        ingest_data(ingester)
        self.assertEqual(limbo_authors.objects.get(id=1).test(), [1, 'None', "An Author", 0])
        self.assertEqual(limbo_authors.objects.get(id=2).test(), [1, 'None', "Another Author", 1])
        self.assertEqual(local_url.objects.count(),0)
        limbo = limbo_pub.objects.get(id=1).test_extended()
        print(limbo)
        compare = ['Reason.AMB_CLUSTER','key',"title","1-5",None,"doi",None,None,
                                None,datetime.date(1990,1,1),"1","2","series",
                                None,"publisher",None,"school","address",
                                "isbn",None,"booktitle","journal"]
        self.assertEqual(limbo,compare)

    def test_limbo_multi_pubs(self):
        """Two existing publications in the matching cluster: record goes to limbo."""
        setup_tables(os.path.join(test_path, "dblp_test2.csv"), DBLP_ARTICLE, ADD_DBLP_ARTICLE)
        cl = cluster.objects.create(id=1, name="title")
        gurl = global_url.objects.create(id=5,domain ="http://dummy.de", url="http://dummy.de")
        lurl = local_url.objects.create(id=1,url="jlkjöl", global_url=gurl)
        publication.objects.bulk_create([
            publication(local_url=lurl,cluster=cl,title="Title"),
            publication(local_url=lurl, cluster=cl, title="Title")
        ])
        ingester = DblpIngester("dblp.ingester", harvesterdb="test_storage")
        ingest_data(ingester)
        limbo = limbo_pub.objects.get(id=1).test_extended()
        self.assertEqual(limbo[0],'Reason.AMB_PUB')
        self.assertEqual(limbo_authors.objects.get(id=1).test(), [1, 'None', "An Author", 0])
        self.assertEqual(limbo_authors.objects.get(id=2).test(), [1, 'None', "Another Author", 1])
        self.assertEqual(local_url.objects.count(),1)
        # check open references
        self.assertEqual(OpenReferences.objects.count(), 0)

    def test_limbo_alias(self):
        """Alias handling: ingest must not push alias conflicts into limbo."""
        setup_tables(os.path.join(test_path, "dblp_test3.csv"), DBLP_ARTICLE, ADD_DBLP_ARTICLE)
        ingester = DblpIngester("dblp.ingester", harvesterdb="test_storage")
        ingest_data(ingester)

        self.assertEqual(limbo_pub.objects.count(), 0)
        self.assertEqual(cluster.objects.count(), 3)
        self.assertEqual(authors_model.objects.count(), 5)
        # check open references
        self.assertEqual(OpenReferences.objects.count(), 0)

    def test_set_last_harvested(self):
        """A second ingest run must only pick up records not yet harvested."""
        setup_tables(os.path.join(test_path, "dblp_test3.csv"), DBLP_ARTICLE, ADD_DBLP_ARTICLE)
        ingester = DblpIngester("dblp.ingester", harvesterdb="test_storage")
        ingester.set_limit(1)
        result1 = ingest_data(ingester)
        self.assertEqual(result1, 1)
        ingester.set_limit(3)
        result2 = ingest_data(ingester)
        self.assertEqual(result2, 2)
        # check open references
        self.assertEqual(OpenReferences.objects.count(), 0)
Beispiel #8
0
from oai.queries import OAI_DATASET
from mysqlWrapper.mariadb import MariaDb

# target database name for the OAI-PMH harvester
DB_NAME = 'oaimph'
# connection credentials for the local MariaDB server (placeholder values)
credentials = {
    'user': '******',
    'password': '******',
    'host': '127.0.0.1',
}

try:
    database = MariaDb(credentials)
except Exception as err:
    # could not connect: report the error and skip the setup entirely
    print(err)
else:
    # connection succeeded: create the database and the OAI dataset table
    database.create_db(DB_NAME)
    database.createTable("oaimph", OAI_DATASET)
    database.close_connection()



Beispiel #9
0
def setup():
    """Create the statistics database and all result tables.

    Creates DB_NAME with the configured storage engine, one table per
    statistic, and a FULLTEXT index on the normalized titles.  A failure
    while creating the index (typically because it already exists from a
    previous run) is reported but not fatal.

    :return: None
    """
    connector = MariaDb()

    storage_engine = get_config("MISC")["storage_engine"]

    # create database
    connector.create_database(DB_NAME)
    connector.createTable("dates", DATE_TABLE.format(storage_engine))
    connector.createTable("publication year", PUB_YEAR_TABLE.format(storage_engine))
    connector.createTable("popular_words", POPULAR.format(storage_engine))
    connector.createTable("title_length", TITLE_LENGTH.format(storage_engine))
    connector.createTable("popular names", N_POPULAR.format(storage_engine))
    connector.createTable("number authors", NUM_AUTHOR.format(storage_engine))
    connector.createTable("Authors", AUTHORS.format(storage_engine))
    connector.createTable("Normal Titles", NORMAL_TITLES.format(storage_engine))
    connector.createTable("Career",CAREER.format(storage_engine))
    # create index
    try:
        connector.execute_ex("CREATE FULLTEXT INDEX title_idx  ON normal_title (titles)", ())
    except Exception:
        # a bare except would also swallow KeyboardInterrupt/SystemExit;
        # Exception is broad enough for the expected "duplicate index" error
        print("Index already exists")

    connector.close_connection()