Example 1
def create_clusters():
    gurl = global_url.objects.get(id=1)
    open_ref_list = {}
    with open(os.path.join(file_path,'ref.log'),encoding="utf8") as f:
        for line in f:
            if 'SOURCE' in line:
                regex = r'SOURCE (.*?):'
            else:
                regex = r'REF (.*?):'
            try:
                id_match = re.findall(regex, line)[0]
                title = line[line.index(":")+1:]
            except (IndexError, ValueError):  # no id match or no ':' in the line
                continue

            # normalize title and transform 8 byte hex number to int
            normal_title = normalize_title(title)
            normal_id = int(id_match, 16)
            # insert into cluster
            #cluster.objects.get_or_create(id= normal_id,defaults={"name":normal_title})
            # create local urls for matching titles

            if id_match in match_list:
                lurl, _ = local_url.objects.get_or_create(id=normal_id, global_url=gurl, url=id_match)
                # create open reference
                opref, _ = OpenReferences.objects.get_or_create(id=normal_id, ingester_key=lurl,
                                                                source_table=666, source_key=id_match)
                open_ref_list[id_match] = opref

    # creates single references directly from pdf:
    logger = logging.getLogger("PDFDownloader")
    logger.setLevel(logging.INFO)
    # create the logging file handler
    log_file = os.path.join(file_path, "pdf_downloader.log")
    fh = logging.FileHandler(log_file)
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    fh.setFormatter(formatter)
    # add handler to logger object
    logger.addHandler(fh)
    # run actual task
    tmp = get_config("FOLDERS")["tmp"]
    grobid = get_config("WEAVER")["grobid"]
    limit = 20  # fixed limit for this run; the WEAVER pdf_limit config is not consulted here
    obj = PdfDownloader(tmp, grobid, logger=logger, limit=limit)
    for element in match_list:
        pdf_path = os.path.join("C:\\Users\\anhtu\\Google Drive\\Informatik 2016\\Related Work\\evaluation",
                                "{}.pdf".format(element))
        obj.parse_references(pdf_path, open_ref_list[element])
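
The SOURCE/REF parsing above is easiest to see on concrete input. A minimal, self-contained sketch; the sample log lines are invented, only the regexes and the hex-to-int conversion come from create_clusters():

import re

for line in ("SOURCE 0000000a: Some Paper Title", "REF 000000ff: Another Title"):
    regex = r'SOURCE (.*?):' if 'SOURCE' in line else r'REF (.*?):'
    id_match = re.findall(regex, line)[0]
    title = line[line.index(":") + 1:].strip()
    print(int(id_match, 16), title)  # -> 10 Some Paper Title / 255 Another Title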
Example 2
def setup():
    AUTHORS_TABLE = ("CREATE TABLE `Authors` ("
                     " `Id` INT NOT NULL,"
                     " `main_name` TEXT,"
                     " `normal_name` TEXT,"
                     " `metaphone_name` TEXT,"
                     "  PRIMARY KEY (`Id`)"
                     ") ENGINE= {} CHARSET=utf8mb4")
    connector = MariaDb()
    storage_engine = get_config("MISC")["storage_engine"]
    connector.create_database(DB_NAME)
    connector.createTable("dvfds", NORMAL_TITLES.format(storage_engine))
    try:
        connector.execute_ex(
            "CREATE FULLTEXT INDEX main_name_idx  ON Authors (main_name)", ())
        connector.execute_ex(
            "CREATE FULLTEXT INDEX normal_name_idx  ON Authors (normal_name)",
            ())
        connector.execute_ex(
            "CREATE FULLTEXT INDEX metaphone_name_idx  ON Authors (metaphone_name)",
            ())
    except Exception:
        print("Index already exists")

    connector.close_connection()
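
Instead of catching a generic exception and assuming the index already exists, the check could be made explicit against information_schema. A sketch, assuming connector.cursor is a live pymysql cursor; index_exists is a hypothetical helper, not part of the original module:

def index_exists(cursor, table, index_name):
    # MySQL/MariaDB record index metadata in information_schema.STATISTICS
    cursor.execute(
        "SELECT COUNT(*) FROM information_schema.STATISTICS "
        "WHERE table_schema = DATABASE() AND table_name = %s AND index_name = %s",
        (table, index_name))
    return cursor.fetchone()[0] > 0

setup() could then call this before each CREATE FULLTEXT INDEX instead of relying on the exception.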
Example 3
def setup_tables(filename, table_query, insert_query):
    # load testconfig
    credentials = dict(get_config("MARIADBX"))
    # setup database
    connector = MariaDb(credentials)
    connector.create_db(TESTDB)
    connector.connector.database = TESTDB
    connector.createTable("test dblp table", table_query)

    # setup test ingester database
    # setup_database(TESTDB)
    # import records from csv
    with open(filename, newline='', encoding='utf-8') as csvfile:
        spamreader = csv.reader(csvfile, delimiter=';', quotechar='"')
        do_once = False
        for row in spamreader:
            # remove last updated and harvest date
            del row[-2:]
            # skip the header line
            if do_once:
                tup = tuple(map(lambda x: x if x != "" else None, row))
                connector.execute_ex(insert_query, tup)
            else:
                do_once = True
    connector.close_connection()
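
The do_once flag above skips the CSV header; the same effect can be had by consuming the header with next() before the loop. A sketch under the same delimiter/quoting assumptions, with a hypothetical file name:

import csv

with open("records.csv", newline='', encoding='utf-8') as csvfile:  # hypothetical file
    reader = csv.reader(csvfile, delimiter=';', quotechar='"')
    next(reader, None)  # discard the header row
    for row in reader:
        del row[-2:]  # drop last-updated and harvest date, as above
        tup = tuple(x if x != "" else None for x in row)  # pass to execute_ex as above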
Example 4
def setup():
    """
    create database and table structure
    :return:
    """
    connector = MariaDb()

    storage_engine = get_config("MISC")["storage_engine"]

    # create database
    connector.create_database(DB_NAME)
    connector.createTable("dates", DATE_TABLE.format(storage_engine))
    connector.createTable("publication year", PUB_YEAR_TABLE.format(storage_engine))
    connector.createTable("popular_words", POPULAR.format(storage_engine))
    connector.createTable("title_length", TITLE_LENGTH.format(storage_engine))
    connector.createTable("popular names", N_POPULAR.format(storage_engine))
    connector.createTable("number authors", NUM_AUTHOR.format(storage_engine))
    connector.createTable("Authors", AUTHORS.format(storage_engine))
    connector.createTable("Normal Titles", NORMAL_TITLES.format(storage_engine))
    connector.createTable("Career",CAREER.format(storage_engine))
    # create index
    try:
        connector.execute_ex("CREATE FULLTEXT INDEX title_idx  ON normal_title (titles)", ())
    except Exception:
        print("Index already exists")

    connector.close_connection()
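
For context, the *_TABLE constants used above are presumably DDL templates with an ENGINE placeholder, in the same shape as the Authors template from Example 2. A hypothetical instance (the column layout is an assumption, only the template pattern comes from the source):

DATE_TABLE = ("CREATE TABLE IF NOT EXISTS `dates` ("
              " `Id` INT NOT NULL AUTO_INCREMENT,"
              " `date` DATE,"
              "  PRIMARY KEY (`Id`)"
              ") ENGINE={} CHARSET=utf8mb4")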
Example 5
def referencertask(limit=None):
    parameter_list = "[]"
    matches = 0
    active_queue = app.control.inspect().active()["celery@bremen"]
    for active_task in active_queue:
        if active_task["args"] == parameter_list:
            matches += 1
    if matches > 1:
        print("Referencer Task is already running, skipping execution")
        return None

    # create logger
    logger = logging.getLogger("Referencer")
    logger.setLevel(logging.INFO)
    # create the logging file handler
    log_file = os.path.join(log_dir, "referencer.log")
    fh = logging.FileHandler(log_file)
    formatter = logging.Formatter('%(levelname)s - %(message)s')
    fh.setFormatter(formatter)
    # add handler to logger object
    logger.addHandler(fh)
    logger.info("Start Referencer")

    if limit is None:
        limit = int(get_config("WEAVER")["referencer_limit"])

    ref = Referencer(limit, logger=logger)
    ref.run()
    logger.info("Finished Referencer")
Example 6
    def __init__(self, credentials=None, db=None):
        self.query = None
        if credentials is None:
            credentials = dict(get_config("MARIADBX"))
            self.storage_engine = get_config("MISC")["storage_engine"]
        else:
            self.storage_engine = "InnoDB"

        try:
            self.connector = pymysql.connect(**credentials)
        except pymysql.Error as err:
            raise Exception("MariaDB connection error: {}".format(err)) from err
        self.current_database = None
        if self.connector is not None:
            self.cursor = self.connector.cursor()

            if db is not None:
                self.current_database = db
                self.connector.select_db(db)
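
Hypothetical usage of the constructor above, assuming the credentials dict holds pymysql-compatible connection keys; host, user, and password values are placeholders:

db = MariaDb(credentials={"host": "localhost", "user": "root", "password": "secret"},
             db="test_storage")
db.cursor.execute("SELECT 1")
print(db.cursor.fetchone())  # (1,)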
Example 7
def setup(TABLE_NAME):
    """
    create database and table structure
    :return:
    """
    connector = MariaDb()
    storage_engine = get_config("MISC")["storage_engine"]
    # create database
    connector.create_database(DB_NAME)
    connector.createTable(TABLE_NAME, TITLES.format(TABLE_NAME, storage_engine))
    connector.close_connection()
Example 8
    def __init__(self, name, harvester_db=None):
        Iingester.__init__(self, name)
        # find global url / add global URL
        g_url, created = global_url.objects.get_or_create(
            domain='http://arxiv.org', url='http://arxiv.org/abs/')
        self.global_url = g_url
        if harvester_db is None:
            self.harvester_db = get_config("DATABASES")["harvester"]
        else:
            self.harvester_db = harvester_db
        self.query = "SELECT * FROM {}.arxiv_articles WHERE last_harvested = 0".format(
            self.harvester_db)
        self.table_nr = 1
Example 9
    def setUp(self):
        self.connector = MariaDb(db="test_storage")
        storage_engine = get_config("MISC")["storage_engine"]
        # create tables for both sources arxiv and dblp
        self.connector.createTable("dblparticle",
                                   DBLP_ARTICLE.format(storage_engine))
        self.connector.createTable("arxivarticle",
                                   ARXIV_ARTICLE.format(storage_engine))
        # insert data
        dblp_article = (
            "dblpkey",  # key
            "2011-11-11",  # mdate
            "Andreas Anders;Bertha Theresa Balte;",  # authors
            "The Ultimate Title",  # title
            "10-14",  # pages
            datetime.date(2005, 1, 1),  # pub year
            "2",  # volume
            "journal of stuff",  # journal
            "3",  # journal number
            "http://google.de",  # doi
            "http://unused.com",  # unused url
            None,  # cite
            None,  # crossref
            None,  # booktitle
            None,  # school
            None,  # address
            None,  # publisher
            None,  # isbn
            None,  # series
            "article"  # type
        )

        arxiv_article = (
            "arxivkey",  # identifier
            "2007-07-07",  # created
            "2008-08-08",  # updated
            "Andreas Anders;Bertha Theresa Balte;Carim Chass Jr.;",  # authors
            "The Ultimate Title!",  # title
            None,  # mscclass
            None,  # acmclass
            None,  # reportno
            None,  # journalref
            None,  # comments
            "this is a test",  # description
            "category",  # categories
            "http://google.com",  # doi
            "2009-09-09"  # mdate
        )

        self.connector.execute_ex(ADD_DBLP_ARTICLE, dblp_article)
        self.connector.execute_ex(ADD_ARXIV, arxiv_article)
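
ADD_DBLP_ARTICLE and ADD_ARXIV are presumably parameterized INSERT statements matching the tuples above, with one %s per field (20 for dblp, 14 for arxiv). A hypothetical reconstruction; the table names are taken from the harvester queries in Examples 8 and 12 and may differ from the real constants:

ADD_DBLP_ARTICLE = "INSERT INTO dblp_article VALUES ({})".format(", ".join(["%s"] * 20))
ADD_ARXIV = "INSERT INTO arxiv_articles VALUES ({})".format(", ".join(["%s"] * 14))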
Example 10
def pdfdownloader(limit=None):
    """
    task for downloading PDF files, sending them to GROBID, and creating single references
    :return:
    """
    # check if task is already running
    parameter_list = "[]"
    matches = 0
    active_queue = app.control.inspect().active()["celery@bremen"]
    for active_task in active_queue:
        if active_task["args"] == parameter_list:
            matches += 1
    if matches > 1:
        print("PDF Downloader Task is already running, skipping execution")
        return None

    # create logger
    logger = logging.getLogger("PDFDownloader")
    logger.setLevel(logging.INFO)
    # create the logging file handler
    log_file = os.path.join(log_dir, "pdf_downloader.log")
    fh = logging.FileHandler(log_file)
    formatter = logging.Formatter('%(levelname)s - %(message)s')
    fh.setFormatter(formatter)
    # add handler to logger object
    logger.addHandler(fh)
    # run actual task
    tmp = get_config("FOLDERS")["tmp"]
    grobid = get_config("WEAVER")["grobid"]
    if limit is None:
        limit = int(get_config("WEAVER")["pdf_limit"])

    logger.info("Init PDF Processing")
    obj = PdfDownloader(tmp, grobid, logger=logger, limit=limit)
    logger.info("Start Processing")
    result = obj.process_pdf()
    logger.info(result)
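
One caveat with the logging setup above (and in referencertask()): logging.getLogger returns a process-wide singleton, so every call attaches another FileHandler, and repeated runs in the same worker duplicate each log line. A common guard, as a sketch rather than the author's code:

import logging

logger = logging.getLogger("PDFDownloader")
if not logger.handlers:  # attach the file handler only on the first call
    fh = logging.FileHandler(log_file)  # log_file as built above
    fh.setFormatter(logging.Formatter('%(levelname)s - %(message)s'))
    logger.addHandler(fh)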
Example 11
    def __init__(self, name, harvester_db=None):
        Iingester.__init__(self, name)

        # find global url/ add global URL
        g_url, created = global_url.objects.get_or_create(
            domain='http://citeseerx.ist.psu.edu/',
            url='http://citeseerx.ist.psu.edu/viewdoc/summary?doi=')
        self.global_url = g_url
        if harvester_db is None:
            self.harvester_db = get_config("DATABASES")["harvester"]
        else:
            self.harvester_db = harvester_db

        self.query = "SELECT * FROM {}.oaipmh_articles WHERE last_harvested = 0".format(
            self.harvester_db)
Example 12
    def __init__(self, name, harvesterdb=None):
        Iingester.__init__(self, name)
        # find global url / add global URL
        g_url, created = global_url.objects.get_or_create(
            domain='http://dblp.uni-trier.de',
            url='http://dblp.uni-trier.de/rec/xml/',
        )
        self.global_url = g_url
        if harvesterdb is None:
            self.harvester_db = get_config("DATABASES")["harvester"]
        else:
            self.harvester_db = harvesterdb
        self.query = (
            "SELECT * FROM {}.dblp_article WHERE last_harvested = 0").format(
                self.harvester_db)
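
The three ingester constructors (Examples 8, 11, and 12) differ only in the global_url record and the harvester table they poll. The shared shape, sketched with hypothetical domain and table names:

class ExampleIngester(Iingester):
    def __init__(self, name, harvester_db=None):
        Iingester.__init__(self, name)
        # register (or find) the source's global URL
        self.global_url, _ = global_url.objects.get_or_create(
            domain='http://example.org', url='http://example.org/abs/')
        self.harvester_db = harvester_db or get_config("DATABASES")["harvester"]
        # fetch everything not yet harvested
        self.query = "SELECT * FROM {}.example_articles WHERE last_harvested = 0".format(
            self.harvester_db)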
Example 13
def get_table_data(table, null_dates=True):
    credentials = dict(get_config("MARIADBX"))
    # connect to database
    connector = MariaDb(credentials)
    connector.connector.database = TESTDB
    # fetch everything
    query = "SELECT * FROM test_storage.{}".format(table)
    connector.cursor.execute(query)
    print(query)
    result = set()
    for dataset in connector.cursor:
        print(dataset)
        tmp = ()
        for element in dataset:
            # overwrite timestamps with generic date for easier testing
            if null_dates and isinstance(element, datetime.datetime):
                tmp += (datetime.datetime(1990, 1, 1, 1, 1, 1), )
            else:
                tmp += (element, )
        result.add(tmp)
    connector.close_connection()
    return result
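
A sketch of how get_table_data() might be used in a test assertion; the table name, column layout, and expected values here are illustrative assumptions, only the 1990-01-01 timestamp placeholder comes from the function above:

expected = {
    ("dblpkey", "The Ultimate Title", datetime.datetime(1990, 1, 1, 1, 1, 1)),
}
assert get_table_data("dblp_article") == expected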
Example 14
    def setUp(self):
        self.connector = MariaDb(db="test_storage")
        storage_engine = get_config("MISC")["storage_engine"]
        self.connector.createTable("dblparticle", DBLP_ARTICLE.format(storage_engine))