def create_clusters():
    gurl = global_url.objects.get(id=1)
    open_ref_list = {}
    with open(os.path.join(file_path, 'ref.log'), encoding="utf8") as f:
        for line in f:
            if 'SOURCE' in line:
                regex = r'SOURCE (.*?):'
            else:
                regex = r'REF (.*?):'
            try:
                id_match = re.findall(regex, line)[0]
                title = line[line.index(":") + 1:]
            except (IndexError, ValueError):
                # line matches neither pattern or has no title part
                continue
            # normalize title and transform the 8-byte hex number to an int
            normal_title = normalize_title(title)
            normal_id = int(id_match, 16)
            # insert into cluster
            # cluster.objects.get_or_create(id=normal_id, defaults={"name": normal_title})
            # create local urls for matching titles
            if id_match in match_list:
                lurl, _ = local_url.objects.get_or_create(
                    id=normal_id, global_url=gurl, url=id_match)
                # create open reference
                opref, _ = OpenReferences.objects.get_or_create(
                    id=normal_id, ingester_key=lurl,
                    source_table=666, source_key=id_match)
                open_ref_list[id_match] = opref
    # create single references directly from the pdfs
    logger = logging.getLogger("PDFDownloader")
    logger.setLevel(logging.INFO)
    # create the logging file handler
    log_file = os.path.join(file_path, "pdf_downloader.log")
    fh = logging.FileHandler(log_file)
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    fh.setFormatter(formatter)
    # add handler to logger object
    logger.addHandler(fh)
    # run actual task
    tmp = get_config("FOLDERS")["tmp"]
    grobid = get_config("WEAVER")["grobid"]
    limit = 20  # hardcoded for this evaluation run, overriding the configured pdf_limit
    obj = PdfDownloader(tmp, grobid, logger=logger, limit=limit)
    for element in match_list:
        pdf_path = os.path.join(
            "C:\\Users\\anhtu\\Google Drive\\Informatik 2016\\Related Work\\evaluation",
            "{}.pdf".format(element))
        obj.parse_references(pdf_path, open_ref_list[element])

def setup():
    AUTHORS_TABLE = ("CREATE TABLE `Authors` ("
                     " `Id` INT NOT NULL,"
                     " `main_name` TEXT,"
                     " `normal_name` TEXT,"
                     " `metaphone_name` TEXT,"
                     " PRIMARY KEY (`Id`)"
                     ") ENGINE={} CHARSET=utf8mb4")
    connector = MariaDb()
    storage_engine = get_config("MISC")["storage_engine"]
    connector.create_database(DB_NAME)
    connector.createTable("Authors", AUTHORS_TABLE.format(storage_engine))
    # full-text indexes for the three name variants
    try:
        connector.execute_ex(
            "CREATE FULLTEXT INDEX main_name_idx ON Authors (main_name)", ())
        connector.execute_ex(
            "CREATE FULLTEXT INDEX normal_name_idx ON Authors (normal_name)", ())
        connector.execute_ex(
            "CREATE FULLTEXT INDEX metaphone_name_idx ON Authors (metaphone_name)", ())
    except Exception:
        print("Index already exists")
    connector.close_connection()

def setup_tables(filename, table_query, insert_query):
    # load test config
    credentials = dict(get_config("MARIADBX"))
    # set up database
    connector = MariaDb(credentials)
    connector.create_database(TESTDB)
    connector.connector.database = TESTDB
    connector.createTable("test dblp table", table_query)
    # setup test ingester database
    # setup_database(TESTDB)
    # import records from csv, skipping the header row
    with open(filename, newline='', encoding='utf-8') as csvfile:
        reader = csv.reader(csvfile, delimiter=';', quotechar='"')
        next(reader, None)  # skip header line
        for row in reader:
            # remove last-updated and harvest-date columns
            del row[-2:]
            # map empty strings to NULL
            tup = tuple(x if x != "" else None for x in row)
            connector.execute_ex(insert_query, tup)
    connector.close_connection()

def setup(): """ create database and table structure :return: """ connector = MariaDb() storage_engine = get_config("MISC")["storage_engine"] # create database connector.create_database(DB_NAME) connector.createTable("dates", DATE_TABLE.format(storage_engine)) connector.createTable("publication year", PUB_YEAR_TABLE.format(storage_engine)) connector.createTable("popular_words", POPULAR.format(storage_engine)) connector.createTable("title_length", TITLE_LENGTH.format(storage_engine)) connector.createTable("popular names", N_POPULAR.format(storage_engine)) connector.createTable("number authors", NUM_AUTHOR.format(storage_engine)) connector.createTable("Authors", AUTHORS.format(storage_engine)) connector.createTable("Normal Titles", NORMAL_TITLES.format(storage_engine)) connector.createTable("Career",CAREER.format(storage_engine)) # create index try: connector.execute_ex("CREATE FULLTEXT INDEX title_idx ON normal_title (titles)", ()) except: print("Index already exists") connector.close_connection()
def referencertask(limit=None):
    # guard against concurrent runs: this task itself appears in the active
    # queue, so more than one match means another instance is already running
    parameter_list = "[]"
    matches = 0
    active_queue = app.control.inspect().active()["celery@bremen"]
    for active_task in active_queue:
        if active_task["args"] == parameter_list:
            matches += 1
    if matches > 1:
        print("Referencer Task is already running, skipping execution")
        return None
    # create logger
    logger = logging.getLogger("Referencer")
    logger.setLevel(logging.INFO)
    # create the logging file handler
    log_file = os.path.join(log_dir, "referencer.log")
    fh = logging.FileHandler(log_file)
    formatter = logging.Formatter('%(levelname)s - %(message)s')
    fh.setFormatter(formatter)
    # add handler to logger object
    logger.addHandler(fh)
    logger.info("Start Referencer")
    if limit is None:
        limit = int(get_config("WEAVER")["referencer_limit"])
    ref = Referencer(limit, logger=logger)
    ref.run()
    logger.info("Finished Referencer")

def __init__(self, credentials=None, db=None):
    self.query = None
    if credentials is None:
        credentials = dict(get_config("MARIADBX"))
        self.storage_engine = get_config("MISC")["storage_engine"]
    else:
        self.storage_engine = "InnoDB"
    try:
        self.connector = pymysql.connect(**credentials)
    except pymysql.Error as err:
        raise Exception("MariaDB connection error: {}".format(err))
    self.current_database = None
    if self.connector is not None:
        self.cursor = self.connector.cursor()
        if db is not None:
            self.current_database = db
            self.connector.select_db(db)

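# A minimal usage sketch for the wrapper above. The credential keys are an
# assumption (whatever pymysql.connect accepts), not taken from the project
# configuration:
#
#     creds = {"host": "localhost", "user": "tester", "password": "secret"}
#     connection = MariaDb(credentials=creds, db="test_storage")
#     connection.cursor.execute("SELECT 1")
#     print(connection.cursor.fetchone())
#     connection.close_connection()
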
def setup(TABLE_NAME):
    """
    create database and table structure
    :return:
    """
    connector = MariaDb()
    storage_engine = get_config("MISC")["storage_engine"]
    # create database
    connector.create_database(DB_NAME)
    connector.createTable(TABLE_NAME, TITLES.format(TABLE_NAME, storage_engine))
    connector.close_connection()

def __init__(self, name, harvester_db=None):
    Iingester.__init__(self, name)
    # find or add the global URL
    g_url, created = global_url.objects.get_or_create(
        domain='http://arxiv.org',
        url='http://arxiv.org/abs/')
    self.global_url = g_url
    if harvester_db is None:
        self.harvester_db = get_config("DATABASES")["harvester"]
    else:
        self.harvester_db = harvester_db
    self.query = "SELECT * FROM {}.arxiv_articles WHERE last_harvested = 0".format(
        self.harvester_db)
    self.table_nr = 1

def setUp(self):
    self.connector = MariaDb(db="test_storage")
    storage_engine = get_config("MISC")["storage_engine"]
    # create tables for both sources, arxiv and dblp
    self.connector.createTable("dblparticle", DBLP_ARTICLE.format(storage_engine))
    self.connector.createTable("arxivarticle", ARXIV_ARTICLE.format(storage_engine))
    # insert data
    dblp_article = (
        "dblpkey",                               # key
        "2011-11-11",                            # mdate
        "Andreas Anders;Bertha Theresa Balte;",  # authors
        "The Ultimate Title",                    # title
        "10-14",                                 # pages
        datetime.date(2005, 1, 1),               # pub year
        "2",                                     # volume
        "journal of stuff",                      # journal
        "3",                                     # journal number
        "http://google.de",                      # doi
        "http://unused.com",                     # unused url
        None,                                    # cite
        None,                                    # crossref
        None,                                    # booktitle
        None,                                    # school
        None,                                    # address
        None,                                    # publisher
        None,                                    # isbn
        None,                                    # series
        "article"                                # type
    )
    arxiv_article = (
        "arxivkey",                                             # identifier
        "2007-07-07",                                           # created
        "2008-08-08",                                           # updated
        "Andreas Anders;Bertha Theresa Balte;Carim Chass Jr.;", # authors
        "The Ultimate Title!",                                  # title
        None,                                                   # mscclass
        None,                                                   # acmclass
        None,                                                   # reportno
        None,                                                   # journalref
        None,                                                   # comments
        "this is a test",                                       # description
        "category",                                             # categories
        "http://google.com",                                    # doi
        "2009-09-09"                                            # mdate
    )
    self.connector.execute_ex(ADD_DBLP_ARTICLE, dblp_article)
    self.connector.execute_ex(ADD_ARXIV, arxiv_article)

def pdfdownloader(limit=None):
    """
    task for downloading pdf files, sending them to grobid and creating
    single references
    :return:
    """
    # check if the task is already running; this instance itself appears in
    # the active queue, so more than one match means a concurrent run
    parameter_list = "[]"
    matches = 0
    active_queue = app.control.inspect().active()["celery@bremen"]
    for active_task in active_queue:
        if active_task["args"] == parameter_list:
            matches += 1
    if matches > 1:
        print("PDF Downloader Task is already running, skipping execution")
        return None
    # create logger
    logger = logging.getLogger("PDFDownloader")
    logger.setLevel(logging.INFO)
    # create the logging file handler
    log_file = os.path.join(log_dir, "pdf_downloader.log")
    fh = logging.FileHandler(log_file)
    formatter = logging.Formatter('%(levelname)s - %(message)s')
    fh.setFormatter(formatter)
    # add handler to logger object
    logger.addHandler(fh)
    # run actual task
    tmp = get_config("FOLDERS")["tmp"]
    grobid = get_config("WEAVER")["grobid"]
    if limit is None:
        limit = int(get_config("WEAVER")["pdf_limit"])
    logger.info("Init PDF Processing")
    obj = PdfDownloader(tmp, grobid, logger=logger, limit=limit)
    logger.info("Start Processing")
    result = obj.process_pdf()
    logger.info(result)

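# Hypothetical invocation sketch, assuming pdfdownloader is registered as a
# Celery task on `app` (the decorator is not shown in this excerpt):
#
#     pdfdownloader.delay()         # enqueue with the configured pdf_limit
#     pdfdownloader.delay(limit=5)  # enqueue with an explicit limit
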
def __init__(self, name, harvester_db=None):
    Iingester.__init__(self, name)
    # find or add the global URL
    g_url, created = global_url.objects.get_or_create(
        domain='http://citeseerx.ist.psu.edu/',
        url='http://citeseerx.ist.psu.edu/viewdoc/summary?doi=')
    self.global_url = g_url
    if harvester_db is None:
        self.harvester_db = get_config("DATABASES")["harvester"]
    else:
        self.harvester_db = harvester_db
    self.query = "SELECT * FROM {}.oaipmh_articles WHERE last_harvested = 0".format(
        self.harvester_db)

def __init__(self, name, harvesterdb=None):
    Iingester.__init__(self, name)
    # find or add the global URL
    g_url, created = global_url.objects.get_or_create(
        domain='http://dblp.uni-trier.de',
        url='http://dblp.uni-trier.de/rec/xml/',
    )
    self.global_url = g_url
    if harvesterdb is None:
        self.harvester_db = get_config("DATABASES")["harvester"]
    else:
        self.harvester_db = harvesterdb
    self.query = (
        "SELECT * FROM {}.dblp_article WHERE last_harvested = 0").format(
        self.harvester_db)

def get_table_data(table, null_dates=True):
    credentials = dict(get_config("MARIADBX"))
    # connect to database
    connector = MariaDb(credentials)
    connector.connector.database = TESTDB
    # fetch everything
    query = "SELECT * FROM test_storage.{}".format(table)
    connector.cursor.execute(query)
    result = set()
    for dataset in connector.cursor:
        tmp = ()
        for element in dataset:
            # overwrite timestamps with a generic date for easier testing
            if null_dates and isinstance(element, datetime.datetime):
                tmp += (datetime.datetime(1990, 1, 1, 1, 1, 1), )
            else:
                tmp += (element, )
        result.add(tmp)
    connector.close_connection()
    return result

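# Sketch of how get_table_data supports set-based test assertions; the table
# name and expected row below are made up for illustration:
#
#     expected = {("dblpkey", "The Ultimate Title",
#                  datetime.datetime(1990, 1, 1, 1, 1, 1))}
#     self.assertEqual(expected, get_table_data("publications"))
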
def setUp(self):
    self.connector = MariaDb(db="test_storage")
    storage_engine = get_config("MISC")["storage_engine"]
    self.connector.createTable("dblparticle", DBLP_ARTICLE.format(storage_engine))