def test_unicode1(self):
    title = normalize_title(
        "Enhancing Access Privacy of Range Retrievals over (𝔹+)-Trees.")  # dblp
    title2 = normalize_title(
        "Enhancing Access Privacy of Range Retrievals Over B+-trees")  # citeseer
    self.assertEqual(title, title2)
def test_unicode2(self):
    title = normalize_title(
        "Enhancing Access Privacy of Range Retrievals over (𝔹+)-Trees.")  # dblp
    # raw string so the LaTeX \rm is not interpreted as a carriage return
    title2 = normalize_title(
        r"Enhancing Access Privacy of Range Retrievals over $({\rm B}^+)$-Trees")  # dl.acm
    self.assertEqual(title, title2)
def test_unicode4(self):
    title = normalize_title(
        "Sometime = Always + Recursion = Always on the Equivalence of the Intermittent and Invariant Assertions Methods for Proving Inevitability Properties of Programs.")
    self.assertEqual(
        "sometime always recursion always on the equivalence of the intermittent and invariant assertions methods for proving inevitability properties of programs",
        title)
def dblp_mapping(query_tuple):
    return {
        "pub": query_tuple[2],
        "mdate": query_tuple[0],
        "normal": normalize_title(query_tuple[1]),
        "author": split_authors(query_tuple[3]),
    }
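# Hypothetical usage sketch for dblp_mapping (not part of the module): the
# tuple order is taken from the indexing above, i.e. (mdate, title, pub,
# authors); the concrete values and the author separator are assumptions.
#
# sample = (datetime.date(2016, 3, 1),
#           "Enhancing Access Privacy of Range Retrievals over B+-Trees",
#           datetime.date(2002, 1, 1),
#           "Jane Doe;John Roe")
# record = dblp_mapping(sample)
# record["normal"]  # normalized title; record["author"] is the parsed author list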
def create_clusters():
    gurl = global_url.objects.get(id=1)
    open_ref_list = {}
    with open(os.path.join(file_path, 'ref.log'), encoding="utf8") as f:
        for line in f:
            re.findall(r'alpha(.*?)bravo', line)
            if 'SOURCE' in line:
                regex = r'SOURCE (.*?):'
            else:
                regex = r'REF (.*?):'
            try:
                id_match = re.findall(regex, line)[0]
                title = line[line.index(":") + 1:]
            except (IndexError, ValueError):
                continue
            # normalize title and transform 8 byte hex number to int
            normal_title = normalize_title(title)
            normal_id = int(id_match, 16)
            # insert into cluster
            # cluster.objects.get_or_create(id=normal_id, defaults={"name": normal_title})
            # create local urls for matching titles
            if id_match in match_list:
                index = match_list.index(id_match)
                lurl, tmp = local_url.objects.get_or_create(
                    id=normal_id, global_url=gurl, url=id_match)
                # create open reference
                opref, tmp = OpenReferences.objects.get_or_create(
                    id=normal_id, ingester_key=lurl, source_table=666,
                    source_key=id_match)
                open_ref_list[id_match] = opref

    # creates single references directly from pdf:
    logger = logging.getLogger("PDFDownloader")
    logger.setLevel(logging.INFO)
    # create the logging file handler
    log_file = os.path.join(file_path, "pdf_downloader.log")
    fh = logging.FileHandler(log_file)
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    fh.setFormatter(formatter)
    # add handler to logger object
    logger.addHandler(fh)

    # run actual task
    tmp = get_config("FOLDERS")["tmp"]
    grobid = get_config("WEAVER")["grobid"]
    limit = 20
    if limit is None:
        limit = int(get_config("WEAVER")["pdf_limit"])
    obj = PdfDownloader(tmp, grobid, logger=logger, limit=limit)
    for element in match_list:
        pdf_path = os.path.join(
            "C:\\Users\\anhtu\\Google Drive\\Informatik 2016\\Related Work\\evaluation",
            "{}.pdf".format(element))
        obj.parse_references(pdf_path, open_ref_list[element])
def oai_mapping(query_tuple):
    dates = query_tuple[2].split(";")
    del dates[-1]
    try:
        publication_date = datetime.datetime.strptime(dates[-1], "%Y")
    except ValueError:
        try:
            publication_date = datetime.datetime.strptime(dates[-1], "%Y-%m-%d")
        except ValueError:
            print(dates)
            raise Exception()
    return {
        "pub": publication_date,
        "mdate": datetime.datetime.strptime(dates[0], "%Y-%m-%d"),
        "normal": normalize_title(query_tuple[0]),
        "author": split_authors(query_tuple[1]),
    }
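# Hypothetical usage sketch for oai_mapping (not part of the module): the
# parsing above implies query_tuple is (title, authors, dates), where dates is
# a ";"-separated string ending in ";", its first entry being the record's
# modification date (%Y-%m-%d) and its last the publication year or date.
# The concrete values below are assumptions for illustration.
#
# sample = ("Enhancing Access Privacy of Range Retrievals",
#           "Jane Doe;John Roe",
#           "2016-03-01;2015;")
# record = oai_mapping(sample)  # record["pub"].year == 2015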
def run(self, test_mode=False):
    today = date.today()
    compare_date = date.today() if test_mode is False else date.today() + datetime.timedelta(days=1)
    reference_list = []
    # fetch all single_references of an open_ref
    # fetch open_references to gather all ingester url_objects
    openreference_list = OpenReferences.objects.filter(
        last_updated__lt=compare_date).select_related('ingester_key').only(
        'ingester_key')[:self.limit]
    for open_ref in openreference_list:
        open_ref.last_updated = today
        open_ref.save()
        single_ref_list = SingleReference.objects.filter(
            source=open_ref).filter(Q(status='OP') | Q(status='INC')).only(
            'tries', 'title', 'status').all()
        for single_ref in single_ref_list:
            title_matches = search_title(single_ref.title, threshold=0.8)
            if len(title_matches) == 1:
                # single match: create reference for only match
                PubReference.objects.get_or_create(
                    reference=title_matches[0],
                    source=open_ref.ingester_key,
                    defaults={
                        'original_title': single_ref.title,
                        'original_key': single_ref.id
                    })
                # reference_list.append(PubReference(reference=title_matches[0], source=open_ref.ingester_key, original_title=single_ref.title))
                single_ref.status = 'FIN'
            elif len(title_matches) == 0:
                # no match: increment tries and set as incomplete
                single_ref.tries += 1
                single_ref.status = 'INC' if single_ref.tries < LIMBO_LIMIT else 'LIM'
            else:
                # multi match:
                normal_title = normalize_title(single_ref.title)
                single_ref.status = 'LIM'
                for title in title_matches:
                    if title.name == normal_title:
                        single_ref.status = 'FIN'
                        PubReference.objects.get_or_create(
                            reference=title,
                            source=open_ref.ingester_key,
                            defaults={
                                'original_title': single_ref.title,
                                'original_key': single_ref.id
                            })
                        # reference_list.append(PubReference(reference=title, source=open_ref.ingester_key, original_title=single_ref.title))
            single_ref.save()
    try:
        PubReference.objects.bulk_create(reference_list)
    except IntegrityError:
        # duplicate key on insert, insert everything manually
        for element in reference_list:
            print("Rollback")
            PubReference.objects.get_or_create(
                reference=element.reference,
                source=element.source,
                original_title=single_ref.title,
                original_key=single_ref.id)
def test_lower_strip(self):
    title = normalize_title(" ThIs is a TITleß ")
    self.assertEqual(title, "this is a titless")
def test_unicode3(self):
    title = normalize_title(
        "Efficient controller synthesis for a fragment of MTL0,∞")
    self.assertEqual(
        "efficient controller synthesis for a fragment of mtl0", title)
def test_whitespace(self):
    title = normalize_title("O`Rea\tlly?\n\n Wha\\t's t\nhe, iss\rue? .")
    self.assertEqual(title, "oreally whats the issue")
def test_punctuation(self):
    title = normalize_title("O`Really? What's the, iss-ue? .")
    self.assertEqual(title, "oreally whats the issue")
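# A minimal sketch of what normalize_title appears to do, inferred from the
# tests above (an assumption, not the project's actual implementation):
# NFKC-normalize unicode, casefold, drop tabs/newlines/carriage returns,
# strip punctuation and symbols, and collapse runs of spaces. This reproduces
# the expected values of the tests above except test_unicode2, whose LaTeX
# markup ("{\rm ...}") evidently needs extra handling in the real function.
import re
import unicodedata

def _normalize_title_sketch(title):
    text = unicodedata.normalize("NFKC", title).casefold()
    text = re.sub(r"[\t\n\r]", "", text)  # drop control whitespace entirely
    text = re.sub(r"[^\w ]", "", text)    # drop punctuation and symbols
    return " ".join(text.split())         # collapse space runs, strip ends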
def evaluate_references():
    total_count = 0
    overall_count = 0
    truepositive = 0
    falsepositive = 0
    missing_count = 0
    limbo = 0
    dec_id_list = [int(item, 16) for item in match_list]
    fp_list = []
    ff_list = []
    db_list = []
    raw = pandas.read_csv(os.path.join(file_path, "count_ref.csv"), index_col='ID')
    raw = raw.drop_duplicates()

    # look up dict for titles
    titles = {}
    with open(os.path.join(file_path, 'ref.log'), encoding="utf8") as f:
        for line in f:
            re.findall(r'alpha(.*?)bravo', line)
            if 'SOURCE' in line:
                regex = r'SOURCE (.*?):'
            else:
                regex = r'REF (.*?):'
            try:
                id_match = re.findall(regex, line)[0]
                normal_id = int(id_match, 16)
                title = line[line.index(":") + 1:]
                titles[normal_id] = title
            except (IndexError, ValueError):
                continue

    read_connector = pymysql.connect(user="******", password="******",
                                     host="localhost", database="storage",
                                     charset="utf8mb4")

    # copy references from db to list
    correct = open(os.path.join(file_path, 'correct40.txt'), 'w', encoding="utf8")
    false = open(os.path.join(file_path, 'false40.txt'), 'w', encoding="utf8")
    missing = open(os.path.join(file_path, 'missing40.txt'), 'w', encoding="utf8")
    with read_connector.cursor() as cursor:
        cursor.execute("SELECT * from storage.ingester_pubreference", ())
        for element in cursor:
            db_list.append({
                "id": element[0],
                "reference": element[1],
                "source": element[2],
                "title": element[3],
                "key": element[4],
                "match": False
            })
        cursor.execute(
            "SELECT COUNT(*) FROM storage.weaver_singlereference WHERE status='LIM'")
        for element in cursor:
            limbo = element[0]
    read_connector.close()

    for key, value in raw.iterrows():
        overall_count += 1
        # find true positives
        if value['source_paper'] in dec_id_list:
            # find matching db list entry
            total_count += 1
            count = 0
            for element1 in db_list:
                if element1['source'] == value['source_paper'] and element1['reference'] == value['ref_paper']:
                    source_title = titles[value['source_paper']].strip()
                    ref_title = titles[value['ref_paper']].strip()
                    correct.write("({},{})-->({},{}) [{},{}]\n".format(
                        value['source_paper'], source_title,
                        value['ref_paper'], ref_title,
                        element1['title'], element1['key']))
                    element1['match'] = True
                    truepositive += 1
                    count = 5
                    break
            if count == 0:
                missing_count += 1
                ff_list.append({
                    "source": value['source_paper'],
                    "reference": value['ref_paper'],
                })

    for element2 in db_list:
        if element2['match'] is False:
            try:
                source_title = titles[element2['source']].strip()
                ref_title = titles[element2['reference']].strip()
            except KeyError as e:
                print("Missing key", e)
                falsepositive += 1
                continue
            # special case
            if normalize_title(element2['title']) != normalize_title(ref_title):
                false.write("({},{})-->({},{}) [{},{}]\n".format(
                    element2['source'], source_title,
                    element2['reference'], ref_title,
                    element2['title'], element2['key'],
                ))
                falsepositive += 1
            else:
                correct.write("({},{})-->({},{}) [{},{}]\n".format(
                    element2['source'], source_title,
                    element2['reference'], ref_title,
                    element2['title'], element2['key']))
                truepositive += 1

    for element in ff_list:
        source = element['source']
        source_title = titles[element['source']].strip()
        try:
            ref_title = titles[element['reference']].strip()
            missing.write("({},{})-->({},{})\n".format(
                source, source_title,
                element['reference'], ref_title,
            ))
        except Exception as e:
            print(e)
            continue

    correct.close()
    false.close()
    missing.close()
    print("Overall : {}".format(overall_count))
    print("Total : {}".format(total_count))
    print("True Positive : {}".format(truepositive))
    print("False Positive : {}".format(falsepositive))
    print("Missing : {}".format(missing_count))
    print("Limbo : {}".format(limbo))