def test_unicode1(self):
    """The dblp and citeseer renderings of one title normalize identically."""
    dblp_form = normalize_title(
        "Enhancing Access Privacy of Range Retrievals over (𝔹+)-Trees."
    )
    citeseer_form = normalize_title(
        "Enhancing Access Privacy of Range Retrievals Over B+-trees"
    )
    self.assertEqual(dblp_form, citeseer_form)
 def test_unicode2(self):
     """The dblp and dl.acm renderings of one title normalize identically."""
     dblp_form = normalize_title(
         "Enhancing Access Privacy of Range Retrievals over (𝔹+)-Trees."
     )
     acm_form = normalize_title(
         "Enhancing Access Privacy of Range Retrievals over $({\rm B}^+)$-Trees"
     )
     self.assertEqual(dblp_form, acm_form)
 def test_unicode4(self):
     """Punctuation-heavy titles collapse to lower-cased words only."""
     expected = ("sometime always recursion always on the equivalence of the "
                 "intermittent and invariant assertions methods for proving "
                 "inevitability properties of programs")
     normalized = normalize_title(
         "Sometime = Always + Recursion = Always on the Equivalence of the Intermittent and Invariant Assertions Methods for Proving Inevitability Properties of Programs."
     )
     self.assertEqual(expected, normalized)
Example #4
0
def dblp_mapping(query_tuple):
    """Map a DBLP query result row onto the canonical record dict.

    The tuple layout is (mdate, title, pub, authors); the title is
    normalized and the author string is split into a list.
    """
    mdate, raw_title, pub, raw_authors = query_tuple[:4]
    return {
        "pub": pub,
        "mdate": mdate,
        "normal": normalize_title(raw_title),
        "author": split_authors(raw_authors),
    }
Example #5
0
def create_clusters():
    """Build OpenReferences for evaluation papers and parse their PDFs.

    Parses ``ref.log`` for ``SOURCE``/``REF`` records (8-byte hex id
    followed by ``:`` and the title), creates a ``local_url`` and an
    ``OpenReferences`` row for every id listed in ``match_list``, then
    runs the GROBID-backed :class:`PdfDownloader` over the matching
    evaluation PDFs.
    """
    gurl = global_url.objects.get(id=1)
    open_ref_list = {}
    with open(os.path.join(file_path, 'ref.log'), encoding="utf8") as f:
        for line in f:
            # SOURCE lines describe a citing paper, REF lines a cited one.
            if 'SOURCE' in line:
                regex = r'SOURCE (.*?):'
            else:
                regex = r'REF (.*?):'
            try:
                id_match = re.findall(regex, line)[0]
                title = line[line.index(":") + 1:]
            except (IndexError, ValueError):
                # No id / no ":" -> this line is not a reference record.
                continue

            # Transform the 8-byte hex id into an int primary key.
            normal_id = int(id_match, 16)
            # NOTE(review): cluster creation from normalize_title(title)
            # was disabled here; the title is currently unused.

            # Create local urls and open references for matching papers.
            if id_match in match_list:
                lurl, _ = local_url.objects.get_or_create(
                    id=normal_id, global_url=gurl, url=id_match)
                opref, _ = OpenReferences.objects.get_or_create(
                    id=normal_id, ingester_key=lurl,
                    source_table=666, source_key=id_match)
                open_ref_list[id_match] = opref

    # Set up a dedicated file logger for the downloader.
    logger = logging.getLogger("PDFDownloader")
    logger.setLevel(logging.INFO)
    log_file = os.path.join(file_path, "pdf_downloader.log")
    fh = logging.FileHandler(log_file)
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    fh.setFormatter(formatter)
    logger.addHandler(fh)

    # Run the actual parsing task.
    tmp = get_config("FOLDERS")["tmp"]
    grobid = get_config("WEAVER")["grobid"]
    # NOTE(review): the hard-coded 20 makes the pdf_limit config below
    # dead code -- presumably a debug override; confirm before removing.
    limit = 20
    if limit is None:
        limit = int(get_config("WEAVER")["pdf_limit"])
    obj = PdfDownloader(tmp, grobid, logger=logger, limit=limit)
    for element in match_list:
        pdf_path = os.path.join(
            "C:\\Users\\anhtu\\Google Drive\\Informatik 2016\\Related Work\\evaluation",
            "{}.pdf".format(element))
        obj.parse_references(pdf_path, open_ref_list[element])
Example #6
0
def oai_mapping(query_tuple):
    """Map an OAI query result row onto the canonical record dict.

    The tuple layout is (title, authors, dates); ``dates`` is a
    ``;``-terminated list where the first entry is the modification date
    and the last entry the publication date (either ``YYYY`` or
    ``YYYY-MM-DD``).

    :raises ValueError: if the publication date matches neither format.
    """
    dates = query_tuple[2].split(";")
    del dates[-1]  # trailing ";" yields an empty final element

    # Publication date appears either as a bare year or a full ISO date.
    publication_date = None
    for fmt in ("%Y", "%Y-%m-%d"):
        try:
            publication_date = datetime.datetime.strptime(dates[-1], fmt)
            break
        except ValueError:
            continue
    if publication_date is None:
        # Replaces the old print + bare Exception() with a descriptive,
        # still Exception-compatible error.
        raise ValueError("unparseable publication date: {!r}".format(dates))

    return {
        "pub": publication_date,
        "mdate": datetime.datetime.strptime(dates[0], "%Y-%m-%d"),
        "normal": normalize_title(query_tuple[0]),
        "author": split_authors(query_tuple[1]),
    }
Example #7
0
    def run(self, test_mode=False):
        """Resolve open single references against known publication titles.

        For every ``OpenReferences`` row not updated since ``compare_date``,
        each open/incomplete ``SingleReference`` is matched by fuzzy title
        search: a unique match creates a ``PubReference``, ambiguous
        matches are accepted only on exact normalized-title equality, and
        misses increment the retry counter until ``LIMBO_LIMIT``.

        :param test_mode: when True, compare against tomorrow so rows
            already touched today are processed again (used by tests).
        """
        today = date.today()
        compare_date = date.today(
        ) if test_mode is False else date.today() + datetime.timedelta(days=1)
        # NOTE(review): reference_list is never populated (the appends
        # were commented out), so bulk_create below is currently a no-op.
        reference_list = []
        # Fetch open references together with their ingester url objects.
        openreference_list = OpenReferences.objects.filter(
            last_updated__lt=compare_date).select_related('ingester_key').only(
                'ingester_key')[:self.limit]
        for open_ref in openreference_list:
            open_ref.last_updated = today
            open_ref.save()
            single_ref_list = SingleReference.objects.filter(
                source=open_ref).filter(Q(status='OP') | Q(status='INC')).only(
                    'tries', 'title', 'status').all()

            for single_ref in single_ref_list:
                title_matches = search_title(single_ref.title, threshold=0.8)
                if len(title_matches) == 1:
                    # Single match: link it directly.
                    PubReference.objects.get_or_create(
                        reference=title_matches[0],
                        source=open_ref.ingester_key,
                        defaults={
                            'original_title': single_ref.title,
                            'original_key': single_ref.id
                        })
                    single_ref.status = 'FIN'
                elif len(title_matches) == 0:
                    # No match: count the attempt; park in limbo after
                    # LIMBO_LIMIT failed tries.
                    single_ref.tries += 1
                    single_ref.status = 'INC' if single_ref.tries < LIMBO_LIMIT else 'LIM'
                else:
                    # Multiple matches: accept only an exact
                    # normalized-title hit, otherwise park in limbo.
                    normal_title = normalize_title(single_ref.title)
                    single_ref.status = 'LIM'
                    for title in title_matches:
                        if title.name == normal_title:
                            single_ref.status = 'FIN'
                            PubReference.objects.get_or_create(
                                reference=title,
                                source=open_ref.ingester_key,
                                defaults={
                                    'original_title': single_ref.title,
                                    'original_key': single_ref.id
                                })
                single_ref.save()
        try:
            PubReference.objects.bulk_create(reference_list)
        except IntegrityError:
            # Duplicate key on bulk insert: retry row by row. Use each
            # element's own fields -- the old code reused the stale loop
            # variable `single_ref`, mislabelling every retried row.
            for element in reference_list:
                print("Rollback")
                PubReference.objects.get_or_create(
                    reference=element.reference,
                    source=element.source,
                    original_title=element.original_title,
                    original_key=element.original_key)
 def test_lower_strip(self):
     """Normalization lower-cases, strips, and casefolds ß to ss."""
     normalized = normalize_title(" ThIs is a TITleß  ")
     self.assertEqual(normalized, "this is a titless")
 def test_unicode3(self):
     """Unicode subscripts such as MTL0,∞ are stripped down to mtl0."""
     normalized = normalize_title(
         "Efficient controller synthesis for a fragment of MTL0,∞")
     self.assertEqual(
         "efficient controller synthesis for a fragment of mtl0", normalized)
 def test_whitespace(self):
     """Tabs, newlines, carriage returns, and escapes collapse to single spaces."""
     normalized = normalize_title("O`Rea\tlly?\n\n Wha\\t's t\nhe, iss\rue?  .")
     self.assertEqual(normalized, "oreally whats the issue")
 def test_punctuation(self):
     """Backticks, apostrophes, hyphens, commas, and dots are removed."""
     normalized = normalize_title("O`Really? What's the, iss-ue?  .")
     self.assertEqual(normalized, "oreally whats the issue")
Example #12
0
def evaluate_references():
    """Evaluate the DB's extracted references against the ground truth.

    Reads manually counted reference pairs from ``count_ref.csv``, raw
    titles from ``ref.log``, and the stored references from MySQL, then
    writes correct / false / missing pairs to text files and prints
    summary counts (true/false positives, missing, limbo).
    """
    total_count = 0
    overall_count = 0
    truepositive = 0
    falsepositive = 0
    missing_count = 0
    limbo = 0
    dec_id_list = [int(item, 16) for item in match_list]
    ff_list = []  # ground-truth pairs that have no DB counterpart
    db_list = []  # reference rows found in the DB

    raw = pandas.read_csv(os.path.join(file_path, "count_ref.csv"),
                          index_col='ID')
    raw = raw.drop_duplicates()

    # Look-up dict: decimal paper id -> raw title, parsed from ref.log.
    titles = {}
    with open(os.path.join(file_path, 'ref.log'), encoding="utf8") as f:
        for line in f:
            if 'SOURCE' in line:
                regex = r'SOURCE (.*?):'
            else:
                regex = r'REF (.*?):'
            try:
                id_match = re.findall(regex, line)[0]
                normal_id = int(id_match, 16)
                title = line[line.index(":") + 1:]
                titles[normal_id] = title
            except (IndexError, ValueError):
                # No id / no ":" -> this line is not a reference record.
                continue

    read_connector = pymysql.connect(user="******",
                                     password="******",
                                     host="localhost",
                                     database="storage",
                                     charset="utf8mb4")

    # Result files for correct, false, and missing reference pairs.
    correct = open(os.path.join(file_path, 'correct40.txt'),
                   'w',
                   encoding="utf8")
    false = open(os.path.join(file_path, 'false40.txt'), 'w', encoding="utf8")
    missing = open(os.path.join(file_path, 'missing40.txt'),
                   'w',
                   encoding="utf8")

    # Copy references from the DB to db_list and read the limbo count.
    with read_connector.cursor() as cursor:
        cursor.execute("SELECT * from storage.ingester_pubreference", ())
        for element in cursor:
            db_list.append({
                "id": element[0],
                "reference": element[1],
                "source": element[2],
                "title": element[3],
                "key": element[4],
                "match": False
            })

        cursor.execute(
            "SELECT COUNT(*) FROM storage.weaver_singlereference WHERE  status='LIM'"
        )
        for element in cursor:
            limbo = element[0]
    read_connector.close()

    # Pass 1: find true positives among the ground-truth pairs.
    for key, value in raw.iterrows():
        overall_count += 1
        if value['source_paper'] in dec_id_list:
            total_count += 1
            found = False
            # Find the matching db_list entry, if any.
            for element1 in db_list:
                if element1['source'] == value['source_paper'] and element1[
                        'reference'] == value['ref_paper']:
                    source_title = titles[value['source_paper']].strip()
                    ref_title = titles[value['ref_paper']].strip()

                    correct.write("({},{})-->({},{})   [{},{}]\n".format(
                        value['source_paper'], source_title,
                        value['ref_paper'], ref_title, element1['title'],
                        element1['key']))
                    element1['match'] = True
                    truepositive += 1
                    found = True
                    break

            if not found:
                missing_count += 1
                ff_list.append({
                    "source": value['source_paper'],
                    "reference": value['ref_paper'],
                })

    # Pass 2: classify DB rows that had no ground-truth match.
    for element2 in db_list:
        if element2['match'] is False:
            try:
                source_title = titles[element2['source']].strip()
                ref_title = titles[element2['reference']].strip()
            except KeyError as e:
                # Paper id missing from ref.log -- special case, counted
                # as a false positive.
                print("Missing key", e)
                falsepositive += 1
                continue
            if normalize_title(
                    element2['title']) != normalize_title(ref_title):
                false.write("({},{})-->({},{})   [{},{}]\n".format(
                    element2['source'],
                    source_title,
                    element2['reference'],
                    ref_title,
                    element2['title'],
                    element2['key'],
                ))
                falsepositive += 1
            else:
                # Titles agree: true positive missing from the CSV. Write
                # this row's own data (the old code reused the stale loop
                # variables `value` and `element1` from pass 1).
                correct.write("({},{})-->({},{})   [{},{}]\n".format(
                    element2['source'], source_title, element2['reference'],
                    ref_title, element2['title'], element2['key']))
                truepositive += 1

    # Pass 3: log the pairs the DB is missing entirely.
    for element in ff_list:
        source = element['source']  # was `element['source'],` -> 1-tuple
        source_title = titles[element['source']].strip()
        try:
            ref_title = titles[element['reference']].strip()
            missing.write("({},{})-->({},{})\n".format(
                source,
                source_title,
                element['reference'],
                ref_title,
            ))
        except KeyError as e:
            print(e)
            continue

    correct.close()
    false.close()
    missing.close()

    print("Overall           : {}".format(overall_count))
    print("Total             : {}".format(total_count))
    print("True Positive     : {}".format(truepositive))
    print("False Positive    : {}".format(falsepositive))
    print("Missing           : {}".format(missing_count))
    print("Limbo             : {}".format(limbo))