예제 #1
0
def compare_va_to_ra(va_id, ra_id):
    '''
    Template comparison of the data of a virtual author with the data of
    a real author.

    Fetches the real author's "module_tag" entries and the virtual
    author's data set and turns the number of real author entries (n)
    into a certainty of 1 - e^(-0.8 * n^0.7).

    @param va_id: Virtual author ID
    @type va_id: int
    @param ra_id: Real author ID
    @type ra_id: int

    @return: the probability of the virtual author belonging to the real
        author; 0 when both data sets are empty
    @rtype: float
    '''
    bconfig.LOGGER.info("|-> Start of data comparison (va %s : ra %s)" %
                        (va_id, ra_id))

    ra_data = get_realauthor_data(ra_id, "module_tag")
    va_data_set = get_information_from_dataset(va_id)

    if (len(ra_data) == 0) and (len(va_data_set) == 0):
        bconfig.LOGGER.info("|-> End of data comparison (Sets empty)")
        return 0

    # Placeholder measure: the number of entries known for the real author.
    parity = len(ra_data)

    # Your probability assessment function here:
    # parity is already a count, so it is used as-is; calling len() on
    # an int raises a TypeError.
    certainty = 1 - exp(-.8 * pow(parity, .7))

    bconfig.LOGGER.info("|--> Found %s matching information out of %s "
                        "on the paper. Result: %s%% similarity" %
                        (parity, len(va_data_set), certainty))

    return certainty
def compare_va_to_ra(va_id, ra_id):
    '''
    Checks the first "inspireid" entry of a real author against the data
    set of a virtual author.

    @param va_id: Virtual author ID
    @type va_id: int
    @param ra_id: Real author ID
    @type ra_id: int

    @return: 0.0 if either side carries no data, 10000.0 if the real
        author's first value is contained in the virtual author's data
        set and 1.0 if it is not
    @rtype: float
    '''
    bconfig.LOGGER.info("|-> Start of IID comparison (va %s : ra %s)"
                        % (va_id, ra_id))

    ra_data = get_realauthor_data(ra_id, "inspireid")
    va_data_set = get_information_from_dataset(va_id)

    if not (len(ra_data) or len(va_data_set)):
        bconfig.LOGGER.info("|-> End of IID comparison (Sets empty)")
        return 0.0

    # Your probability assessment function here:
    # Without data on both sides there is nothing to compare.
    if not (va_data_set and ra_data):
        return 0.0

    if ra_data[0]['value'] in va_data_set:
        return 10000.0

    bconfig.LOGGER.log(25, "|-> IID parity detected"
                           " -> Impossibility of author equality")
    return 1.0
예제 #3
0
def compare_va_to_ra(va_id, ra_id):
    '''
    Template comparison of the data of a virtual author with the data of
    a real author.

    The certainty is derived from the number of "module_tag" entries (n)
    stored for the real author: 1 - e^(-0.8 * n^0.7).

    @param va_id: Virtual author ID
    @type va_id: int
    @param ra_id: Real author ID
    @type ra_id: int

    @return: the probability of the virtual author belonging to the real
        author; 0 when both data sets are empty
    @rtype: float
    '''
    bconfig.LOGGER.info("|-> Start of data comparison (va %s : ra %s)"
                        % (va_id, ra_id))

    ra_data = get_realauthor_data(ra_id, "module_tag")
    va_data_set = get_information_from_dataset(va_id)

    if (len(ra_data) == 0) and (len(va_data_set) == 0):
        bconfig.LOGGER.info("|-> End of data comparison (Sets empty)")
        return 0

    # Number of entries known for the real author (placeholder measure).
    parity = len(ra_data)

    # Your probability assessment function here:
    # Use the count directly: len() of an int raises a TypeError.
    certainty = 1 - exp(-.8 * pow(parity, .7))

    bconfig.LOGGER.info("|--> Found %s matching information out of %s "
                        "on the paper. Result: %s%% similarity"
                        % (parity, len(va_data_set), certainty))

    return certainty
def compare_va_to_ra(va_id, ra_id):
    '''
    Tests whether a "bibrec_id" record of the virtual author is already
    among the "bibrec_id" records attributed to the real author.

    Due to the configuration of this function in the configuration file,
    a parity of the papers will nullify the entire calculation.

    @param va_id: ID of the virtual author
    @type va_id: int
    @param ra_id: ID of the real author
    @type ra_id: int

    @return: 1.0 if at least one record is shared, 0.0 otherwise.
    @rtype: float
    '''
    va_rows = get_virtualauthor_records(va_id, "bibrec_id")
    ra_rows = get_realauthor_data(ra_id, "bibrec_id")

    va_papers = [row['value'] for row in va_rows]
    ra_papers = [row['value'] for row in ra_rows]

    # Count every paper of the virtual author the real author also has.
    shared = sum(1 for paper in va_papers if paper in ra_papers)

    if not shared:
        return 0.0

    bconfig.LOGGER.warn("|-> Paper parity detected"
                        " -> Impossibility of author equality")
    return 1.0
def compare_va_to_ra(va_id, ra_id):
    '''
    Looks for papers ("bibrec_id" records) that the virtual author and
    the real author have in common.

    Due to the configuration of this function in the configuration file,
    a parity of the papers will nullify the entire calculation.

    @param va_id: ID of the virtual author
    @type va_id: int
    @param ra_id: ID of the real author
    @type ra_id: int

    @return: 1.0 when a common paper exists, 0.0 when none does.
    @rtype: float
    '''
    raw_va = get_virtualauthor_records(va_id, "bibrec_id")
    raw_ra = get_realauthor_data(ra_id, "bibrec_id")

    va_values = [entry['value'] for entry in raw_va]
    ra_values = [entry['value'] for entry in raw_ra]

    # Papers of the virtual author that are already assigned to the
    # real author.
    common = [value for value in va_values if value in ra_values]

    if len(common) > 0:
        bconfig.LOGGER.warn("|-> Paper parity detected -> "
                            "Impossibility of author equality")
        return 1.0

    return 0.0
예제 #6
0
def compare_va_to_ra(va_id, ra_id):
    '''
    Compares the citations of a virtual author with the outgoing
    citations stored for a real author.

    Two ratios are computed and the larger one is used:
      - the Jaccard similarity of both citation sets
      - the share of the virtual author's citations that are among the
        real author's "top cites" (entries with a va_count above 1)
    A ratio of 0.14 or less yields 0.0; anything above is raised to
    sqrt(ratio) + 0.1 and capped at 1.0.

    @param va_id: Virtual author ID
    @type va_id: int
    @param ra_id: Real author ID
    @type ra_id: int

    @return: the probability of the virtual author belonging to the real
        author; 0.0 if a citation value cannot be converted to int or if
        both sides are empty
    @rtype: float
    '''
    bconfig.LOGGER.info("|-> Start of citation comparison (va %s : ra %s)" %
                        (va_id, ra_id))

    ra_cites = get_realauthor_data(ra_id, "outgoing_citation")
    ra_cites_set = set()
    ra_topcites_set = set()

    # Both sets are built in a single pass: the top cites are a subset of
    # the cites, so a separate conversion could never fail on its own.
    try:
        for row in ra_cites:
            cite = int(row['value'])
            ra_cites_set.add(cite)

            if row['va_count'] > 1:
                ra_topcites_set.add(cite)
    except ValueError:
        bconfig.LOGGER.exception("A str to int conversion error occurred "
                                 "while processing the cites list.")
        return 0.0

    va_cites_set = set(get_information_from_dataset(va_id))
    va_cites_len = len(va_cites_set)

    if (not ra_cites) and (not va_cites_len):
        bconfig.LOGGER.info("|-> End of cite comparison (Sets empty)")
        return 0.0

    total_parity = len(ra_cites_set.intersection(va_cites_set))
    total_union = len(ra_cites_set.union(va_cites_set))
    topcite_parity = len(ra_topcites_set.intersection(va_cites_set))
    jaccard_similarity = 0.0
    va_to_ra_topcites_ratio = 0.0

    if total_union > 0:
        jaccard_similarity = float(total_parity) / float(total_union)

    if va_cites_len > 0:
        va_to_ra_topcites_ratio = float(topcite_parity) / float(va_cites_len)

    certainty = max(jaccard_similarity, va_to_ra_topcites_ratio)

    if certainty > 0.14:
        # Cap here so that the logged value equals the returned one.
        certainty = min(sqrt(certainty) + 0.1, 1.0)
    else:
        certainty = 0.0

    if jaccard_similarity >= va_to_ra_topcites_ratio:
        bconfig.LOGGER.info("|--> Found %s matching cites out of %s "
                            "on the paper. Result: %s%% similarity" %
                            (total_parity, va_cites_len, certainty))
    else:
        bconfig.LOGGER.info("|--> Found %s matching top cites out of %s "
                            "on the paper. Result: %s%% similarity" %
                            (topcite_parity, va_cites_len, certainty))

    return certainty
def compare_va_to_ra(va_id, ra_id):
    '''
    Scores the overlap between the coauthors of a virtual author and the
    coauthors of a real author.

    If the virtual author has more coauthors than MAX_COAUTHORS, only the
    hash of the whole coauthor set is searched among the real author's
    coauthor entries. Otherwise the unified coauthor names of both sides
    are intersected; a shared name containing "ollaboration" is treated
    as a certain match.

    @param va_id: Virtual author ID
    @type va_id: int
    @param ra_id: Real author ID
    @type ra_id: int

    @return: 1.0 for a hash or collaboration match; 0 if both sides are
        empty, the hash is not found or the virtual author has no
        coauthors; otherwise 1 - e^(-0.8 * n^0.7) for n shared coauthors
    @rtype: float
    '''
    bconfig.LOGGER.info(
        "|-> Start of coauthorship comparison (va %s : ra %s)"
        % (va_id, ra_id))

    ra_rows = get_realauthor_data(ra_id, "coauthor")
    va_names = get_information_from_dataset(va_id)

    # The limit is taken from the module-level MAX_COAUTHORS constant.
    limit = MAX_COAUTHORS

    if not (len(ra_rows) or len(va_names)):
        bconfig.LOGGER.info("|-> End of coauthorship comparison (Sets empty)")
        return 0

    if len(va_names) > limit:
        bconfig.LOGGER.info("|--> Many coauthors found. Will try hash"
                            " values for collaboration testing.")
        wanted = str(hash_coauthor_set(va_names))

        for row in ra_rows:
            if row['value'].split(";;")[1] == wanted:
                bconfig.LOGGER.info("|---> Hash found! Assuming "
                                    "collaboration attachment.")
                return 1.0

        bconfig.LOGGER.info("|---> Hash NOT found. Skipping metric.")
        return 0

    # The second ";;"-separated field of each real author entry is what
    # gets compared with the unified names of the virtual author.
    ra_names = set(row['value'].split(";;")[1] for row in ra_rows)
    va_unified = set(create_unified_name(name.lower()) for name in va_names)
    shared = ra_names.intersection(va_unified)

    for name in shared:
        if "ollaboration" in name:
            bconfig.LOGGER.info("|--> Found matching collaboration: %s" %
                                (name))
            return 1.0

    if len(va_names) > 0:
        certainty = 1 - exp(-.8 * pow(len(shared), .7))
    else:
        certainty = 0

    bconfig.LOGGER.info("|--> Found %s matching coauthors out of %s "
                        "on the paper. Result: %s%% similarity" %
                        (len(shared), len(va_names), certainty))

    return certainty
def compare_va_to_ra(va_id, ra_id):
    '''
    Compares the affiliation of a virtual author with all the affiliations of
    a real author.

    Distribution of probabilities for the time delta: e^(-0.05x^.7)
    Where x is the difference of the dates in month.

    Every affiliation both authors share scores 1 (0 for the "None"
    placeholder). For each shared, known affiliation all date pairs are
    scored with the distribution above. The result is the mean of the
    affiliation scores plus the mean of the date scores, divided by 2.

    @param va_id: Virtual author ID
    @type va_id: int
    @param ra_id: Real author ID
    @type ra_id: int

    @return: the probability of the virtual author belonging to the real
        author; 0 if both sides are empty or share no affiliation
    @rtype: float
    '''

    bconfig.LOGGER.info("|-> Start of affiliation comparison (va %s : ra %s)" %
                        (va_id, ra_id))

    ra_affiliation_data = get_realauthor_data(ra_id, "affiliation")
    va_affiliation_data = get_information_from_dataset(va_id)

    if (len(ra_affiliation_data) == 0) and (len(va_affiliation_data) == 0):
        bconfig.LOGGER.info("|-> End of affiliation comparison")
        return 0

    # Affiliation name -> list of date strings. Entries are ";;"-separated
    # with the date in field 0 and the affiliation name in field 2.
    ra_dict = dict()
    va_dict = dict()

    for ra_affiliation in ra_affiliation_data:
        ra_data = ra_affiliation['value'].split(";;")
        # setdefault() replaces dict.has_key(), which Python 3 removed.
        ra_dict.setdefault(ra_data[2], []).append(ra_data[0])

    for va_affiliation in va_affiliation_data:
        va_data = va_affiliation.split(";;")
        va_dict.setdefault(va_data[2], []).append(va_data[0])

    aff_common = set(ra_dict.keys()).intersection(set(va_dict.keys()))

    aff_date_common = set([y for i in ra_dict.values()
                           for y in i]).intersection(
                               set([x for j in va_dict.values() for x in j]))

    aff_p = []
    # The 0 placeholder keeps the date average computable when no date
    # pair can be scored.
    aff_date_p = [0]

    if len(aff_common) > 0:
        for aff in aff_common:
            if aff == "None":
                bconfig.LOGGER.info(
                    "|--> Only 'unknown' affiliation in common." +
                    " Doesn't help => skip.")
                aff_p.append(0)
                continue

            bconfig.LOGGER.info("|--> Nice: va in ra")
            aff_p.append(1)

            if len(aff_date_common) > 0:
                bconfig.LOGGER.info(
                    "|---> Date Matches found: %s => "
                    "Horray?! Dates: %s ... " %
                    (len(aff_date_common), aff_date_common))

            for ra_date in ra_dict[aff]:
                for va_date in va_dict[aff]:
                    rdate = ra_date.split("-")
                    vdate = va_date.split("-")

                    if rdate[0] == '0000' or vdate[0] == '0000':
                        bconfig.LOGGER.debug("|---> Date delta not "
                                             "computable")
                        continue

                    ryear = str_to_int(rdate[0])
                    rmonth = str_to_int(rdate[1])
                    vyear = str_to_int(vdate[0])
                    vmonth = str_to_int(vdate[1])

                    # A month of 0 is replaced by 10 so that date() gets
                    # a valid month.
                    if rmonth == 0:
                        rmonth = 10

                    if vmonth == 0:
                        vmonth = 10

                    time_delta = (date(ryear, rmonth, 1) -
                                  date(vyear, vmonth, 1))
                    # Explicit floor division keeps the integer result of
                    # Python 2's "/" when running on Python 3.
                    date_delta = abs(time_delta.days // 30)

                    if date_delta <= 600:
                        # NOTE(review): the placeholder is only dropped
                        # while it is the sole entry, so an earlier
                        # "too high" 0 keeps it in the average -- confirm
                        # this is intended.
                        if ((len(aff_date_p) == 1)
                                and (aff_date_p[0] == 0)):
                            aff_date_p.pop()

                        result = exp(-.05 * pow(date_delta, .7))
                        bconfig.LOGGER.debug("|---> Delta: %s => "
                                             "Result: %s" %
                                             (date_delta, result))
                        aff_date_p.append(result)
                    else:
                        bconfig.LOGGER.debug("|---> Date delta "
                                             "too high.")
                        aff_date_p.append(0)

        probability = ((float(sum(aff_p)) / len(aff_p)) +
                       (float(sum(aff_date_p)) / len(aff_date_p)))
    else:
        bconfig.LOGGER.info("|--> No affiliation in common.")
        probability = 0

    bconfig.LOGGER.info("|--> Affiliation comparison result: %s" %
                        (probability / 2))
    bconfig.LOGGER.info("|-> End of affiliation comparison")

    return probability / 2
def compare_va_to_ra(va_id, ra_id):
    '''
    Compares the affiliation of a virtual author with all the affiliations of
    a real author.

    Distribution of probabilities for the time delta: e^(-0.05x^.7)
    Where x is the difference of the dates in month.

    The returned value is half the sum of two averages: the average
    affiliation score (1 per shared affiliation, 0 for the shared "None"
    placeholder) and the average date score of all date pairs of the
    shared, known affiliations.

    @param va_id: Virtual author ID
    @type va_id: int
    @param ra_id: Real author ID
    @type ra_id: int

    @return: the probability of the virtual author belonging to the real
        author; 0 if both sides are empty or share no affiliation
    @rtype: float
    '''

    bconfig.LOGGER.info("|-> Start of affiliation comparison (va %s : ra %s)"
                        % (va_id, ra_id))

    ra_affiliation_data = get_realauthor_data(ra_id, "affiliation")
    va_affiliation_data = get_information_from_dataset(va_id)

    if (len(ra_affiliation_data) == 0) and (len(va_affiliation_data) == 0):
        bconfig.LOGGER.info("|-> End of affiliation comparison")
        return 0

    # Both dicts map an affiliation name (";;"-field 2) to the list of
    # date strings (";;"-field 0) recorded for it.
    ra_dict = dict()
    va_dict = dict()

    for ra_affiliation in ra_affiliation_data:
        ra_data = ra_affiliation['value'].split(";;")
        # dict.has_key() is gone in Python 3; setdefault() works in both.
        ra_dict.setdefault(ra_data[2], []).append(ra_data[0])

    for va_affiliation in va_affiliation_data:
        va_data = va_affiliation.split(";;")
        va_dict.setdefault(va_data[2], []).append(va_data[0])

    aff_common = set(ra_dict.keys()).intersection(set(va_dict.keys()))

    aff_date_common = set([y for i in ra_dict.values()
                           for y in i]).intersection(set([x for j in
                                                          va_dict.values()
                                                          for x in j]))

    aff_p = []
    # Placeholder score so that the date average can always be computed.
    aff_date_p = [0]

    if len(aff_common) > 0:
        for aff in aff_common:
            if aff == "None":
                bconfig.LOGGER.info("|--> Only 'unknown' affiliation in common."
                                    + " Doesn't help => skip.")
                aff_p.append(0)
                continue

            bconfig.LOGGER.info("|--> Nice: va in ra")
            aff_p.append(1)

            if len(aff_date_common) > 0:
                bconfig.LOGGER.info("|---> Date Matches found: %s => "
                                    "Horray?! Dates: %s ... "
                                    % (len(aff_date_common), aff_date_common))

            for ra_date in ra_dict[aff]:
                for va_date in va_dict[aff]:
                    rdate = ra_date.split("-")
                    vdate = va_date.split("-")

                    if rdate[0] == '0000' or vdate[0] == '0000':
                        bconfig.LOGGER.debug("|---> Date delta not "
                                             "computable")
                        continue

                    ryear = str_to_int(rdate[0])
                    rmonth = str_to_int(rdate[1])
                    vyear = str_to_int(vdate[0])
                    vmonth = str_to_int(vdate[1])

                    # date() rejects month 0, so 10 is substituted.
                    if rmonth == 0:
                        rmonth = 10

                    if vmonth == 0:
                        vmonth = 10

                    time_delta = (date(ryear, rmonth, 1) -
                                  date(vyear, vmonth, 1))
                    # "//" preserves the integer division Python 2
                    # performed for "/" and behaves the same on Python 3.
                    date_delta = abs(time_delta.days // 30)

                    if date_delta <= 600:
                        # NOTE(review): the placeholder is removed only
                        # while it is the single entry; after a
                        # "too high" 0 it stays in the average -- confirm
                        # this is intended.
                        if ((len(aff_date_p) == 1)
                                and (aff_date_p[0] == 0)):
                            aff_date_p.pop()

                        result = exp(-.05 * pow(date_delta, .7))
                        bconfig.LOGGER.debug("|---> Delta: %s => "
                                             "Result: %s"
                                             % (date_delta, result))
                        aff_date_p.append(result)
                    else:
                        bconfig.LOGGER.debug("|---> Date delta "
                                             "too high.")
                        aff_date_p.append(0)

        probability = ((float(sum(aff_p)) / len(aff_p))
                       + (float(sum(aff_date_p)) / len(aff_date_p)))
    else:
        bconfig.LOGGER.info("|--> No affiliation in common.")
        probability = 0

    bconfig.LOGGER.info("|--> Affiliation comparison result: %s"
                        % (probability / 2))
    bconfig.LOGGER.info("|-> End of affiliation comparison")

    return probability / 2
예제 #10
0
def compare_va_to_ra(va_id, ra_id):
    """
    Compares the citations of a virtual author with the outgoing
    citations stored for a real author.

    The larger of the following two ratios is used:
      - the Jaccard similarity of both citation sets
      - the share of the virtual author's citations found among the real
        author's "top cites" (entries with a va_count above 1)
    Ratios of 0.14 or less count as 0.0; larger ones become
    sqrt(ratio) + 0.1, capped at 1.0.

    @param va_id: Virtual author ID
    @type va_id: int
    @param ra_id: Real author ID
    @type ra_id: int

    @return: the probability of the virtual author belonging to the real
        author; 0.0 if a citation value cannot be converted to int or if
        both sides are empty
    @rtype: float
    """
    bconfig.LOGGER.info("|-> Start of citation comparison (va %s : ra %s)" % (va_id, ra_id))

    ra_cites = get_realauthor_data(ra_id, "outgoing_citation")
    ra_cites_set = set()
    ra_topcites_set = set()

    # One pass fills both sets; the top cites are a subset of the cites,
    # so converting them separately could never raise a new ValueError.
    try:
        for row in ra_cites:
            cite = int(row["value"])
            ra_cites_set.add(cite)

            if row["va_count"] > 1:
                ra_topcites_set.add(cite)
    except ValueError:
        bconfig.LOGGER.exception("A str to int conversion error occurred while processing the cites list.")
        return 0.0

    va_cites_set = set(get_information_from_dataset(va_id))
    va_cites_len = len(va_cites_set)

    if (not ra_cites) and (not va_cites_len):
        bconfig.LOGGER.info("|-> End of cite comparison (Sets empty)")
        return 0.0

    total_parity = len(ra_cites_set.intersection(va_cites_set))
    total_union = len(ra_cites_set.union(va_cites_set))
    topcite_parity = len(ra_topcites_set.intersection(va_cites_set))
    jaccard_similarity = 0.0
    va_to_ra_topcites_ratio = 0.0

    if total_union > 0:
        jaccard_similarity = float(total_parity) / float(total_union)

    if va_cites_len > 0:
        va_to_ra_topcites_ratio = float(topcite_parity) / float(va_cites_len)

    certainty = max(jaccard_similarity, va_to_ra_topcites_ratio)

    if certainty > 0.14:
        # Capped before logging so the log shows the value that is returned.
        certainty = min(sqrt(certainty) + 0.1, 1.0)
    else:
        certainty = 0.0

    if jaccard_similarity >= va_to_ra_topcites_ratio:
        bconfig.LOGGER.info(
            "|--> Found %s matching cites out of %s "
            "on the paper. Result: %s%% similarity" % (total_parity, va_cites_len, certainty)
        )
    else:
        bconfig.LOGGER.info(
            "|--> Found %s matching top cites out of %s "
            "on the paper. Result: %s%% similarity" % (topcite_parity, va_cites_len, certainty)
        )

    return certainty
def compare_va_to_ra(va_id, ra_id):
    '''
    Compares the coauthors of a virtual author with the coauthors of a
    real author.

    A virtual author with more coauthors than MAX_COAUTHORS is only
    matched through the hash of its coauthor set. In all other cases the
    unified names are intersected and a shared name that contains
    "ollaboration" is taken as a certain match.

    @param va_id: Virtual author ID
    @type va_id: int
    @param ra_id: Real author ID
    @type ra_id: int

    @return: 1.0 for a hash or collaboration match; 0 if both sides are
        empty, the hash is not found or the virtual author has no
        coauthors; otherwise 1 - e^(-0.8 * n^0.7) for n shared coauthors
    @rtype: float
    '''
    bconfig.LOGGER.info("|-> Start of coauthorship comparison (va %s : ra %s)"
                        % (va_id, ra_id))

    real_entries = get_realauthor_data(ra_id, "coauthor")
    virtual_coauthors = get_information_from_dataset(va_id)

    # Threshold above which only the hash of the coauthor set is compared.
    coauthor_limit = MAX_COAUTHORS

    both_empty = (len(real_entries) == 0) and (len(virtual_coauthors) == 0)

    if both_empty:
        bconfig.LOGGER.info("|-> End of coauthorship comparison (Sets empty)")
        return 0

    if len(virtual_coauthors) > coauthor_limit:
        bconfig.LOGGER.info("|--> Many coauthors found. Will try hash "
                            "values for collaboration testing.")
        set_hash = str(hash_coauthor_set(virtual_coauthors))

        for entry in real_entries:
            if entry['value'].split(";;")[1] == set_hash:
                bconfig.LOGGER.info("|---> Hash found! Assuming "
                                    "collaboration attachment.")
                return 1.0

        bconfig.LOGGER.info("|---> Hash NOT found. Skipping metric.")
        return 0

    # Second ";;"-separated field of every real author entry.
    real_names = set()

    for entry in real_entries:
        fields = entry['value'].split(";;")
        real_names.add(fields[1])

    # Virtual author names are lower-cased and unified before comparing.
    virtual_names = set()

    for raw_name in virtual_coauthors:
        virtual_names.add(create_unified_name(raw_name.lower()))

    matches = real_names.intersection(virtual_names)

    for match in matches:
        if "ollaboration" in match:
            bconfig.LOGGER.info("|--> Found matching collaboration: %s"
                                % (match))
            return 1.0

    certainty = 0

    if len(virtual_coauthors) > 0:
        certainty = 1 - exp(-.8 * pow(len(matches), .7))

    bconfig.LOGGER.info("|--> Found %s matching coauthors out of %s "
                        "on the paper. Result: %s%% similarity"
                        % (len(matches), len(virtual_coauthors), certainty))

    return certainty