def find_and_process_updates(process_initials):
    """
    Drains the virtual author process queue and hands every eligible
    virtual author over to add_virtualauthor.

    The queue is filled through init_va_process_queue if it is empty on
    entry. For each processed ID the 'updated' record is deleted before the
    virtual author is inserted.

    @param process_initials: If True, every queued virtual author is
        processed. If False, only those whose original name string yields
        a non-empty third element from split_name_parts are processed; the
        others are taken off the queue and left untouched.
    @type process_initials: boolean
    """
    if dat.VIRTUALAUTHOR_PROCESS_QUEUE.empty():
        init_va_process_queue()

    # The queue is always addressed through 'dat' so that a queue object
    # replaced by the initialisation is picked up as well.
    while not dat.VIRTUALAUTHOR_PROCESS_QUEUE.empty():
        va_id = dat.VIRTUALAUTHOR_PROCESS_QUEUE.get()
        name_records = bibauthorid_virtualauthor_utils.get_virtualauthor_records(
            va_id, tag="orig_name_string"
        )
        va_name = name_records[0]["value"]

        # The short-circuit avoids splitting the name when everything is
        # to be processed anyway.
        if not (process_initials or bibauthorid_utils.split_name_parts(va_name)[2]):
            continue

        bibauthorid_virtualauthor_utils.delete_virtualauthor_record(va_id, "updated")
        bconfig.LOGGER.log(25, "|> Inserting VA: %s Orig. name: %s" % (va_id, va_name))
        add_virtualauthor(va_id)

    bconfig.LOGGER.debug("Empty Queue. Job finished. Nothing to do.")
def find_and_process_orphans(iterations=1):
    """
    Runs several rounds over the orphaned virtual authors and inserts each
    queued one through add_virtualauthor.

    At the start of every round the process queue is filled in 'orphaned'
    mode if it is empty. Attaching to multiple real authors is only
    requested in the last round, and only if
    bconfig.ATTACH_VA_TO_MULTIPLE_RAS is set.

    @param iterations: Number of rounds to do this processing
    @type iterations: int
    """
    last_round = iterations - 1
    multi_attach = False

    for iteration in xrange(iterations):
        if dat.VIRTUALAUTHOR_PROCESS_QUEUE.empty():
            init_va_process_queue(mode="orphaned")

        while not dat.VIRTUALAUTHOR_PROCESS_QUEUE.empty():
            va_id = dat.VIRTUALAUTHOR_PROCESS_QUEUE.get()
            name_records = bibauthorid_virtualauthor_utils.get_virtualauthor_records(
                va_id, tag="orig_name_string"
            )
            va_name = name_records[0]["value"]
            bconfig.LOGGER.log(25, "|> Inserting orphaned VA: %s Name: %s" % (va_id, va_name))

            if bconfig.ATTACH_VA_TO_MULTIPLE_RAS and iteration == last_round:
                multi_attach = True

            add_virtualauthor(va_id, multi_attach)

        bconfig.LOGGER.debug("Empty Queue. Job finished. Nothing to do.")
Exemple #3
0
def compare_va_to_ra(va_id, ra_id):
    '''
    Compares the origin name of a virtual author against the name list of
    a real author and averages the results of the single comparisons.

    @param va_id: ID of the virtual author
    @type va_id: int
    @param ra_id: ID of the real author
    @type ra_id: int

    @return: The probability resulting from the name comparison. 0.0 is
        returned if no name can be resolved for the virtual author, if the
        real author has no names to compare against or if the average of
        the comparisons is below 0.1.
    @rtype: float
    '''

    bconfig.LOGGER.info("|-> Start of name comparison (va %s : ra %s)"
                  % (va_id, ra_id))

    ra_names = get_realauthor_names_from_set(ra_id)
    va_nameid_recs = get_virtualauthor_records(va_id, tag='orig_authorname_id')

    # -1 is handed to the name lookup if no name ID is stored for the VA
    authorname_id = -1
    if va_nameid_recs:
        authorname_id = va_nameid_recs[0]['value']

    authorname_strings = get_name_and_db_name_strings(authorname_id)
    va_name = authorname_strings["name"]

    if not va_name:
        return 0.0

    comparisons = []

    for ra_name in ra_names:
        comparison = compare_names(va_name, ra_name)
        bconfig.LOGGER.info("|-> %s & %s -> %s"
                            % (va_name, ra_name, comparison))
        comparisons.append(comparison)

    bconfig.LOGGER.debug("|--> Name comparisons: %s" % (comparisons))
    bconfig.LOGGER.info("|-> End of name comparison")

    if comparisons:
        ret = float(sum(comparisons)) / len(comparisons)
    else:
        # A real author without names leaves nothing to average; dividing
        # by the empty length would raise a ZeroDivisionError.
        ret = 0.0

    # Averages below 0.1 are cut off to zero
    if ret < .1:
        ret = 0.0

    bconfig.LOGGER.info("|--> Resulting name probability: %s" % (ret))

    return ret
def compare_va_to_ra(va_id, ra_id):
    """
    Compares the origin name of a virtual author against the name list of
    a real author and averages the results of the single comparisons.

    @param va_id: ID of the virtual author
    @type va_id: int
    @param ra_id: ID of the real author
    @type ra_id: int

    @return: The probability resulting from the name comparison. 0.0 is
        returned if no name can be resolved for the virtual author, if the
        real author has no names to compare against or if the average of
        the comparisons is below 0.1.
    @rtype: float
    """

    bconfig.LOGGER.info("|-> Start of name comparison (va %s : ra %s)" % (va_id, ra_id))

    ra_names = get_realauthor_names_from_set(ra_id)
    va_nameid_recs = get_virtualauthor_records(va_id, tag="orig_authorname_id")

    # -1 is handed to the name lookup if no name ID is stored for the VA
    authorname_id = -1
    if va_nameid_recs:
        authorname_id = va_nameid_recs[0]["value"]

    authorname_strings = get_name_and_db_name_strings(authorname_id)
    va_name = authorname_strings["name"]

    if not va_name:
        return 0.0

    comparisons = []

    for ra_name in ra_names:
        comparison = compare_names(va_name, ra_name)
        bconfig.LOGGER.info("|-> %s & %s -> %s" % (va_name, ra_name, comparison))
        comparisons.append(comparison)

    bconfig.LOGGER.debug("|--> Name comparisons: %s" % (comparisons))
    bconfig.LOGGER.info("|-> End of name comparison")

    if comparisons:
        ret = float(sum(comparisons)) / len(comparisons)
    else:
        # A real author without names leaves nothing to average; dividing
        # by the empty length would raise a ZeroDivisionError.
        ret = 0.0

    # Averages below 0.1 are cut off to zero
    if ret < 0.1:
        ret = 0.0

    bconfig.LOGGER.info("|--> Resulting name probability: %s" % (ret))

    return ret
def get_information_from_dataset(va_id, ra_id=-1):
    '''
    Retrieves the value that is stored as 'inspireid' for a virtual author
    from the data set.

    If the runtime configuration flag 'populate_aid_from_personid' is set,
    nothing is looked up and True is returned right away.

    Depending on the real author ID, the value is either written to the
    real author holding this ID or, for the default of '-1', returned.

    @param va_id: Virtual author ID to get the information from
    @type va_id: int
    @param ra_id: Real author ID to set information for.
    @type ra_id: int

    @return: True, if the configuration flag is set or ra_id > -1.
        Otherwise the first value found, or the empty lookup result if
        there is none.
    @rtype: True or the looked up value
    '''
    if dat.RUNTIME_CONFIG["populate_aid_from_personid"]:
        return True

    bibrec_id = ""
    authorname_id = ""

    # Later records overwrite earlier ones carrying the same tag
    for record in get_virtualauthor_records(va_id):
        tag = record['tag']

        if tag == "bibrec_id":
            bibrec_id = record['value']
        elif tag == "orig_authorname_id":
            authorname_id = record['value']

    authorname_strings = get_name_and_db_name_strings(authorname_id)

    bconfig.LOGGER.info("| Reading info for va %s: %s recid %s"
                        % (va_id, authorname_strings["name"], bibrec_id))

    inspireid = get_field_values_on_condition(
        bibrec_id, ['100', '700'], 'i', 'a',
        authorname_strings["db_name"], "==")

    # Only the first value of a non-empty result is of interest
    if inspireid:
        inspireid = list(inspireid)[0]

    if ra_id > -1:
        if inspireid:
            set_realauthor_data(ra_id, "inspireid", "%s" % inspireid)

        return True

    return inspireid
Exemple #6
0
def create_realauthors_from_orphans():
    '''
    Fetches all orphaned virtual authors and passes every one of them to
    add_virtualauthor, logging ID and original name string beforehand.
    '''
    va_utils = bibauthorid_virtualauthor_utils
    va_list = va_utils.get_orphan_virtualauthors()

    for va_entry in va_list:
        va_id = va_entry['virtualauthorid']
        name_records = va_utils.get_virtualauthor_records(
            va_id, tag='orig_name_string')
        message = "INSERTING VA %s Name: %s" % (va_id,
                                                name_records[0]['value'])
        bconfig.LOGGER.log(25, message)
        add_virtualauthor(va_id)

    bconfig.LOGGER.debug("va_list lengtht: %s" % (len(va_list)))
def get_information_from_dataset(va_id, ra_id=-1):
    '''
    Retrieves the 'cites' values for the record of a virtual author from
    the data set.

    If the runtime configuration flag 'populate_aid_from_personid' is set,
    nothing is looked up and True is returned right away.

    Depending on the real author ID, every value is either written to the
    real author holding this ID as 'outgoing_citation' or, for the default
    of '-1', the lookup result is returned.

    @param va_id: Virtual author ID to get the information from
    @type va_id: int
    @param ra_id: Real author ID to set information for.
    @type ra_id: int

    @return: True, if the configuration flag is set or ra_id > -1.
        Otherwise the result of the 'cites' lookup.
    @rtype: True or the lookup result
    '''
    if dat.RUNTIME_CONFIG["populate_aid_from_personid"]:
        return True

    authorname_id = -1
    bibrec_id = ""

    # Later records overwrite earlier ones carrying the same tag
    for record in get_virtualauthor_records(va_id):
        tag = record['tag']

        if tag == "bibrec_id":
            bibrec_id = record['value']
        elif tag == "orig_authorname_id":
            authorname_id = record['value']

    bconfig.LOGGER.info("| Reading citation info for va %s: %s recid %s"
                        % (va_id, authorname_id, bibrec_id))

    cites = get_field_values_on_condition(bibrec_id, 'cites')

    if ra_id > -1:
        # An empty result leaves nothing to write
        for cite in cites or ():
            set_realauthor_data(ra_id, "outgoing_citation", "%s" % (cite))

        return True

    return cites
def get_information_from_dataset(va_id, ra_id=-1):
    '''
    Retrieves information about the data
    of a virtual author from the data set.

    In dependency of the real author ID, the information will be written to the
    real author holding this ID. If the real author ID should be the default
    '-1', the data found in the data set will be returned.

    @param va_id: Virtual author ID to get the information from
    @type va_id: int
    @param ra_id: Real author ID to set information for.
    @type ra_id: int

    @return: True, if ra_id is set OR the data that has been looked up
    @rtype: True if ra_id > -1 or the lookup result
    '''
    va_data = get_virtualauthor_records(va_id)
    bibrec_id = ""
    # Default for virtual authors without an 'orig_authorname_id' record;
    # without it the name lookup below would fail on an unbound variable.
    authorname_id = -1

    for va_data_item in va_data:
        if va_data_item['tag'] == "bibrec_id":
            bibrec_id = va_data_item['value']
        elif va_data_item['tag'] == "orig_authorname_id":
            authorname_id = va_data_item['value']

    authorname_strings = get_name_and_db_name_strings(authorname_id)

    bconfig.LOGGER.info("| Reading info for va %s: %s recid %s"
                  % (va_id, authorname_strings["name"], bibrec_id))

    data = get_field_values_on_condition(
        bibrec_id, ['100', '700'], 'a', 'a',
        authorname_strings["db_name"], "!=")

    if ra_id > -1:
        # Placeholder value: the looked up data is not written here
        formatted = "something"
        set_realauthor_data(ra_id, "module_tag", "module_value %s"
                            % (formatted))

        return True
    else:
        return data
Exemple #9
0
def get_information_from_dataset(va_id, ra_id=-1):
    '''
    Retrieves information about the data
    of a virtual author from the data set.

    In dependency of the real author ID, the information will be written to the
    real author holding this ID. If the real author ID should be the default
    '-1', the data found in the data set will be returned.

    @param va_id: Virtual author ID to get the information from
    @type va_id: int
    @param ra_id: Real author ID to set information for.
    @type ra_id: int

    @return: True, if ra_id is set OR the data that has been looked up
    @rtype: True if ra_id > -1 or the lookup result
    '''
    va_data = get_virtualauthor_records(va_id)
    bibrec_id = ""
    # Default for virtual authors without an 'orig_authorname_id' record;
    # without it the name lookup below would fail on an unbound variable.
    authorname_id = -1

    for va_data_item in va_data:
        if va_data_item['tag'] == "bibrec_id":
            bibrec_id = va_data_item['value']
        elif va_data_item['tag'] == "orig_authorname_id":
            authorname_id = va_data_item['value']

    authorname_strings = get_name_and_db_name_strings(authorname_id)

    bconfig.LOGGER.info("| Reading info for va %s: %s recid %s" %
                        (va_id, authorname_strings["name"], bibrec_id))

    data = get_field_values_on_condition(bibrec_id, ['100', '700'], 'a', 'a',
                                         authorname_strings["db_name"], "!=")

    if ra_id > -1:
        # Placeholder value: the looked up data is not written here
        formatted = "something"
        set_realauthor_data(ra_id, "module_tag",
                            "module_value %s" % (formatted))

        return True
    else:
        return data
def create_realauthors_from_orphans():
    """
    Fetches all orphaned virtual authors and passes every one of them to
    add_virtualauthor, logging ID and original name string beforehand.
    """
    va_utils = bibauthorid_virtualauthor_utils
    va_list = va_utils.get_orphan_virtualauthors()

    for va_entry in va_list:
        va_id = va_entry["virtualauthorid"]
        name_records = va_utils.get_virtualauthor_records(va_id, tag="orig_name_string")
        message = "INSERTING VA %s Name: %s" % (va_id, name_records[0]["value"])
        bconfig.LOGGER.log(25, message)
        add_virtualauthor(va_id)

    bconfig.LOGGER.debug("va_list lengtht: %s" % (len(va_list)))
def compare_va_to_ra(va_id, ra_id):
    '''
    Checks whether any 'bibrec_id' value of the virtual author is also
    among the 'bibrec_id' values of the real author. If so, the returned
    value will be 1--the highest probability. And 0 otherwise.

    Due to the configuration of this function in the configuration file,
    a parity of the papers will nullify the entire calculation.

    @param va_id: ID of the virtual author
    @type va_id: int
    @param ra_id: ID of the real author
    @type ra_id: int

    @return: The probability resulting from the paper equality comparison.
    @rtype: float
    '''
    va_records_raw = get_virtualauthor_records(va_id, "bibrec_id")
    ra_records_raw = get_realauthor_data(ra_id, "bibrec_id")

    va_records = [record['value'] for record in va_records_raw]
    ra_records = [record['value'] for record in ra_records_raw]

    # A single shared paper is enough to decide
    shares_paper = any(va_record in ra_records for va_record in va_records)

    if not shares_paper:
        return 0.0

    bconfig.LOGGER.warn(
        "|-> Paper parity detected -> Impossibility of author equality")
    return 1.0
def get_information_from_dataset(va_id, ra_id=-1):
    '''
    Retrieves the 'cites' values for the record of a virtual author from
    the data set.

    Depending on the real author ID, every value is either written to the
    real author holding this ID as 'outgoing_citation' or, for the default
    of '-1', the lookup result is returned.

    @param va_id: Virtual author ID to get the information from
    @type va_id: int
    @param ra_id: Real author ID to set information for.
    @type ra_id: int

    @return: True, if ra_id > -1. Otherwise the result of the 'cites'
        lookup.
    @rtype: True or the lookup result
    '''
    authorname_id = -1
    bibrec_id = ""

    # Later records overwrite earlier ones carrying the same tag
    for record in get_virtualauthor_records(va_id):
        tag = record['tag']

        if tag == "bibrec_id":
            bibrec_id = record['value']
        elif tag == "orig_authorname_id":
            authorname_id = record['value']

    bconfig.LOGGER.info("| Reading citation info for va %s: %s recid %s" %
                        (va_id, authorname_id, bibrec_id))

    cites = get_field_values_on_condition(bibrec_id, 'cites')

    if ra_id > -1:
        # An empty result leaves nothing to write
        for cite in cites or ():
            set_realauthor_data(ra_id, "outgoing_citation", "%s" % (cite))

        return True

    return cites
def compare_va_to_ra(va_id, ra_id):
    '''
    Checks whether any 'bibrec_id' value of the virtual author is also
    among the 'bibrec_id' values of the real author. If so, the returned
    value will be 1--the highest probability. And 0 otherwise.

    Due to the configuration of this function in the configuration file,
    a parity of the papers will nullify the entire calculation.

    @param va_id: ID of the virtual author
    @type va_id: int
    @param ra_id: ID of the real author
    @type ra_id: int

    @return: The probability resulting from the paper equality comparison.
    @rtype: float
    '''
    va_records_raw = get_virtualauthor_records(va_id, "bibrec_id")
    ra_records_raw = get_realauthor_data(ra_id, "bibrec_id")

    va_records = [record['value'] for record in va_records_raw]
    ra_records = [record['value'] for record in ra_records_raw]

    # A single shared paper is enough to decide
    shares_paper = any(va_record in ra_records for va_record in va_records)

    if not shares_paper:
        return 0.0

    bconfig.LOGGER.warn(
        "|-> Paper parity detected -> Impossibility of author equality")
    return 1.0
Exemple #14
0
def find_and_process_updates(process_initials):
    '''
    Drains the virtual author process queue and hands every eligible
    virtual author over to add_virtualauthor.

    The queue is filled through init_va_process_queue if it is empty on
    entry. For each processed ID the 'updated' record is deleted before the
    virtual author is inserted.

    @param process_initials: If True, every queued virtual author is
        processed. If False, only those whose original name string yields
        a non-empty third element from split_name_parts are processed; the
        others are taken off the queue and left untouched.
    @type process_initials: boolean
    '''
    if dat.VIRTUALAUTHOR_PROCESS_QUEUE.empty():
        init_va_process_queue()

    # The queue is always addressed through 'dat' so that a queue object
    # replaced by the initialisation is picked up as well.
    while not dat.VIRTUALAUTHOR_PROCESS_QUEUE.empty():
        va_id = dat.VIRTUALAUTHOR_PROCESS_QUEUE.get()
        name_records = (
            bibauthorid_virtualauthor_utils.get_virtualauthor_records(
                va_id, tag='orig_name_string'))
        va_name = name_records[0]['value']

        # The short-circuit avoids splitting the name when everything is
        # to be processed anyway.
        wanted = (process_initials
                  or bibauthorid_utils.split_name_parts(va_name)[2])

        if not wanted:
            continue

        bibauthorid_virtualauthor_utils.delete_virtualauthor_record(
            va_id, 'updated')
        bconfig.LOGGER.log(
            25, "|> Inserting VA: %s Orig. name: %s" % (va_id, va_name))
        add_virtualauthor(va_id)

    bconfig.LOGGER.debug("Empty Queue. Job finished. Nothing to do.")
Exemple #15
0
def find_and_process_orphans(iterations=1):
    '''
    Runs several rounds over the orphaned virtual authors and inserts each
    queued one through add_virtualauthor.

    At the start of every round the process queue is filled in 'orphaned'
    mode if it is empty. Attaching to multiple real authors is only
    requested in the last round, and only if
    bconfig.ATTACH_VA_TO_MULTIPLE_RAS is set.

    @param iterations: Number of rounds to do this processing
    @type iterations: int
    '''
    last_round = iterations - 1
    multi_attach = False

    for iteration in xrange(iterations):
        if dat.VIRTUALAUTHOR_PROCESS_QUEUE.empty():
            init_va_process_queue(mode="orphaned")

        while not dat.VIRTUALAUTHOR_PROCESS_QUEUE.empty():
            va_id = dat.VIRTUALAUTHOR_PROCESS_QUEUE.get()
            name_records = (
                bibauthorid_virtualauthor_utils.get_virtualauthor_records(
                    va_id, tag='orig_name_string'))
            va_name = name_records[0]['value']
            bconfig.LOGGER.log(
                25, "|> Inserting orphaned VA: %s Name: %s" % (va_id, va_name))

            if bconfig.ATTACH_VA_TO_MULTIPLE_RAS and iteration == last_round:
                multi_attach = True

            add_virtualauthor(va_id, multi_attach)

        bconfig.LOGGER.debug("Empty Queue. Job finished. Nothing to do.")
def add_virtualauthor(va_id, multi_va_to_ra=False, get_raid_from_personid_table=False):
    '''
    Adds a new virtual author to the real authors system:
    the idea is to search for possibly compatible real authors, then compare
    the compatibility of this virtual author with the selected real authors
    and add the new virtualauthor to the most compatible real author. In
    case we do not have a single most compatible real author, the virtual
    author is either attached to all of the equally compatible real authors
    or skipped, depending on multi_va_to_ra.

    A virtual author that is already attached to a real author is only
    marked as connected.

    @param va_id: Virtualauthor ID
    @type va_id: int
    @param multi_va_to_ra: If True, a virtual author that is equally
        compatible with several real authors is attached to all of them.
        If False, such a virtual author is skipped and marked as not
        connected.
    @type multi_va_to_ra: boolean
    @param get_raid_from_personid_table: If True, the real author is taken
        from pidu.get_personid_from_paper instead of being determined by
        compatibility comparisons.
    @type get_raid_from_personid_table: boolean
    '''
    addstart = time.time()
    adding_threshold = bconfig.REALAUTHOR_VA_ADD_THERSHOLD

    # A configured value of ["-1"] is replaced by the threshold 0.7
    if adding_threshold == ["-1"]:
        adding_threshold = 0.7

    already_existing = get_realauthors_by_virtuala_id(va_id)
    ralist = []

    if len(already_existing) <= 0:
        start = time.time()

        va_cluster = (bibauthorid_virtualauthor_utils.
                      get_cluster_va_ids_from_va_id(va_id))
        ralist_raw = []

        # Cache key for the candidate real authors of this cluster
        va_hash = hash(str(va_cluster))

        if not get_raid_from_personid_table:
            if va_hash in dat.RA_VA_CACHE:
                ralist_raw = dat.RA_VA_CACHE[va_hash]
                bconfig.LOGGER.debug("|-> Cache Hit for va cluster")
            else:
                bconfig.LOGGER.debug("|-> Cache Fail--Generating new hash")
                ralist_raw = update_ralist_cache(va_cluster, va_hash)

            # Unique real author IDs, leaving out entries of this very VA
            ralist = list(set(ids['ra_id'] for ids in ralist_raw
                              if ids['va_id'] != va_id))
        else:
            ralist = pidu.get_personid_from_paper(
                get_virtualauthor_records(va_id, tag="bibrefrecpair")[0]['value'])

            # NOTE(review): presumably a negative value means that no
            # person has been found for the paper -- confirm against pidu.
            if ralist < 0:
                update_ralist_cache(va_cluster, va_hash)
                return

            add_realauthor_va(ralist, va_id, 1)
            update_ralist_cache(va_cluster, va_hash)
            bconfig.LOGGER.log(25, "|-> Adding to real author #%s"
                               " with a compatability."
                               % (ralist))
            (bibauthorid_virtualauthor_utils.
             update_virtualauthor_record(va_id, 'connected', 'True'))
            (bibauthorid_virtualauthor_utils.
             delete_virtualauthor_record(va_id, 'updated'))
            return

        if len(ralist) > 0:
            # One compatibility value per candidate, in the order of ralist
            min_compatibilities = [cmp_virtual_to_real_author(va_id, ra_id)
                                   for ra_id in ralist]
            max_min_compatibilities = max(min_compatibilities)

            if max_min_compatibilities < adding_threshold:
                bconfig.LOGGER.log(25, "|-> Creating NEW real author for this"
                      + " virtual author (compatibility below adding threshold"
                      + " of other RAs).")
                create_new_realauthor(va_id)
                update_ralist_cache(va_cluster, va_hash)

            else:
                if min_compatibilities.count(max_min_compatibilities) == 1:
                    index = min_compatibilities.index(max_min_compatibilities)
                    add_realauthor_va(ralist[index], va_id,
                                      max_min_compatibilities)
                    bconfig.LOGGER.log(25, "|-> Adding to real author #%s"
                               " with a compatability of %.2f"
                               % (ralist[index], max_min_compatibilities))

                elif min_compatibilities.count(max_min_compatibilities) > 1:
                    if multi_va_to_ra:
                        bconfig.LOGGER.log(25, "|-> virtual author"
                                " comaptible with more than one realauthor.")

                        # Positions of all candidates sharing the maximum.
                        # Searching with list.index from every position
                        # raises a ValueError once no maximum is left
                        # behind the start position.
                        indexes = [
                            pos for pos, compatibility
                            in enumerate(min_compatibilities)
                            if compatibility == max_min_compatibilities]

                        bconfig.LOGGER.log(25, "|-> virtual author"
                                " will be attached to %s real authors"
                                % (len(indexes)))

                        for i in indexes:
                            add_realauthor_va(ralist[i], va_id,
                                      max_min_compatibilities)
                            bconfig.LOGGER.log(25, "|--> Adding to real author"
                               " #%s with a compatability of %.2f"
                               % (ralist[i], max_min_compatibilities))

                    else:
                        bconfig.LOGGER.log(25, "|-> virtual author"
                                " comaptible with more than one realauthor..."
                                "skipped for now.")
                        bconfig.LOGGER.log(25, "|> The (skipped) comparison "
                                  "with %s real authors took %.2fs" %
                                  (len(ralist), time.time() - start))
                        (bibauthorid_virtualauthor_utils.
                         update_virtualauthor_record(va_id, 'connected',
                                                     'False'))
                        (bibauthorid_virtualauthor_utils.
                         delete_virtualauthor_record(va_id, 'updated'))

                    # NOTE(review): this return also ends the multi-attach
                    # branch, so the 'connected'/'updated' bookkeeping at
                    # the end of the function is skipped there as well --
                    # confirm that this is intended.
                    return
        else:
            bconfig.LOGGER.log(25, "|-> Creating NEW real author for this"
                        " Virtual Author (currently, no real author exists)")
            create_new_realauthor(va_id)
            update_ralist_cache(va_cluster, va_hash)

    (bibauthorid_virtualauthor_utils.
     update_virtualauthor_record(va_id, 'connected', 'True'))
    (bibauthorid_virtualauthor_utils.
     delete_virtualauthor_record(va_id, 'updated'))

    bconfig.LOGGER.log(25, "|> The comparison with %s real authors took %.2fs"
                  % (len(ralist), time.time() - addstart))
def _update_authorid_universe():
    '''
    Updates all data related to the authorid algorithm.

    Sequence of operations:
        - Get all recently updated papers and remember time in the log
        - Get all authors on all papers
        - Extract collection of last names
        - For each last name:
            - Populate mem cache with cluster data
            - Delete updated records and their virtual authors from mem cache
            - Create virtual authors for new and updated records
            - Start matching algorithm
        - Update tables with results of the computation
        - Start personid update procedure
    '''
    def create_vas_from_specific_doclist(bibrec_ids):
        '''
        Processes the document list and creates a new minimal virtual author
        for each author in each record specified in the given list.

        @param bibrec_ids: Record IDs to concern in this update
        @type bibrec_ids: list of int
        '''
        num_docs = len(
            [row for row in dat.DOC_LIST if row['bibrecid'] in bibrec_ids])

        bconfig.LOGGER.log(
            25, "Creating minimal virtual authors for "
            "all loaded docs (%s)" % (num_docs))

        for docs in [
                row for row in dat.DOC_LIST if row['bibrecid'] in bibrec_ids
        ]:
            for author_id in docs['authornameids']:
                author_name = [
                    an['name'] for an in dat.AUTHOR_NAMES
                    if an['id'] == author_id
                ]
                refrecs = [
                    ref[1] for ref in docs['authornameid_bibrefrec']
                    if ref[0] == author_id
                ]
                refrec = -1

                if len(refrecs) > 1:
                    print "SCREEEEEEWWWWWWED!!! Several bibrefs on one paper?!"
                    refrec = refrecs[0]
                elif refrecs:
                    refrec = refrecs[0]

                if refrec and author_name:
                    add_minimum_virtualauthor(author_id, author_name[0],
                                              docs['bibrecid'], 0, [], refrec)
                elif author_name:
                    add_minimum_virtualauthor(author_id, author_name[0],
                                              docs['bibrecid'], 0, [])

    dat.reset_mem_cache(True)
    last_log = get_user_log(userinfo='daemon',
                            action='update_aid',
                            only_most_recent=True)
    updated_records = []

    if last_log:
        #select only the most recent papers
        recently_modified, last_update_time = get_papers_recently_modified(
            date=last_log[0][2])
        insert_user_log('daemon',
                        '-1',
                        'update_aid',
                        'bibsched',
                        'status',
                        comment='bibauthorid_daemon, update_authorid_universe',
                        timestamp=last_update_time[0][0])
        bibtask.write_message("Update authorid will operate on %s records." %
                              (len(recently_modified)),
                              stream=sys.stdout,
                              verbose=0)

        if not recently_modified:
            bibtask.write_message("Update authorid: Nothing to do",
                                  stream=sys.stdout,
                                  verbose=0)
            return

        for rec in recently_modified:
            updated_records.append(rec[0])
            dat.update_log("rec_updates", rec[0])

    else:
        bibtask.write_message("Update authorid: Nothing to do",
                              stream=sys.stdout,
                              verbose=0)
        return

    authors = []
    author_last_names = set()

    bibtask.task_update_progress('Reading authors from updated records')
    bibtask.write_message("Reading authors from updated records",
                          stream=sys.stdout,
                          verbose=0)
    updated_ras = set()

    # get all authors from all updated records
    for rec in updated_records:
        rec_authors = get_field_values_on_condition(rec, ['100', '700'],
                                                    "a",
                                                    source="API")

        for rec_author in rec_authors:
            if not rec_author:
                bconfig.LOGGER.error("Invalid empty author string, which "
                                     "will be skipped on record %s" % (rec))
                continue

            author_in_list = [
                row for row in authors if row['db_name'] == rec_author
            ]

            if author_in_list:
                for upd in [
                        row for row in authors if row['db_name'] == rec_author
                ]:
                    upd['records'].append(rec)
            else:
                last_name = split_name_parts(rec_author)[0]
                author_last_names.add(last_name)
                authors.append({
                    'db_name': rec_author,
                    'records': [rec],
                    'last_name': last_name
                })

    for status, author_last_name in enumerate(author_last_names):
        current_authors = [
            row for row in authors if row['last_name'] == author_last_name
        ]
        total_lnames = len(author_last_names)
        total_authors = len(current_authors)
        bibtask.task_update_progress(
            'Processing %s of %s cluster: "%s" '
            '(%s authors)' %
            (status + 1, total_lnames, author_last_name, total_authors))
        bibtask.write_message(
            'Processing %s of %s cluster: "%s" '
            '(%s authors)' %
            (status + 1, total_lnames, author_last_name, total_authors),
            stream=sys.stdout,
            verbose=0)
        dat.reset_mem_cache(True)
        init_authornames(author_last_name)
        load_mem_cache_from_tables()
        bconfig.LOGGER.log(
            25, "-- Relevant data successfully read into memory"
            " to start processing")

        for current_author in current_authors:
            load_records_to_mem_cache(current_author['records'])
            authornamesid = [
                row['id'] for row in dat.AUTHOR_NAMES
                if row['db_name'] == current_author['db_name']
            ]

            if not authornamesid:
                bconfig.LOGGER.error(
                    "The author '%s' rec '%s' is not in authornames "
                    "and will be skipped. You might want "
                    "to run authornames update before?" %
                    (current_author['db_name'], rec))
                continue
            else:
                try:
                    authornamesid = int(authornamesid[0])
                except (IndexError, TypeError, ValueError):
                    bconfig.LOGGER.error("Invalid authornames ID!")
                    continue

            if not current_author['records']:
                bconfig.LOGGER.error("The author '%s' is not associated to any"
                                     " document and will be skipped." %
                                     (current_author['db_name']))
                continue

            for rec in current_author['records']:
                # remove VAs already existing for the record
                va_ids = get_va_ids_by_recid_lname(rec,
                                                   current_author["last_name"])

                if va_ids:
                    for va_id in va_ids:
                        ra_list = get_realauthors_by_virtuala_id(va_id)

                        for ra_id in ra_list:
                            remove_va_from_ra(ra_id, va_id)
                            del_ra_data_by_vaid(ra_id, va_id)

                        va_anames_id = get_virtualauthor_records(
                            va_id, "orig_authorname_id")

                        for an_list in [
                                row['authornameids'] for row in dat.DOC_LIST
                                if row['bibrecid'] == rec
                        ]:
                            try:
                                an_list.remove(va_anames_id)
                            except (ValueError):
                                # This names id is not in the list...don't care
                                pass

                        delete_virtual_author(va_id)

                # create new VAs for the record.
                update_doclist(rec, authornamesid)
                dat.update_log("rec_updates", rec)

            create_vas_from_specific_doclist(current_author['records'])

        bconfig.LOGGER.log(25, "-- Relevant data pre-processed successfully.")
        start_computation(process_doclist=False,
                          process_orphans=True,
                          print_stats=True)
        bconfig.LOGGER.log(
            25, "-- Computation finished. Will write back to "
            "the database now.")
        update_db_result = update_tables_from_mem_cache(return_ra_updates=True)

        if not update_db_result[0]:
            bconfig.LOGGER.log(25, "Writing to persistence layer failed.")
        else:
            if update_db_result[1]:
                for updated_ra in update_db_result[1]:
                    if updated_ra:
                        updated_ras.add(updated_ra[0])

            bconfig.LOGGER.log(25, "Done updating authorid universe.")

    personid_ra_format = []

    for ra_id in updated_ras:
        personid_ra_format.append((ra_id, ))

    bconfig.LOGGER.log(
        25, "Will now run personid update to make the "
        "changes visible also on the front end and to "
        "create person IDs for %s newly created and changed "
        "authors." % len(updated_ras))
    bibtask.task_update_progress('Updating persistent Person IDs')
    update_personID_from_algorithm(personid_ra_format)
    bconfig.LOGGER.log(
        25, "Done updating everything. Thanks for flying "
        "with bibauthorid!")
def get_information_from_dataset(va_id, ra_id=-1):
    '''
    Collects the coauthor / collaboration information of a virtual author
    and either stores it on a real author or hands it back to the caller.

    The record ID and the original author name ID are taken from the
    virtual author's own records. Coauthors are requested from fields
    100/700, subfield a, with a "!=" condition on the author's db_name;
    the collaboration is requested from field 710, subfield g.

    @param va_id: Virtual author ID to read the information from
    @type va_id: int
    @param ra_id: Real author ID to write the information to. With the
        default of -1 nothing is written and the data is returned instead.
    @type ra_id: int

    @return: True if ra_id > -1. Otherwise the collaboration if one was
        found, else the coauthors.
    @rtype: True, or whatever get_field_values_on_condition returned
    '''
    bibrec_id = ""
    authorname_id = -1

    # If a tag occurs more than once, the last occurrence wins.
    for va_record in get_virtualauthor_records(va_id):
        record_tag = va_record['tag']

        if record_tag == "bibrec_id":
            bibrec_id = va_record['value']
        elif record_tag == "orig_authorname_id":
            authorname_id = va_record['value']

    name_strings = get_name_and_db_name_strings(authorname_id)

    bconfig.LOGGER.info("| Reading coauthors for va %s: %s recid %s"
                        % (va_id, name_strings["name"], bibrec_id))

    coauthors = get_field_values_on_condition(bibrec_id, ['100', '700'],
                                              'a', 'a',
                                              name_strings["db_name"], "!=")
    collaboration = get_field_values_on_condition(bibrec_id, "710", "g")

    if coauthors or collaboration:
        # NOTE(review): for integer IDs `not ra_id` only holds for
        # ra_id == 0 (the default -1 is truthy), so these two messages are
        # rarely emitted -- confirm whether -1 was meant here.
        if not ra_id:
            if collaboration:
                bconfig.LOGGER.info("|-> Collaboration found: %s"
                                    % (list(collaboration)[0]))
            else:
                bconfig.LOGGER.info("|-> Coauthors found: %s"
                                    % (len(coauthors)))
    else:
        bconfig.LOGGER.info("|-> No coauthors and no collaboration found "
                            "for this author on this record")

    coauthor_limit = MAX_COAUTHORS

    # Read-only mode: nothing is attached, the raw data goes back.
    if not ra_id > -1:
        return collaboration or coauthors

    def store_coauthor(coauthor_value):
        '''Attaches one "<author name>;;<value>" entry to the real author.'''
        set_realauthor_data(ra_id, "coauthor", "%s;;%s"
                            % (name_strings["name"], coauthor_value))

    if collaboration:
        # A collaboration takes precedence over individual coauthors.
        store_coauthor(create_unified_name(list(collaboration)[0].lower()))
    elif len(coauthors) <= coauthor_limit:
        for coauthor in coauthors:
            store_coauthor(create_unified_name(coauthor.lower()))
    else:
        # Too many coauthors to store one by one: keep a single hash.
        coauthor_hash = hash_coauthor_set(coauthors)
        bconfig.LOGGER.info("|--> Coauthor # > %s. To preserve"
                            " information, a hash will be stored: %s"
                            % (coauthor_limit, coauthor_hash))
        store_coauthor(coauthor_hash)

    return True
def get_information_from_dataset(va_id, ra_id=-1):
    '''
    Reads the coauthors or the collaboration belonging to the record of a
    virtual author.

    If a real author ID greater than -1 is given, the information is
    attached to that real author as "coauthor" data and True is returned.
    With the default ID of -1 nothing is stored and the data that was read
    is returned.

    @param va_id: Virtual author ID to get the information from
    @type va_id: int
    @param ra_id: Real author ID to set information for.
    @type ra_id: int

    @return: True, if ra_id > -1; else the collaboration if there is one,
        else the coauthors
    @rtype: True, or the value returned by get_field_values_on_condition
    '''
    va_records = get_virtualauthor_records(va_id)
    bibrec_id = ""
    authorname_id = -1

    # Pick the record ID and the name ID out of the VA's tagged records.
    for va_record in va_records:
        if va_record['tag'] == "orig_authorname_id":
            authorname_id = va_record['value']
        elif va_record['tag'] == "bibrec_id":
            bibrec_id = va_record['value']

    authorname_strings = get_name_and_db_name_strings(authorname_id)
    owner_name = authorname_strings["name"]

    bconfig.LOGGER.info("| Reading coauthors for va %s: %s recid %s" %
                        (va_id, owner_name, bibrec_id))

    # Every author name on the record except the VA's own db_name.
    coauthors = get_field_values_on_condition(
        bibrec_id, ['100', '700'], 'a', 'a',
        authorname_strings["db_name"], "!=")

    collaboration = get_field_values_on_condition(bibrec_id, "710", "g")

    if (not coauthors) and (not collaboration):
        bconfig.LOGGER.info("|-> No coauthors and no collaboration found "
                            "for this author on this record")
    elif not ra_id and collaboration:
        bconfig.LOGGER.info("|-> Collaboration found: %s" %
                            (list(collaboration)[0]))
    elif not ra_id:
        bconfig.LOGGER.info("|-> Coauthors found: %s" % (len(coauthors)))

    threshold = MAX_COAUTHORS

    if ra_id > -1:
        entry_format = "%s;;%s"

        if collaboration:
            first_collaboration = list(collaboration)[0]
            unified_name = create_unified_name(first_collaboration.lower())
            set_realauthor_data(ra_id, "coauthor",
                                entry_format % (owner_name, unified_name))
        elif len(coauthors) > threshold:
            # Above the threshold only a hash of the coauthor set is kept.
            hashvalue = hash_coauthor_set(coauthors)
            bconfig.LOGGER.info("|--> Coauthor # > %s. To preserve"
                                " information, a hash will be stored: %s" %
                                (threshold, hashvalue))
            set_realauthor_data(ra_id, "coauthor",
                                entry_format % (owner_name, hashvalue))
        else:
            for coauthor in coauthors:
                unified_name = create_unified_name(coauthor.lower())
                set_realauthor_data(ra_id, "coauthor",
                                    entry_format % (owner_name,
                                                    unified_name))

        return True

    if collaboration:
        return collaboration

    return coauthors
def get_information_from_dataset(va_id, ra_id=-1):
    '''
    Builds the affiliation entries of a virtual author from the data set.

    Every entry has the form "<date>;;<author name>;;<affiliation>". The
    affiliations are requested from fields 100/700, subfield u, for the
    author's db_name; the date is derived from field 269, subfield c and
    cut down to its first two "-"-separated parts.

    If a real author ID greater than -1 is given, the entries are attached
    to that real author. With the default of -1 they are returned instead.

    @param va_id: Virtual author ID to get the info from
    @type va_id: int
    @param ra_id: Real author ID to set information for.
    @type ra_id: int

    @return: A list of affiliation entries or simply True, if ra_id is set.
    @rtype: list of strings or True if ra_id > -1
    '''
    bibrec_id = ""
    authorname_id = -1

    for va_record in get_virtualauthor_records(va_id):
        record_tag = va_record['tag']

        if record_tag == "bibrec_id":
            bibrec_id = va_record['value']
        elif record_tag == "orig_authorname_id":
            authorname_id = va_record['value']

    name_strings = get_name_and_db_name_strings(authorname_id)
    bconfig.LOGGER.info("| Reading affiliations for va %s: %s  recid %s" %
                        (va_id, name_strings["name"], bibrec_id))
    affiliations = get_field_values_on_condition(bibrec_id, ['100', '700'],
                                                 'u', 'a',
                                                 name_strings["db_name"])
    record_date = get_field_values_on_condition(bibrec_id, '269', 'c')

    if len(record_date) > 0:
        date_parts = list(record_date)[0].split("-")
    else:
        date_parts = ['0000', '00']

    # Only the first two parts are used below. Anything that does not
    # consist of two or three parts gets '10' appended, which becomes the
    # second part when only one part is present.
    if len(date_parts) not in (2, 3):
        date_parts = date_parts + ['10']

    affiliation_date = "%s-%s" % (date_parts[0], date_parts[1])

    has_affiliation = bool(affiliations)

    if has_affiliation:
        bconfig.LOGGER.info("|-> Affiliation found: %s" % (affiliations))
    else:
        bconfig.LOGGER.info("|-> No Affiliation for this record. Set to None")
        affiliations = ["None"]

    has_date = affiliation_date != "0000-00"

    if has_date:
        bconfig.LOGGER.info("|-> Affiliation date: %s" % (affiliation_date))
    else:
        bconfig.LOGGER.info("|-> No Affiliation Date set to 0000-00")

    aff_collection = []

    # Without an affiliation and without a date there is nothing to keep.
    if has_affiliation or has_date:
        for affiliation in affiliations:
            entry_parts = (affiliation_date, name_strings["name"],
                           affiliation)
            bconfig.LOGGER.info("|--> Found Affiliation: %s;;%s;;%s" %
                                entry_parts)
            aff_collection.append("%s;;%s;;%s" % entry_parts)

    if not ra_id > -1:
        return aff_collection

    for aff_entry in aff_collection:
        set_realauthor_data(ra_id, "affiliation", aff_entry)

    return True
def _update_authorid_universe():
    '''
    Updates all data related to the authorid algorithm.

    Sequence of operations:
        - Get all recently updated papers and remember time in the log
        - Get all authors on all papers
        - Extract collection of last names
        - For each last name:
            - Populate mem cache with cluster data
            - Delete updated records and their virtual authors from mem cache
            - Create virtual authors for new and updated records
            - Start matching algorithm
            - Update tables with results of the computation
        - Start personid update procedure

    The function returns early (with None) after writing "Nothing to do"
    when there is no previous 'update_aid' log entry of the daemon or when
    no paper has been modified since that entry. In the latter case the new
    log entry has already been inserted.
    '''

    def create_vas_from_specific_doclist(bibrec_ids):
        '''
        Processes the document list and creates a new minimal virtual author
        for each author in each record specified in the given list.

        @param bibrec_ids: Record IDs to concern in this update
        @type bibrec_ids: list of int
        '''
        # Filter the document list once and use it for count and loop.
        selected_docs = [row for row in dat.DOC_LIST
                         if row['bibrecid'] in bibrec_ids]

        bconfig.LOGGER.log(25, "Creating minimal virtual authors for "
                                "all loaded docs (%s)"
                                % (len(selected_docs)))

        for docs in selected_docs:
            for author_id in docs['authornameids']:
                author_name = [an['name'] for an in dat.AUTHOR_NAMES
                               if an['id'] == author_id]
                refrecs = [ref[1] for ref in docs['authornameid_bibrefrec']
                           if ref[0] == author_id]

                if len(refrecs) > 1:
                    # Parenthesised form: prints the same text under
                    # Python 2 and also parses under Python 3.
                    print("SCREEEEEEWWWWWWED!!! Several bibrefs on one paper?!")

                # The first bibrefrec wins; -1 marks "none found".
                # NOTE(review): -1 is truthy, so the branch without refrec
                # below is only taken when a stored refrec itself is falsy
                # -- confirm this is intended.
                if refrecs:
                    refrec = refrecs[0]
                else:
                    refrec = -1

                if refrec and author_name:
                    add_minimum_virtualauthor(author_id, author_name[0],
                                              docs['bibrecid'], 0, [], refrec)
                elif author_name:
                    add_minimum_virtualauthor(author_id, author_name[0],
                                              docs['bibrecid'], 0, [])

    dat.reset_mem_cache(True)
    last_log = get_user_log(userinfo='daemon',
                            action='update_aid',
                            only_most_recent=True)

    if not last_log:
        bibtask.write_message("Update authorid: Nothing to do",
                              stream=sys.stdout, verbose=0)
        return

    # select only the most recent papers
    recently_modified, last_update_time = get_papers_recently_modified(
                                                    date=last_log[0][2])
    insert_user_log('daemon', '-1', 'update_aid', 'bibsched', 'status',
                    comment='bibauthorid_daemon, update_authorid_universe',
                    timestamp=last_update_time[0][0])
    bibtask.write_message("Update authorid will operate on %s records."
                          % (len(recently_modified)), stream=sys.stdout,
                          verbose=0)

    if not recently_modified:
        bibtask.write_message("Update authorid: Nothing to do",
                              stream=sys.stdout, verbose=0)
        return

    updated_records = []

    for rec in recently_modified:
        updated_records.append(rec[0])
        dat.update_log("rec_updates", rec[0])

    authors = []
    # Index of the entries in `authors` by db_name, so that a known author
    # is found without rescanning the whole list for every author string.
    authors_by_db_name = {}
    author_last_names = set()

    bibtask.task_update_progress('Reading authors from updated records')
    bibtask.write_message("Reading authors from updated records",
                          stream=sys.stdout, verbose=0)
    updated_ras = set()

    # get all authors from all updated records
    for rec in updated_records:
        rec_authors = get_field_values_on_condition(rec, ['100', '700'], "a",
                                                    source="API")

        for rec_author in rec_authors:
            if not rec_author:
                bconfig.LOGGER.error("Invalid empty author string, which "
                                     "will be skipped on record %s"
                                     % (rec))
                continue

            known_author = authors_by_db_name.get(rec_author)

            if known_author is not None:
                known_author['records'].append(rec)
            else:
                last_name = split_name_parts(rec_author)[0]
                author_last_names.add(last_name)
                new_author = {'db_name': rec_author,
                              'records': [rec],
                              'last_name': last_name}
                authors.append(new_author)
                authors_by_db_name[rec_author] = new_author

    total_lnames = len(author_last_names)

    for status, author_last_name in enumerate(author_last_names):
        current_authors = [row for row in authors
                           if row['last_name'] == author_last_name]
        total_authors = len(current_authors)
        progress_message = ('Processing %s of %s cluster: "%s" '
                            '(%s authors)'
                            % (status + 1, total_lnames, author_last_name,
                               total_authors))
        bibtask.task_update_progress(progress_message)
        bibtask.write_message(progress_message, stream=sys.stdout, verbose=0)
        dat.reset_mem_cache(True)
        init_authornames(author_last_name)
        load_mem_cache_from_tables()
        bconfig.LOGGER.log(25, "-- Relevant data successfully read into memory"
                               " to start processing")

        for current_author in current_authors:
            load_records_to_mem_cache(current_author['records'])
            authornamesid = [row['id'] for row in dat.AUTHOR_NAMES
                             if row['db_name'] == current_author['db_name']]

            if not authornamesid:
                # Report the author's own records. The loop variable `rec`
                # would still hold a value from an earlier loop here and
                # point to an unrelated record.
                bconfig.LOGGER.error("The author '%s' rec '%s' is not in authornames "
                                     "and will be skipped. You might want "
                                     "to run authornames update before?"
                                     % (current_author['db_name'],
                                        current_author['records']))
                continue

            try:
                authornamesid = int(authornamesid[0])
            except (IndexError, TypeError, ValueError):
                bconfig.LOGGER.error("Invalid authornames ID!")
                continue

            if not current_author['records']:
                bconfig.LOGGER.error("The author '%s' is not associated to any"
                                     " document and will be skipped."
                                     % (current_author['db_name']))
                continue

            for rec in current_author['records']:
                # remove VAs already existing for the record
                va_ids = get_va_ids_by_recid_lname(rec,
                                                   current_author["last_name"])

                if va_ids:
                    for va_id in va_ids:
                        ra_list = get_realauthors_by_virtuala_id(va_id)

                        for ra_id in ra_list:
                            remove_va_from_ra(ra_id, va_id)
                            del_ra_data_by_vaid(ra_id, va_id)

                        # NOTE(review): get_virtualauthor_records appears to
                        # return a list of records rather than a bare ID, so
                        # the remove() below may never match and always end
                        # up in the except branch -- verify against the
                        # helper before relying on this cleanup.
                        va_anames_id = get_virtualauthor_records(va_id,
                                                        "orig_authorname_id")

                        for an_list in [row['authornameids'] for row in
                                    dat.DOC_LIST if row['bibrecid'] == rec]:
                            try:
                                an_list.remove(va_anames_id)
                            except (ValueError):
                                # This names id is not in the list...don't care
                                pass

                        delete_virtual_author(va_id)

                # create new VAs for the record.
                update_doclist(rec, authornamesid)
                dat.update_log("rec_updates", rec)

            create_vas_from_specific_doclist(current_author['records'])

        bconfig.LOGGER.log(25, "-- Relevant data pre-processed successfully.")
        start_computation(process_doclist=False,
                          process_orphans=True,
                          print_stats=True)
        bconfig.LOGGER.log(25, "-- Computation finished. Will write back to "
                               "the database now.")
        update_db_result = update_tables_from_mem_cache(return_ra_updates=True)

        if not update_db_result[0]:
            bconfig.LOGGER.log(25, "Writing to persistence layer failed.")
        else:
            # Remember every real author touched by this cluster for the
            # personid update at the end.
            if update_db_result[1]:
                for updated_ra in update_db_result[1]:
                    if updated_ra:
                        updated_ras.add(updated_ra[0])

            bconfig.LOGGER.log(25, "Done updating authorid universe.")

    # The personid update is fed with one-element tuples.
    personid_ra_format = [(ra_id,) for ra_id in updated_ras]

    bconfig.LOGGER.log(25, "Will now run personid update to make the "
                       "changes visible also on the front end and to "
                       "create person IDs for %s newly created and changed "
                       "authors." % len(updated_ras))
    bibtask.task_update_progress('Updating persistent Person IDs')
    update_personID_from_algorithm(personid_ra_format)
    bconfig.LOGGER.log(25, "Done updating everything. Thanks for flying "
                       "with bibauthorid!")
def get_information_from_dataset(va_id, ra_id=-1):
    '''
    Retrieves the affiliation information of a virtual author from the
    data set.

    The field values are read with source "API" when bconfig.STANDALONE or
    the runtime option "populate_aid_from_personid" is set, and with source
    "MEM" otherwise. Each resulting entry has the form
    "<date>;;<author name>;;<affiliation>".

    In dependency of the real author ID, the entries will be written to the
    real author holding this ID. If the real author ID should be the default
    '-1', a list with all the entries will be returned.

    @param va_id: Virtual author ID to get the info from
    @type va_id: int
    @param ra_id: Real author ID to set information for.
    @type ra_id: int

    @return: A list of affiliations or simply True, if ra_id is set.
    @rtype: list of strings or True if ra_id > -1
    '''
    if bconfig.STANDALONE or dat.RUNTIME_CONFIG["populate_aid_from_personid"]:
        src = "API"
    else:
        src = "MEM"

    va_records = get_virtualauthor_records(va_id)
    authorname_id = -1
    bibrec_id = ""

    for va_record in va_records:
        if va_record['tag'] == "orig_authorname_id":
            authorname_id = va_record['value']
        elif va_record['tag'] == "bibrec_id":
            bibrec_id = va_record['value']

    authorname_strings = get_name_and_db_name_strings(authorname_id)
    author_name = authorname_strings["name"]
    bconfig.LOGGER.info("| Reading affiliations for va %s: %s  recid %s"
                        % (va_id, author_name, bibrec_id))
    affiliations = get_field_values_on_condition(
        bibrec_id, ['100', '700'], 'u', 'a',
        authorname_strings["db_name"], source=src)
    record_date = get_field_values_on_condition(bibrec_id, '269', 'c',
                                                source=src)

    # Placeholder parts are used when the record carries no date.
    date_parts = ['0000', '00']

    if len(record_date) > 0:
        date_parts = list(record_date)[0].split("-")

    part_count = len(date_parts)

    # Two or three parts are used as they are (only the first two matter);
    # everything else gets '10' appended.
    if part_count < 2 or part_count > 3:
        date_parts = date_parts + ['10']

    affiliation_date = "%s-%s" % (date_parts[0], date_parts[1])

    is_aff = True
    is_aff_date = True

    if not affiliations:
        bconfig.LOGGER.info("|-> No Affiliation for this record. Set to None")
        affiliations = ["None"]
        is_aff = False
    else:
        bconfig.LOGGER.info("|-> Affiliation found: %s" % (affiliations))

    if affiliation_date == "0000-00":
        bconfig.LOGGER.info("|-> No Affiliation Date set to 0000-00")
        is_aff_date = False
    else:
        bconfig.LOGGER.info("|-> Affiliation date: %s" % (affiliation_date))

    aff_collection = []

    # Entries are only built if an affiliation or a real date is known.
    if not (is_aff or is_aff_date):
        affiliations = []

    for affiliation in affiliations:
        entry_values = (affiliation_date, author_name, affiliation)
        bconfig.LOGGER.info("|--> Found Affiliation: %s;;%s;;%s"
                            % entry_values)
        aff_collection.append("%s;;%s;;%s" % entry_values)

    if ra_id > -1:
        for aff_entry in aff_collection:
            set_realauthor_data(ra_id, "affiliation", aff_entry)

        return True

    return aff_collection