Esempio n. 1
0
def assign_signature_block(recid, json, *args, **kwargs):
    """Assign a phonetic block to each author signature of a record.

    Every entry of ``json['authors']`` that carries a ``full_name`` gets a
    ``signature_block`` key holding the phonetic notation of that name, as
    computed by ``block_phonetic`` with the nysiis algorithm. Entries
    without ``full_name`` are left untouched. The record is updated in
    place; nothing is returned.

    According to the original note, the phonetic block is assigned before
    the signature is indexed by an elasticsearch instance.

    :param recid: record identifier; not used by this function.
    :param dict json: record whose ``authors`` list (if any) is updated
        in place.
    :param args: extra positional arguments; accepted and ignored.
    :param kwargs: extra keyword arguments; accepted and ignored.
    """
    for author in json.get('authors', []):
        if 'full_name' not in author:
            continue

        name = {'author_name': author['full_name']}

        try:
            # Use the builtin ``object`` as dtype: the ``np.object`` alias
            # was deprecated in NumPy 1.20 and removed in NumPy 1.24.
            signature_block = block_phonetic(
                np.array([name], dtype=object).reshape(-1, 1),
                threshold=0,
                phonetic_algorithm='nysiis'
            )
        except IndexError as err:
            # Most likely a malformed author name, report & continue
            from flask import current_app
            current_app.logger.exception(err)
            continue

        # A single signature was passed in, so its block is the first item.
        author['signature_block'] = signature_block[0]
Esempio n. 2
0
def phonetic_blocks(full_names, phonetic_algorithm='nysiis'):
    """Create a dictionary of phonetic blocks for a given list of names.

    :param full_names: iterable of author full names.
    :param str phonetic_algorithm: name of the phonetic algorithm handed
        over to ``block_phonetic`` (defaults to ``'nysiis'``).
    :return: dictionary mapping each given name to the block computed for
        it. Names occurring more than once end up as a single key.
    """
    # Materialise the names once so that a one-shot iterator can be used
    # both for building the signatures and for the final zip.
    full_names = list(full_names)

    # block_phonetic is fed one dictionary per name, with the name stored
    # under the "author_name" key.
    signatures = [{"author_name": full_name} for full_name in full_names]

    # Builtin ``object`` replaces the ``np.object`` alias, which was removed
    # in NumPy 1.24. The array is reshaped into one column, one row per name.
    blocks = block_phonetic(
        np.array(signatures, dtype=object).reshape(-1, 1),
        threshold=0,
        phonetic_algorithm=phonetic_algorithm
    )

    return dict(zip(full_names, blocks))
Esempio n. 3
0
def phonetic_blocks(full_names, phonetic_algorithm='nysiis'):
    """Create a dictionary of phonetic blocks for a given list of names.

    :param full_names: iterable of author full names.
    :param str phonetic_algorithm: name of the phonetic algorithm passed to
        ``block_phonetic`` (defaults to ``'nysiis'``).
    :return: dictionary whose keys are the given names and whose values are
        the corresponding phonetic blocks. Duplicate names collapse into
        one key.
    """
    # Keep a concrete list: the names are traversed twice (once to build
    # the signatures, once to pair them with the resulting blocks).
    full_names = list(full_names)

    # The blocking function expects dictionaries with "author_name" as key.
    full_names_formatted = [
        {"author_name": full_name} for full_name in full_names]

    # ``dtype=object`` instead of ``dtype=np.object``: the alias no longer
    # exists since NumPy 1.24. One row per name, single column.
    signatures = np.array(full_names_formatted, dtype=object).reshape(-1, 1)

    # Compute one phonetic block per signature row.
    computed_blocks = list(
        block_phonetic(
            signatures,
            threshold=0,
            phonetic_algorithm=phonetic_algorithm
        )
    )

    return dict(zip(full_names, computed_blocks))
Esempio n. 4
0
def create_signature_block(author_name):
    """Create signature block for given author_name.

    :param str author_name: author's full name
        Example:
            author_name = "Ellis, John R"

    :return: string representing phonetic block for full_name, or ``None``
        when ``block_phonetic`` fails with ``IndexError`` or ``KeyError``
        (a message is printed in that case)
        Example:
            u'ELj'
    """
    # Built outside the ``try`` so it is always defined for the message.
    name = {'author_name': author_name}
    try:
        # Builtin ``object`` replaces ``np.object`` (removed in NumPy 1.24).
        signature_block = block_phonetic(
            np.array([name], dtype=object).reshape(-1, 1),
            threshold=0,
            phonetic_algorithm='nysiis')
        return signature_block[0]
    except (IndexError, KeyError) as err:
        # Single-argument call form: prints the same text whether ``print``
        # is the Python 2 statement or the Python 3 function.
        print("Couldn't create signature: {0} in '{1}'".format(err, name))
        return None
Esempio n. 5
0
def pair_sampling(blocking_function,
                  blocking_threshold,
                  blocking_phonetic_alg,
                  clusters_filename,
                  train_filename,
                  balanced=1, verbose=1,
                  sample_size=1000000,
                  use_blocking=1):
    """Sampling pairs from the ground-truth data.

    This function builds a pair dataset from claimed signatures.
    It gives the ability to specify the
    blocking function and whether the sampling would be balanced or not.

    Parameters
    ----------
    :param blocking_function: string
        must be a defined blocking function. Defined functions are:
        - "block_last_name_first_initial"
        - "block_phonetic"

    :param blocking_threshold: int or None
        It determines the maximum allowed size of blocking on the last name
        It can only be:
        -   None; if the blocking function is block_last_name_first_initial
        -   int; if the blocking function is block_phonetic
            please check the documentation of phonetic blocking in
            beard.clustering.blocking_funcs.py
        Note that a falsy threshold (``None`` or ``0``) combined with
        "block_phonetic" is rejected with ``ValueError``.

    :param blocking_phonetic_alg: string or None
        If not None, determines which phonetic algorithm is used. Options:
        -  "double_metaphone"
        -  "nysiis" (only for Python 2)
        -  "soundex" (only for Python 2)

    :param clusters_filename: string
        Path to the input clusters (ground-truth) file. It is read as a
        JSON object mapping a cluster id to a list of signature ids.

    :param train_filename: string
        Path to train set file. It is read as a JSON list of signatures,
        each providing at least "signature_id" and "author_name".

    :param balanced: boolean
        determines if the sampling would be balanced.
        The balance is defined as the same number of pairs with the same name
        on signature and pairs with different names. The balance is preserved
        both in the pairs belonging to one authors and in the pairs belonging
        to different authors. Note that if there are not enough pairs to
        satisfy the balance condition, some of the pairs will be replicated.

    :param verbose: boolean
        determines if some processing statistics would be shown
        (only used when ``balanced`` is true)

    :param sample_size: integer
        The desired sample size

    :param use_blocking: boolean
        determines if the signatures should be blocked before sampling

    Returns
    -------
    :returns: list
        list of signature pairs. With blocking, each pair is a tuple
        ``(signature_id_1, signature_id_2, label)`` where ``label`` is 0
        when both signatures belong to the same cluster and 1 otherwise.
        Without blocking, the result of ``_noblocking_sampling`` is
        returned unchanged.

    Raises
    ------
    :raises ValueError: if the blocking strategy is unknown, or (raised by
        ``random.sample``) if an unbalanced sample is larger than the
        available positive or negative pairs.
    :raises ZeroDivisionError: if ``balanced`` is true and one of the four
        pair categories is empty, since it cannot be replicated.
    """
    # Load ground-truth. Context managers make sure the files get closed.
    with open(clusters_filename, "r") as clusters_file:
        true_clusters = json.load(clusters_file)

    # Invert the mapping: signature id -> cluster id.
    clusters_reversed = {signature_id: cluster_id
                         for cluster_id, signature_ids in true_clusters.items()
                         for signature_id in signature_ids}

    with open(train_filename, "r") as train_file:
        train_signatures = json.load(train_file)

    if not use_blocking:
        return _noblocking_sampling(sample_size, train_signatures,
                                    clusters_reversed)

    # The blocking functions are given one signature per row.
    train_signatures_ids = np.array([[item] for item in train_signatures])

    if blocking_function == "block_last_name_first_initial":
        blocking = block_last_name_first_initial(train_signatures_ids)
    elif blocking_function == "block_phonetic" and blocking_threshold:
        blocking = block_phonetic(train_signatures_ids,
                                  blocking_threshold,
                                  blocking_phonetic_alg)
    else:
        raise ValueError("No such blocking strategy.")

    # Floor division keeps the size an integer on Python 2 and Python 3,
    # as required by random.sample.
    category_size = sample_size // 4

    # Group the positions of the train signatures by the block they got.
    blocking_dict = {}
    for index, block in enumerate(blocking):
        blocking_dict.setdefault(block, []).append(index)

    # 'd' stands for different, 's' stands for same, 'a' stands for author
    # 'n' stands for name
    dasn = []
    sasn = []
    sadn = []
    dadn = []

    # Only signatures sharing a block are paired with each other.
    for sig_s in blocking_dict.values():
        for i, s1 in enumerate(sig_s):
            # Everything about the first signature is invariant for the
            # inner loop, so look it up once.
            s1_id = train_signatures[s1]['signature_id']
            s1_name = train_signatures[s1]['author_name']
            s1_cluster = clusters_reversed[s1_id]

            for s2 in sig_s[i + 1:]:
                s2_id = train_signatures[s2]['signature_id']
                s2_name = train_signatures[s2]['author_name']
                s2_cluster = clusters_reversed[s2_id]

                if s1_cluster == s2_cluster:
                    # Same author
                    if s1_name == s2_name:
                        sasn.append((s1_id, s2_id, 0))
                    else:
                        sadn.append((s1_id, s2_id, 0))
                else:
                    # Different authors
                    if s1_name == s2_name:
                        dasn.append((s1_id, s2_id, 1))
                    else:
                        dadn.append((s1_id, s2_id, 1))

    if balanced:
        if verbose:
            print("len of dasn:", len(dasn))
            print("len of sadn:", len(sadn))
            print("len of sasn:", len(sasn))
            print("len of dadn:", len(dadn))

        pairs = []
        for category in (dasn, sasn, sadn, dadn):
            # Replicate the category until it holds at least category_size
            # pairs, then draw exactly category_size of them. An empty
            # category cannot be replicated and raises ZeroDivisionError.
            repeats = int(math.ceil(category_size / float(len(category))))
            pairs += random.sample(repeats * category, category_size)
    else:
        positive = sasn + sadn
        negative = dasn + dadn
        half_size = sample_size // 2
        pairs = (random.sample(positive, half_size) +
                 random.sample(negative, half_size))

    return pairs