def assign_signature_block(recid, json, *args, **kwargs):
    """Assign a phonetic block to each signature of a record.

    Extends every author dict that carries a ``full_name`` with a
    ``signature_block`` key holding the phonetic (NYSIIS) notation of the
    author's full name. The phonetic block is assigned before the signature
    is indexed by an Elasticsearch instance.

    :param recid: record identifier (unused here; kept for the receiver API).
    :param json: record metadata dict; mutated in place.
    """
    for author in json.get('authors', []):
        if 'full_name' not in author:
            continue
        name = {'author_name': author['full_name']}
        try:
            # NOTE: ``dtype=object`` replaces the ``np.object`` alias,
            # which was removed in NumPy 1.24; the two are equivalent.
            signature_block = block_phonetic(
                np.array([name], dtype=object).reshape(-1, 1),
                threshold=0,
                phonetic_algorithm='nysiis'
            )
        except IndexError as err:
            # Most likely a malformed author name, report & continue.
            # Imported locally so the module does not require an app
            # context at import time.
            from flask import current_app
            current_app.logger.exception(err)
            continue
        author['signature_block'] = signature_block[0]
def phonetic_blocks(full_names, phonetic_algorithm='nysiis'):
    """Create a dictionary of phonetic blocks for a given list of names.

    :param full_names: list of author full-name strings.
    :param phonetic_algorithm: phonetic algorithm forwarded to
        ``block_phonetic`` (default ``'nysiis'``).
    :return: dict mapping each full name to its phonetic block.
    """
    # ``block_phonetic`` requires a list of dicts keyed by ``author_name``.
    full_names_formatted = [{"author_name": i} for i in full_names]
    # NOTE: ``dtype=object`` replaces the ``np.object`` alias, which was
    # removed in NumPy 1.24; the two are equivalent.
    blocks = list(
        block_phonetic(
            np.array(full_names_formatted, dtype=object).reshape(-1, 1),
            threshold=0,
            phonetic_algorithm=phonetic_algorithm,
        )
    )
    return dict(zip(full_names, blocks))
def phonetic_blocks(full_names, phonetic_algorithm='nysiis'):
    """Create a dictionary of phonetic blocks for a given list of names.

    :param full_names: list of author full-name strings.
    :param phonetic_algorithm: phonetic algorithm forwarded to
        ``block_phonetic`` (default ``'nysiis'``).
    :return: dict mapping each full name to its phonetic block.
    """
    # ``block_phonetic`` requires a list of dicts keyed by ``author_name``.
    full_names_formatted = [{"author_name": i} for i in full_names]
    # NOTE: ``dtype=object`` replaces the ``np.object`` alias, which was
    # removed in NumPy 1.24; the two are equivalent.
    phonetic_blocks = list(
        block_phonetic(
            np.array(full_names_formatted, dtype=object).reshape(-1, 1),
            threshold=0,
            phonetic_algorithm=phonetic_algorithm
        )
    )
    return dict(zip(full_names, phonetic_blocks))
def create_signature_block(author_name):
    """Create signature block for given author_name.

    :param str author_name: author's full name
        Example: author_name = "Ellis, John R"

    :return: string representing phonetic block for full_name,
        or None if the block could not be computed.
        Example: u'ELj'
    """
    # Built outside the try: a dict literal cannot raise the caught
    # exceptions, and this keeps ``name`` always bound in the handler.
    name = {'author_name': author_name}
    try:
        # NOTE: ``dtype=object`` replaces the ``np.object`` alias, which
        # was removed in NumPy 1.24; the two are equivalent.
        signature_block = block_phonetic(
            np.array([name], dtype=object).reshape(-1, 1),
            threshold=0,
            phonetic_algorithm='nysiis')
        return signature_block[0]
    except (IndexError, KeyError) as err:
        # Parenthesised print works identically as a Python 2 statement
        # and a Python 3 function call.
        print("Couldn't create signature: {0} in '{1}'".format(err, name))
def pair_sampling(blocking_function, blocking_threshold,
                  blocking_phonetic_alg, clusters_filename, train_filename,
                  balanced=1, verbose=1, sample_size=1000000, use_blocking=1):
    """Sampling pairs from the ground-truth data.

    This function builds a pair dataset from claimed signatures. It gives
    the ability to specify the blocking function and whether the sampling
    would be balanced or not.

    Parameters
    ----------
    :param blocking_function: string
        must be a defined blocking function. Defined functions are:
        - "block_last_name_first_initial"
        - "block_phonetic"
    :param blocking_threshold: int or None
        It determines the maximum allowed size of blocking on the last name
        It can only be:
        - None; if the blocking function is block_last_name_first_initial
        - int; if the blocking function is block_phonetic
        please check the documentation of phonetic blocking in
        beard.clustering.blocking_funcs.py
    :param blocking_phonetic_alg: string or None
        If not None, determines which phonetic algorithm is used. Options:
        - "double_metaphone"
        - "nysiis" (only for Python 2)
        - "soundex" (only for Python 2)
    :param clusters_filename: string
        Path to the input clusters (ground-truth) file
    :param train_filename: string
        Path to train set file
    :param balanced: boolean
        determines if the sampling would be balanced. The balance is
        defined as the same number of pairs with the same name on signature
        and pairs with different names. The balance is preserved both in
        the pairs belonging to one authors and in the pairs belonging to
        different authors. Note that if there are not enough pairs to
        satisfy the balance condition, some of the pairs will be
        replicated.
    :param verbose: boolean
        determines if some processing statistics would be shown
    :param sample_size: integer
        The desired sample size
    :param use_blocking: boolean
        determines if the signatures should be blocked before sampling

    Returns
    -------
    :returns: list
        list of signature pairs

    :raises ValueError: if ``blocking_function`` is not a known strategy.
    """
    # Load ground-truth. ``with`` guarantees the handles are closed even
    # on error (``json.load(open(...))`` leaked them).
    with open(clusters_filename, "r") as fd:
        true_clusters = json.load(fd)
    clusters_reversed = {v: k for k, va in true_clusters.items()
                         for v in va}
    with open(train_filename, "r") as fd:
        train_signatures = json.load(fd)

    if not use_blocking:
        return _noblocking_sampling(sample_size, train_signatures,
                                    clusters_reversed)

    # One signature per row, as the blocking functions expect.
    train_signatures_ids = np.array([[item] for item in train_signatures])

    if blocking_function == "block_last_name_first_initial":
        blocking = block_last_name_first_initial(train_signatures_ids)
    elif blocking_function == "block_phonetic" and blocking_threshold:
        blocking = block_phonetic(train_signatures_ids, blocking_threshold,
                                  blocking_phonetic_alg)
    else:
        raise ValueError("No such blocking strategy.")

    # ``//`` keeps integer semantics on both Python 2 and 3 (on Python 3
    # plain ``/`` would feed a float to random.sample and fail).
    category_size = sample_size // 4

    # Group signature indices by their blocking key.
    blocking_dict = {}
    for index, b in enumerate(blocking):
        blocking_dict.setdefault(b, []).append(index)

    # 'd' stands for different, 's' stands for same, 'a' stands for author
    # 'n' stands for name
    dasn = []
    sasn = []
    sadn = []
    dadn = []

    # Enumerate every unordered pair inside each block and label it:
    # 0 = same author, 1 = different authors.
    for sig_s in blocking_dict.values():
        for i, s1 in enumerate(sig_s):
            for s2 in sig_s[i + 1:]:
                s1_id = train_signatures[s1]['signature_id']
                s2_id = train_signatures[s2]['signature_id']
                s1_name = train_signatures[s1]['author_name']
                s2_name = train_signatures[s2]['author_name']
                s1_cluster = clusters_reversed[s1_id]
                s2_cluster = clusters_reversed[s2_id]
                if s1_cluster == s2_cluster:
                    # Same author
                    if s1_name == s2_name:
                        sasn.append((s1_id, s2_id, 0))
                    else:
                        sadn.append((s1_id, s2_id, 0))
                else:
                    # Different authors
                    if s1_name == s2_name:
                        dasn.append((s1_id, s2_id, 1))
                    else:
                        dadn.append((s1_id, s2_id, 1))

    if balanced:
        if verbose:
            print("len of dasn:", len(dasn))
            print("len of sadn:", len(sadn))
            print("len of sasn:", len(sasn))
            print("len of dadn:", len(dadn))
        # Replicate each category until it holds at least category_size
        # pairs, then draw a uniform sample from it. The explicit loop
        # replaces a builtin ``reduce`` (removed in Python 3) with
        # identical accumulation order.
        pairs = []
        for category in (dasn, sasn, sadn, dadn):
            oversampled = int(math.ceil(
                category_size / float(len(category)))) * category
            pairs += random.sample(oversampled, category_size)
    else:
        positive = sasn + sadn
        negative = dasn + dadn
        pairs = random.sample(positive, sample_size // 2) + \
            random.sample(negative, sample_size // 2)

    return pairs