def delayed_cluster_sets_from_marktables(limit_to_surnames=False):
    '''
    Group the references returned by get_bib10x() and get_bib70x() by
    last-name cluster and prepare one delayed cluster set per last name.

    @param limit_to_surnames: iterable of surnames to keep. Every entry is
        normalised with generate_last_name_cluster_str before comparison.
        Any falsy value (the default) disables the filter.

    @return: tuple of three parallel lists, ordered by ascending signature
        count: the objects returned by delayed_create_from_mark, the
        last-name cluster strings, and the signature counts.
    '''
    bibauthor_print('Delayed_cluster_set_from_marktables limited to %s'
                    % str(limit_to_surnames))

    if limit_to_surnames:
        limit_to_surnames = set(generate_last_name_cluster_str(s)
                                for s in limit_to_surnames)

    # Tag every row with the table it came from. Building the triples row by
    # row (instead of transposing the table and zipping it against an endless
    # cycle) keeps working when a table is empty: the transposed form then
    # degenerates into an infinite stream of 1-tuples and the unpacking below
    # fails.
    # NOTE(review): assumes each row starts with (bibref, name) - confirm
    # against get_bib10x / get_bib70x.
    tagged_rows = chain(((100, row[0], row[1]) for row in get_bib10x()),
                        ((700, row[0], row[1]) for row in get_bib70x()))

    # { name -> [(table, bibref)] }
    name_buket = {}
    for tab, ref, name in tagged_rows:
        name = generate_last_name_cluster_str(name)
        if limit_to_surnames and name not in limit_to_surnames:
            continue
        # Append in place; rebuilding the list on every row is quadratic.
        name_buket.setdefault(name, []).append((tab, ref))

    bibauthor_print(
        'Delayed_cluster_set_from_marktables going to get %s signatures....'
        % str(len(name_buket)))

    all_refs = sorted(
        ((name, refs, len(list(get_signatures_from_bibrefs(refs))))
         for name, refs in name_buket.items()),
        key=itemgetter(2))

    # Real lists (not map objects) so that callers can use .index() on them.
    return ([delayed_create_from_mark(set(refs), name)
             for name, refs, _ in all_refs],
            [entry[0] for entry in all_refs],
            [entry[2] for entry in all_refs])
def tortoise_last_name(name, wedge_threshold=None, from_mark=True, pure=False):
    '''
    Build the cluster set of a single last name, create its matrix and
    wedge and store the result.

    @param name: name whose last-name cluster is processed.
    @param wedge_threshold: accepted but not used by this function.
    @param from_mark: when true, the cluster sets come from
        delayed_cluster_sets_from_marktables (limited to this last name),
        otherwise from delayed_cluster_sets_from_personid.
    @param pure: forwarded to delayed_cluster_sets_from_personid; must not
        be combined with from_mark.

    If the last name is not among the returned clusters a message is logged
    and nothing else happens.
    '''
    logger.log('Start working on %s' % name)
    assert not (from_mark and pure)

    lname = generate_last_name_cluster_str(name)

    if from_mark:
        logger.log(' ... from mark!')
        clusters, lnames, sizes = delayed_cluster_sets_from_marktables([lname])
        logger.log(' ... delayed done')
    else:
        logger.log(' ... from pid, pure=%s' % str(pure))
        clusters, lnames, sizes = delayed_cluster_sets_from_personid(pure)
        logger.log(' ... delayed pure done!')

    # Only the lookup is guarded: an IndexError/ValueError raised while
    # building the matrix or wedging is a real failure and must not be
    # reported as "not found".
    try:
        idx = lnames.index(lname)
        cluster = clusters[idx]
        size = sizes[idx]
    except (IndexError, ValueError):
        logger.log("Sorry, %s not found in the last name clusters" % (lname))
        return

    cluster_set = cluster()
    logger.log("Found, %s(%s). Total number of bibs: %d." % (name, lname, size))
    create_matrix(cluster_set, False)
    wedge_and_store(cluster_set)
def tortoise_tweak_coefficient(lastnames, min_coef, max_coef, stepping, build_matrix=True):
    '''
    Schedule statistics collection for every combination of last name and
    coefficient.

    @param lastnames: iterable of names; each one is normalised with
        generate_last_name_cluster_str and duplicates are dropped.
    @param min_coef: first coefficient of the grid (inclusive).
    @param max_coef: end of the grid (exclusive).
    @param stepping: distance between two consecutive coefficients.
    @param build_matrix: when true, _create_matrix is scheduled for every
        last name before the statistics are collected.

    The grid has a resolution of 0.01. range() raises ValueError when
    stepping rounds to zero hundredths.
    '''
    bibauthor_print('Coefficient tweaking!')
    bibauthor_print('Cluster sets from mark...')

    lnames = set(generate_last_name_cluster_str(n) for n in lastnames)

    # Round before converting: plain int() truncates, and float products
    # often fall just below the intended value (int(0.29 * 100) == 28).
    first, last, step = [int(round(value * 100))
                         for value in (min_coef, max_coef, stepping)]
    coefficients = [x / 100. for x in range(first, last, step)]

    if build_matrix:
        schedule_workers(_create_matrix, lnames)

    schedule_workers(_collect_statistics_lname_coeff,
                     ((x, y) for x in lnames for y in coefficients))
def delayed_cluster_sets_from_marktables(limit_to_surnames=False):
    '''
    Bucket the references of get_bib10x() and get_bib70x() by last-name
    cluster and wrap every bucket with delayed_create_from_mark.

    @param limit_to_surnames: optional iterable of surnames; when it is
        truthy only clusters whose normalised name
        (generate_last_name_cluster_str) is in it are kept.

    @return: (delayed cluster sets, last-name cluster strings, signature
        counts) as three lists in the same order, sorted by ascending
        signature count.
    '''
    bibauthor_print('Delayed_cluster_set_from_marktables limited to %s' % str(limit_to_surnames))

    if limit_to_surnames:
        limit_to_surnames = set(generate_last_name_cluster_str(s) for s in limit_to_surnames)

    # One (table, bibref, name) triple per row. The previous
    # izip(cycle(...), *izip(*rows)) form yields endless 1-tuples when a
    # table has no rows, which breaks the unpacking in the loop below.
    # NOTE(review): assumes rows begin with (bibref, name) - confirm.
    sources = chain(((100, row[0], row[1]) for row in get_bib10x()),
                    ((700, row[0], row[1]) for row in get_bib70x()))

    # { name -> [(table, bibref)] }
    name_buket = {}
    for tab, ref, name in sources:
        name = generate_last_name_cluster_str(name)
        if limit_to_surnames and name not in limit_to_surnames:
            continue
        # In-place append instead of copying the whole bucket for each row.
        name_buket.setdefault(name, []).append((tab, ref))

    bibauthor_print('Delayed_cluster_set_from_marktables going to get %s signatures....' % str(len(name_buket)))

    all_refs = ((name, refs, len(list(get_signatures_from_bibrefs(refs))))
                for name, refs in name_buket.items())
    all_refs = sorted(all_refs, key=itemgetter(2))

    # Lists rather than map(): callers look names up with .index().
    cluster_sets = [delayed_create_from_mark(set(refs), name) for name, refs, _ in all_refs]
    names = [item[0] for item in all_refs]
    sizes = [item[2] for item in all_refs]
    return (cluster_sets, names, sizes)
def delayed_cluster_sets_from_marktables():
    '''
    Bucket every reference of get_bib10x() and get_bib70x() by last-name
    cluster and wrap each bucket with delayed_create_from_mark.

    @return: (delayed cluster sets, last-name cluster strings, signature
        counts) as three lists in the same order, sorted by ascending
        signature count.
    '''
    # Tag each row with its table number. Generating the triples per row
    # also works for an empty table, where the transposing
    # izip(cycle(...), *izip(*rows)) form produces endless 1-tuples and the
    # unpacking below fails.
    # NOTE(review): assumes rows begin with (bibref, name) - confirm.
    tagged_rows = chain(((100, row[0], row[1]) for row in get_bib10x()),
                        ((700, row[0], row[1]) for row in get_bib70x()))

    # { name -> [(table, bibref)] }
    name_buket = {}
    for tab, ref, name in tagged_rows:
        name = generate_last_name_cluster_str(name)
        # Append in place; rebuilding the list on every row is quadratic.
        name_buket.setdefault(name, []).append((tab, ref))

    all_refs = ((name, refs, len(list(get_signatures_from_bibrefs(refs))))
                for name, refs in name_buket.items())
    all_refs = sorted(all_refs, key=itemgetter(2))

    # Lists rather than map(): callers look names up with .index().
    return ([delayed_create_from_mark(refs, name) for name, refs, _ in all_refs],
            [item[0] for item in all_refs],
            [item[2] for item in all_refs])
def create_lastname_list_from_personid(last_modification):
    '''
    Group the persons returned by get_all_modified_names_from_personid by
    last-name cluster.

    @param last_modification: forwarded unchanged to
        get_all_modified_names_from_personid.

    @return: generator of (last_name, [personid, ...], Nbibs) tuples, one
        per last-name cluster in ascending last-name order, where Nbibs is
        the sum of the third field of every row in the cluster. The last
        name of a person is derived from the first entry of its name
        collection; rows with no names are skipped.
    '''
    # ((personid, [full Name1], Nbibs) ... )
    modified = get_all_modified_names_from_personid(last_modification)

    # ((personid, last_name, Nbibs) ... )
    persons = []
    for row in modified:
        # A StopIteration escaping from a generator expression would end it
        # silently (or raise RuntimeError under PEP 479), so the empty case
        # is handled explicitly here.
        try:
            full_name = next(iter(row[1]))
        except StopIteration:
            continue
        persons.append((row[0], generate_last_name_cluster_str(full_name), row[2]))

    # groupby only merges adjacent items, hence the sort on the same key.
    persons.sort(key=itemgetter(1))
    clusters = ((last_name, list(members))
                for last_name, members in groupby(persons, key=itemgetter(1)))

    # ((last_name, [personid ...], Nbibs) ... )
    return ((last_name,
             [member[0] for member in members],
             sum(member[2] for member in members))
            for last_name, members in clusters)
def tortoise_last_name(name, from_mark=False, pure=False):
    '''
    Build the cluster set of a single last name, create its matrix and
    wedge and store the result.

    @param name: name whose last-name cluster is processed.
    @param from_mark: when true, the cluster sets come from
        delayed_cluster_sets_from_marktables, otherwise from
        delayed_cluster_sets_from_personid.
    @param pure: forwarded to delayed_cluster_sets_from_personid; must not
        be combined with from_mark.

    If the last name is not among the returned clusters a message is
    printed and nothing else happens.
    '''
    assert not (from_mark and pure)

    lname = generate_last_name_cluster_str(name)

    if from_mark:
        clusters, lnames, sizes = delayed_cluster_sets_from_marktables()
    else:
        clusters, lnames, sizes = delayed_cluster_sets_from_personid(pure)

    # list.index() signals a miss with ValueError, not IndexError, so both
    # are handled. Only the lookup is guarded, so that errors raised by the
    # matrix creation or the wedge are not reported as "not found".
    try:
        idx = lnames.index(lname)
        cluster = clusters[idx]
        size = sizes[idx]
    except (IndexError, ValueError):
        bibauthor_print("Sorry, %s(%s) not found in the last name clusters" % (name, lname))
        return

    bibauthor_print("Found, %s(%s). Total number of bibs: %d." % (name, lname, size))
    cluster_set = cluster()
    create_matrix(cluster_set, True)
    wedge_and_store(cluster_set)
def tortoise_last_name(name, from_mark=False, pure=False):
    '''
    Process the last-name cluster of one name: fetch the delayed cluster
    sets, instantiate the one matching the name, create its matrix and
    wedge and store it.

    @param name: name whose last-name cluster is processed.
    @param from_mark: take the cluster sets from the mark tables (limited
        to this last name) instead of from personid.
    @param pure: forwarded to delayed_cluster_sets_from_personid; must not
        be combined with from_mark.

    A last name missing from the returned names makes list.index raise
    ValueError, which is not caught here.
    '''
    bibauthor_print('Start working on %s' % name)
    assert not(from_mark and pure)

    last_name = generate_last_name_cluster_str(name)

    if not from_mark:
        bibauthor_print(' ... from pid, pure')
        factories, known_names, bib_counts = delayed_cluster_sets_from_personid(pure)
        bibauthor_print(' ... delayed pure done!')
    else:
        bibauthor_print(' ... from mark!')
        factories, known_names, bib_counts = delayed_cluster_sets_from_marktables([last_name])
        bibauthor_print(' ... delayed done')

    # The three sequences are parallel; one position addresses all of them.
    position = known_names.index(last_name)
    make_cluster_set = factories[position]
    bib_count = bib_counts[position]

    cluster_set = make_cluster_set()
    bibauthor_print("Found, %s(%s). Total number of bibs: %d." % (name, last_name, bib_count))

    create_matrix(cluster_set, True)
    wedge_and_store(cluster_set)
def tortoise_last_name(name, from_mark=True, pure=False):
    '''
    Process the last-name cluster of one name: fetch the delayed cluster
    sets, instantiate the one matching the name, create its matrix and
    wedge and store it.

    @param name: name whose last-name cluster is processed.
    @param from_mark: take the cluster sets from the mark tables (limited
        to this last name) instead of from personid. Enabled by default.
    @param pure: forwarded to delayed_cluster_sets_from_personid; must not
        be combined with from_mark.

    A last name missing from the returned names makes list.index raise
    ValueError, which is not caught here.
    '''
    bibauthor_print('Start working on %s' % name)
    assert not(from_mark and pure)

    surname_key = generate_last_name_cluster_str(name)

    if not from_mark:
        bibauthor_print(' ... from pid, pure=%s'%str(pure))
        delayed_sets, surname_keys, totals = delayed_cluster_sets_from_personid(pure)
        bibauthor_print(' ... delayed pure done!')
    else:
        bibauthor_print(' ... from mark!')
        delayed_sets, surname_keys, totals = delayed_cluster_sets_from_marktables([surname_key])
        bibauthor_print(' ... delayed done')

    # The three sequences are parallel; one position addresses all of them.
    where = surname_keys.index(surname_key)
    delayed_set = delayed_sets[where]
    total = totals[where]

    cluster_set = delayed_set()
    bibauthor_print("Found, %s(%s). Total number of bibs: %d." % (name, surname_key, total))

    create_matrix(cluster_set, False)
    wedge_and_store(cluster_set)
def test_generate_last_name_cluster(self):
    '''
    Check that generate_last_name_cluster_str turns
    'Surnameone Surnametwo, Name' into 'surnameonesurnametwo'.
    '''
    str_to_check = 'Surnameone Surnametwo, Name'
    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed from unittest in Python 3.12.
    self.assertEqual(generate_last_name_cluster_str(str_to_check),
                     'surnameonesurnametwo')