Ejemplo n.º 1
0
def delayed_cluster_sets_from_marktables(limit_to_surnames=False):
    """Bucket the bibrefs of the 100/700 mark tables by last-name cluster.

    Every (bibref, name) row from get_bib10x() / get_bib70x() is tagged with
    its table number (100 or 700) and filed under the string that
    generate_last_name_cluster_str() produces for the name.

    :param limit_to_surnames: falsy to keep every name; otherwise an iterable
        of surnames, in which case only names whose cluster string equals
        the cluster string of one of them are kept.
    :return: a tuple of three parallel lists, ordered by ascending number of
        items returned by get_signatures_from_bibrefs() for each bucket:
        the results of delayed_create_from_mark(), the last-name cluster
        strings, and those item counts.
    """
    # { name -> [(table, bibref)] }
    bibauthor_print('Delayed_cluster_set_from_marktables limited to %s' %
                    str(limit_to_surnames))

    allowed = None
    if limit_to_surnames:
        allowed = set(generate_last_name_cluster_str(s)
                      for s in limit_to_surnames)

    name_buket = {}
    # Iterate the rows directly instead of transposing them with izip():
    # the transposed form breaks on an empty table, because the endless
    # cycle((100, )) is then zipped with nothing and yields bare 1-tuples.
    for tab, rows in ((100, get_bib10x()), (700, get_bib70x())):
        for ref, name in rows:
            name = generate_last_name_cluster_str(name)
            if allowed and name not in allowed:
                continue
            # Append in place; rebuilding the list per row is quadratic.
            name_buket.setdefault(name, []).append((tab, ref))

    bibauthor_print(
        'Delayed_cluster_set_from_marktables going to get %s  signatures....' %
        str(len(name_buket)))

    all_refs = [(name, refs, len(list(get_signatures_from_bibrefs(refs))))
                for name, refs in name_buket.items()]
    all_refs.sort(key=itemgetter(2))

    # Real lists (not map objects) so callers can use .index() and indexing.
    return ([delayed_create_from_mark(set(entry[1]), entry[0])
             for entry in all_refs],
            [entry[0] for entry in all_refs],
            [entry[2] for entry in all_refs])
Ejemplo n.º 2
0
def tortoise_last_name(name, wedge_threshold=None, from_mark=True, pure=False):
    """Build the matrix for one last-name cluster, then wedge and store it.

    :param name: name whose last-name cluster should be processed.
    :param wedge_threshold: accepted but not used by this function.
    :param from_mark: when true, take the cluster sets from the mark tables
        (restricted to this last name); otherwise take them from personid.
    :param pure: forwarded to delayed_cluster_sets_from_personid(); must not
        be combined with from_mark.
    """
    logger.log('Start working on %s' % name)
    assert not from_mark or not pure
    lname = generate_last_name_cluster_str(name)

    if not from_mark:
        logger.log(' ... from pid, pure=%s' % str(pure))
        clusters, lnames, sizes = delayed_cluster_sets_from_personid(pure)
        logger.log(' ... delayed pure done!')
    else:
        logger.log(' ... from mark!')
        clusters, lnames, sizes = delayed_cluster_sets_from_marktables([lname])
        logger.log(' ... delayed done')

    try:
        position = lnames.index(lname)
        make_cluster_set = clusters[position]
        n_bibs = sizes[position]
        cluster_set = make_cluster_set()
        logger.log("Found, %s(%s). Total number of bibs: %d." %
                   (name, lname, n_bibs))
        create_matrix(cluster_set, False)
        wedge_and_store(cluster_set)
    except (IndexError, ValueError):
        # Any IndexError/ValueError raised in the block above is reported
        # as "not found", including ones raised by the processing steps.
        logger.log("Sorry, %s not found in the last name clusters" % (lname))
Ejemplo n.º 3
0
def tortoise_tweak_coefficient(lastnames, min_coef, max_coef, stepping, build_matrix=True):
    """Schedule statistics collection for every (last name, coefficient) pair.

    Coefficients start at min_coef (inclusive), stop before max_coef and
    advance by stepping; all three are handled at a resolution of 0.01.

    :param lastnames: iterable of names; each one is normalised with
        generate_last_name_cluster_str() and duplicates are dropped.
    :param min_coef: first coefficient of the sweep.
    :param max_coef: upper bound of the sweep (exclusive).
    :param stepping: distance between consecutive coefficients.
    :param build_matrix: when true, _create_matrix is scheduled for every
        last name before the statistics are collected.
    :raises ValueError: if stepping rounds to zero hundredths.
    """
    bibauthor_print('Coefficient tweaking!')
    bibauthor_print('Cluster sets from mark...')

    lnames = set(generate_last_name_cluster_str(n) for n in lastnames)

    # Work in integer hundredths. round() before int() matters because
    # 0.29 * 100 is 28.999999999999996, which a bare int() truncates to 28.
    start, stop, step = [int(round(value * 100))
                         for value in (min_coef, max_coef, stepping)]
    if step == 0:
        raise ValueError('stepping rounds to zero hundredths: %r' % (stepping,))
    coefficients = [x / 100. for x in range(start, stop, step)]

    if build_matrix:
        schedule_workers(_create_matrix, lnames)
    schedule_workers(_collect_statistics_lname_coeff,
                     ((x, y) for x in lnames for y in coefficients))
Ejemplo n.º 4
0
def tortoise_tweak_coefficient(lastnames, min_coef, max_coef, stepping, build_matrix=True):
    """Sweep a range of coefficients over a set of last-name clusters.

    For each normalised last name and each coefficient in
    [min_coef, max_coef) spaced by stepping, one
    _collect_statistics_lname_coeff job is handed to schedule_workers().

    :param lastnames: iterable of names, normalised through
        generate_last_name_cluster_str() and de-duplicated.
    :param min_coef: lowest coefficient (inclusive).
    :param max_coef: highest coefficient (exclusive).
    :param stepping: increment between coefficients; values are handled in
        hundredths.
    :param build_matrix: if true, _create_matrix jobs for all last names are
        scheduled first.
    :raises ValueError: if stepping rounds to zero hundredths.
    """
    bibauthor_print('Coefficient tweaking!')
    bibauthor_print('Cluster sets from mark...')

    lnames = set(generate_last_name_cluster_str(n) for n in lastnames)

    # Convert to whole hundredths with rounding rather than truncation:
    # float products such as 0.29 * 100 fall just below the intended integer.
    first = int(round(min_coef * 100))
    last = int(round(max_coef * 100))
    increment = int(round(stepping * 100))
    if increment == 0:
        raise ValueError('stepping rounds to zero hundredths: %r' % (stepping,))
    coefficients = [x / 100. for x in range(first, last, increment)]

    if build_matrix:
        schedule_workers(_create_matrix, lnames)
    schedule_workers(_collect_statistics_lname_coeff,
                     ((x, y) for x in lnames for y in coefficients))
def delayed_cluster_sets_from_marktables(limit_to_surnames=False):
    """Collect (table, bibref) pairs per last-name cluster from the mark tables.

    Rows of get_bib10x() are tagged 100 and rows of get_bib70x() are tagged
    700; each row is expected to be a (bibref, name) pair. Names are
    normalised with generate_last_name_cluster_str().

    :param limit_to_surnames: falsy for no restriction, otherwise an iterable
        of surnames; names whose cluster string is not among the cluster
        strings of these surnames are skipped.
    :return: (creators, names, sizes) as three parallel lists sorted by
        ascending size, where a creator is what delayed_create_from_mark()
        returns and a size is the number of items that
        get_signatures_from_bibrefs() yields for the bucket.
    """
    # { name -> [(table, bibref)] }
    bibauthor_print('Delayed_cluster_set_from_marktables limited to %s' % str(limit_to_surnames))

    wanted = None
    if limit_to_surnames:
        wanted = set(generate_last_name_cluster_str(s) for s in limit_to_surnames)

    name_buket = {}
    # Plain nested loops: zipping a transposed table with cycle((100,))
    # raises on an empty table, since only the endless cycle is left.
    for tab, rows in ((100, get_bib10x()), (700, get_bib70x())):
        for ref, name in rows:
            name = generate_last_name_cluster_str(name)
            if wanted and name not in wanted:
                continue
            # In-place append keeps bucket building linear.
            name_buket.setdefault(name, []).append((tab, ref))

    bibauthor_print('Delayed_cluster_set_from_marktables going to get %s  signatures....' % str(len(name_buket)))

    all_refs = sorted(((name, refs, len(list(get_signatures_from_bibrefs(refs))))
                       for name, refs in name_buket.items()),
                      key=itemgetter(2))

    # Build concrete lists so callers can index them and call .index().
    creators = [delayed_create_from_mark(set(item[1]), item[0]) for item in all_refs]
    names = [item[0] for item in all_refs]
    sizes = [item[2] for item in all_refs]
    return (creators, names, sizes)
Ejemplo n.º 6
0
def delayed_cluster_sets_from_marktables():
    """Group all bibrefs of the 100/700 mark tables by last-name cluster.

    Rows of get_bib10x() are tagged 100 and rows of get_bib70x() are tagged
    700; each row is expected to be a (bibref, name) pair. Names are
    normalised with generate_last_name_cluster_str().

    :return: three parallel lists sorted by ascending size: the results of
        delayed_create_from_mark(refs, name), the last-name cluster strings,
        and the number of items get_signatures_from_bibrefs() yields for
        each bucket.
    """
    # { name -> [(table, bibref)] }
    name_buket = {}
    # Walk the rows directly; the izip()/cycle() transposition fails on an
    # empty table because only the endless cycle remains to be zipped.
    for tab, rows in ((100, get_bib10x()), (700, get_bib70x())):
        for ref, name in rows:
            name = generate_last_name_cluster_str(name)
            # Append in place rather than copying the bucket for every row.
            name_buket.setdefault(name, []).append((tab, ref))

    all_refs = [(name, refs, len(list(get_signatures_from_bibrefs(refs))))
                for name, refs in name_buket.items()]
    all_refs.sort(key=itemgetter(2))

    # Concrete lists so callers can index them and call .index().
    return ([delayed_create_from_mark(item[1], item[0]) for item in all_refs],
            [item[0] for item in all_refs],
            [item[2] for item in all_refs])
def create_lastname_list_from_personid(last_modification):
    '''
    Group the names returned by get_all_modified_names_from_personid()
    by last-name cluster.

    Returns a generator of (last_name, [personid, ...], total) triples,
    where total is the sum of the third field of every grouped row.
    '''
    # Rows are expected to look like (personid, [full names], Nbibs).
    rows = get_all_modified_names_from_personid(last_modification)

    # Reduce each row to (personid, last_name, Nbibs), using the first
    # name obtained by iterating over the row's names.
    reduced = ((row[0], generate_last_name_cluster_str(iter(row[1]).next()), row[2])
               for row in rows)

    # groupby() only merges adjacent items, hence the sort on last_name.
    ordered = sorted(reduced, key=itemgetter(1))

    def _per_last_name():
        for last_name, group in groupby(ordered, key=itemgetter(1)):
            members = list(group)
            yield (last_name,
                   map(itemgetter(0), members),
                   sum(member[2] for member in members))

    return _per_last_name()
Ejemplo n.º 8
0
def create_lastname_list_from_personid(last_modification):
    '''
    Produce, for every last-name cluster among the names returned by
    get_all_modified_names_from_personid(), the personids that carry it.

    The result is a generator of (last_name, [personid, ...], total)
    triples; total adds up the third field of the rows in the group.
    '''
    by_last_name = itemgetter(1)

    # ((personid, [full Name1], Nbibs) ... )
    modified = get_all_modified_names_from_personid(last_modification)

    # ((personid, last_name, Nbibs) ... ) -- the last name is derived from
    # the first item obtained by iterating over the names field.
    with_last_name = (
        (entry[0], generate_last_name_cluster_str(iter(entry[1]).next()), entry[2])
        for entry in modified)

    # Sort first: groupby() only groups consecutive equal keys.
    grouped = groupby(sorted(with_last_name, key=by_last_name), key=by_last_name)

    # Materialise each group so it can be traversed twice below.
    materialised = ((last_name, list(group)) for last_name, group in grouped)

    # ((last_name, [personid ...], Nbibs) ... )
    return ((last_name, map(itemgetter(0), members), sum(m[2] for m in members))
            for last_name, members in materialised)
Ejemplo n.º 9
0
def tortoise_last_name(name, from_mark=False, pure=False):
    """Build the matrix for one last-name cluster, then wedge and store it.

    :param name: name whose last-name cluster should be processed.
    :param from_mark: when true, take the cluster sets from the mark tables;
        otherwise take them from personid.
    :param pure: forwarded to delayed_cluster_sets_from_personid(); must not
        be combined with from_mark.

    If the last name is not among the available clusters, a message is
    printed and nothing else happens.
    """
    assert not (from_mark and pure)

    lname = generate_last_name_cluster_str(name)

    if from_mark:
        clusters, lnames, sizes = delayed_cluster_sets_from_marktables()
    else:
        clusters, lnames, sizes = delayed_cluster_sets_from_personid(pure)

    not_found = "Sorry, %s(%s) not found in the last name clusters" % (name, lname)

    # A missing value makes .index() raise ValueError, not IndexError, so
    # the lookup needs its own handler for the message to be reachable.
    try:
        idx = lnames.index(lname)
    except ValueError:
        bibauthor_print(not_found)
        return

    try:
        cluster = clusters[idx]
        size = sizes[idx]
        bibauthor_print("Found, %s(%s). Total number of bibs: %d." % (name, lname, size))
        cluster_set = cluster()
        create_matrix(cluster_set, True)
        wedge_and_store(cluster_set)
    except IndexError:
        bibauthor_print(not_found)
Ejemplo n.º 10
0
def tortoise_last_name(name, from_mark=False, pure=False):
    """Build the matrix for one last-name cluster, then wedge and store it.

    :param name: name whose last-name cluster should be processed.
    :param from_mark: when true, take the cluster sets from the mark tables
        (restricted to this last name); otherwise take them from personid.
    :param pure: forwarded to delayed_cluster_sets_from_personid(); must not
        be combined with from_mark.
    """
    bibauthor_print('Start working on %s' % name)
    assert not from_mark or not pure

    lname = generate_last_name_cluster_str(name)

    if not from_mark:
        bibauthor_print(' ... from pid, pure')
        clusters, lnames, sizes = delayed_cluster_sets_from_personid(pure)
        bibauthor_print(' ... delayed pure done!')
    else:
        bibauthor_print(' ... from mark!')
        clusters, lnames, sizes = delayed_cluster_sets_from_marktables([lname])
        bibauthor_print(' ... delayed done')

    # No error handling: a failed lookup propagates to the caller.
    position = lnames.index(lname)
    make_cluster_set = clusters[position]
    n_bibs = sizes[position]
    cluster_set = make_cluster_set()
    bibauthor_print("Found, %s(%s). Total number of bibs: %d." % (name, lname, n_bibs))
    create_matrix(cluster_set, True)
    wedge_and_store(cluster_set)
Ejemplo n.º 11
0
def tortoise_last_name(name, from_mark=True, pure=False):
    """Process a single last-name cluster: create its matrix, wedge, store.

    :param name: name whose last-name cluster should be processed.
    :param from_mark: when true (the default), the cluster sets come from
        the mark tables, restricted to this last name; otherwise they come
        from personid.
    :param pure: forwarded to delayed_cluster_sets_from_personid(); must not
        be combined with from_mark.
    """
    bibauthor_print('Start working on %s' % name)
    assert not from_mark or not pure

    lname = generate_last_name_cluster_str(name)

    if from_mark:
        bibauthor_print(' ... from mark!')
        delayed = delayed_cluster_sets_from_marktables([lname])
        clusters, lnames, sizes = delayed
        bibauthor_print(' ... delayed done')
    else:
        bibauthor_print(' ... from pid, pure=%s' % str(pure))
        delayed = delayed_cluster_sets_from_personid(pure)
        clusters, lnames, sizes = delayed
        bibauthor_print(' ... delayed pure done!')

    # Errors from the lookup below are deliberately left to propagate.
    where = lnames.index(lname)
    build_cluster_set = clusters[where]
    bib_count = sizes[where]
    cluster_set = build_cluster_set()
    bibauthor_print("Found, %s(%s). Total number of bibs: %d." % (name, lname, bib_count))
    create_matrix(cluster_set, False)
    wedge_and_store(cluster_set)
Ejemplo n.º 12
0
 def test_generate_last_name_cluster(self):
     # 'Surnameone Surnametwo, Name' must cluster to 'surnameonesurnametwo'.
     # assertEqual replaces the deprecated assertEquals alias.
     full_name = 'Surnameone Surnametwo, Name'
     self.assertEqual(generate_last_name_cluster_str(full_name),
                      'surnameonesurnametwo')