Ejemplo n.º 1
0
def solr_updates(i):
    global subjects_to_update, authors_to_update
    t0 = time()
    d = i['data']
    changeset = d['changeset']
    print 'author:', d['author']
    try:
        assert len(changeset['data']) == 2 and 'master' in changeset['data'] and 'duplicates' in changeset['data']
    except:
        print d['changeset']
        raise
    master_key = changeset['data']['master']
    dup_keys = changeset['data']['duplicates']
    assert dup_keys
    print d['changeset']
    print 'timestamp:', i['timestamp']
    print 'dups:', dup_keys
    print 'records to update:', len(d['result'])
     
    master = None
    obj_by_key = {}
    works = []
    editions_by_work = defaultdict(list)
    for obj in d['query']:
        obj_type = obj['type']['key']
        k = obj['key']
        if obj_type == '/type/work':
            works.append(obj['key'])
        elif obj_type == '/type/edition':
            if 'works' not in obj:
                continue
            for w in obj['works']:
                editions_by_work[w['key']].append(obj)
        obj_by_key[k] = obj
    master = obj_by_key.get(master_key)
    #print 'master:', master

    if len(d['result']) == 0:
        print i

    work_updates = []
    for wkey in works:
            #print 'editions_by_work:', editions_by_work
            work = obj_by_key[wkey]
            work['editions'] = editions_by_work[wkey]
            subjects = get_work_subjects(work)
            for subject_type, values in subjects.iteritems():
                subjects_to_update.update((subject_type, v) for v in values)
            ret = update_work(work, obj_cache=obj_by_key, debug=True)
            work_updates += ret
    solr_update(work_updates, debug=False, index='works')

    authors_to_update.append({ 'redirects': dup_keys, 'master_key': master_key, 'master': master})
    print 'authors to update:', len(authors_to_update)

    t1 = time() - t0
    update_times.append(t1)
    print 'update takes: %d seconds' % t1
    print
Ejemplo n.º 2
0
def run_update():
    global authors_to_update, works_to_update
    subjects_to_update = set()
    global last_update
    print 'running update: %s works %s authors' % (len(works_to_update),
                                                   len(authors_to_update))
    if works_to_update:
        requests = []
        num = 0
        total = len(works_to_update)
        for wkey in works_to_update:
            num += 1
            print 'update work: %s %d/%d' % (wkey, num, total)
            if '/' in wkey[7:]:
                print 'bad wkey:', wkey
                continue
            work_to_update = withKey(wkey)
            for attempt in range(5):
                try:
                    requests += update_work(work_to_update)
                except AuthorRedirect:
                    print 'fixing author redirect'
                    w = ol.get(wkey)
                    need_update = False
                    for a in w['authors']:
                        r = ol.get(a['author'])
                        if r['type'] == '/type/redirect':
                            a['author'] = {'key': r['location']}
                            need_update = True
                    if need_update:
                        if not done_login:
                            rc = read_rc()
                            ol.login('EdwardBot', rc['EdwardBot'])
                        ol.save(w['key'], w, 'avoid author redirect')
            if work_to_update['type'][
                    'key'] == '/type/work' and work_to_update.get('title'):
                subjects = get_work_subjects(work_to_update)
                print subjects
                for subject_type, values in subjects.iteritems():
                    subjects_to_update.update(
                        (subject_type, v) for v in values)
                if len(requests) >= 100:
                    solr_update(requests, debug=True)
                    requests = []
    #            if num % 1000 == 0:
    #                solr_update(['<commit/>'], debug=True)
        if requests:
            solr_update(requests, debug=True)
        if not args.no_commit:
            solr_update(['<commit/>'], debug=True)
    last_update = time()
    if not args.no_author_updates and authors_to_update:
        requests = []
        for akey in authors_to_update:
            print('update author:', repr(akey))
            try:
                request = update_author(akey)
                if request:
                    requests += request
            except AttributeError:
                print('akey:', repr(akey))
                raise
        if not args.no_commit:
            solr_update(requests + ['<commit/>'], debug=True)
    subject_add = Element("add")
    print subjects_to_update
    for subject_type, subject_name in subjects_to_update:
        key = subject_type + '/' + subject_name
        count = subject_count(subject_type, subject_name)

        if not subject_need_update(key, count):
            print 'no updated needed:', (subject_type, subject_name, count)
            continue
        print 'updated needed:', (subject_type, subject_name, count)

        doc = Element("doc")
        add_field(doc, 'key', key)
        add_field(doc, 'name', subject_name)
        add_field(doc, 'type', subject_type)
        add_field(doc, 'count', count)
        subject_add.append(doc)

    if len(subject_add):
        print 'updating subjects'
        add_xml = tostring(subject_add).encode('utf-8')
        solr_update([add_xml, '<commit />'], debug=True)

    authors_to_update = set()
    works_to_update = set()
    subjects_to_update = set()
    print >> open(state_file, 'w'), offset
Ejemplo n.º 3
0
def run_update():
    global authors_to_update, works_to_update
    subjects_to_update = set()
    global last_update
    print 'running update: %s works %s authors' % (len(works_to_update), len(authors_to_update))
    if works_to_update:
        requests = []
        num = 0
        total = len(works_to_update)
        for wkey in works_to_update:
            num += 1
            print 'update work: %s %d/%d' % (wkey, num, total)
            if '/' in wkey[7:]:
                print 'bad wkey:', wkey
                continue
            work_to_update = withKey(wkey)
            for attempt in range(5):
                try:
                    requests += update_work(work_to_update)
                except AuthorRedirect:
                    print 'fixing author redirect'
                    w = ol.get(wkey)
                    need_update = False
                    for a in w['authors']:
                        r = ol.get(a['author'])
                        if r['type'] == '/type/redirect':
                            a['author'] = {'key': r['location']}
                            need_update = True
                    if need_update:
                        if not done_login:
                            rc = read_rc()
                            ol.login('EdwardBot', rc['EdwardBot'])
                        ol.save(w['key'], w, 'avoid author redirect')
            if work_to_update['type']['key'] == '/type/work' and work_to_update.get('title'):
                subjects = get_work_subjects(work_to_update)
                print subjects
                for subject_type, values in subjects.iteritems():
                    subjects_to_update.update((subject_type, v) for v in values)
                if len(requests) >= 100:
                    solr_update(requests, debug=True)
                    requests = []
    #            if num % 1000 == 0:
    #                solr_update(['<commit/>'], debug=True)
        if requests:
            solr_update(requests, debug=True)
        if not args.no_commit:
            solr_update(['<commit/>'], debug=True)
    last_update = time()
    if not args.no_author_updates and authors_to_update:
        requests = []
        for akey in authors_to_update:
            print 'update author:', `akey`
            try:
                request = update_author(akey)
                if request:
                    requests += request
            except AttributeError:
                print 'akey:', `akey`
                raise
        if not args.no_commit:
            solr_update(requests + ['<commit/>'], index='authors', debug=True)
    subject_add = Element("add")
    print subjects_to_update
    for subject_type, subject_name in subjects_to_update:
        key = subject_type + '/' + subject_name
        count = subject_count(subject_type, subject_name)

        if not subject_need_update(key, count):
            print 'no updated needed:', (subject_type, subject_name, count)
            continue
        print 'updated needed:', (subject_type, subject_name, count)

        doc = Element("doc")
        add_field(doc, 'key', key)
        add_field(doc, 'name', subject_name)
        add_field(doc, 'type', subject_type)
        add_field(doc, 'count', count)
        subject_add.append(doc)

    if len(subject_add):
        print 'updating subjects'
        add_xml = tostring(subject_add).encode('utf-8')
        solr_update([add_xml, '<commit />'], debug=True, index='subjects')

    authors_to_update = set()
    works_to_update = set()
    subjects_to_update = set()
    print >> open(state_file, 'w'), offset
Ejemplo n.º 4
0
def solr_updates(i):
    global subjects_to_update, authors_to_update
    t0 = time()
    d = i['data']
    changeset = d['changeset']
    print 'author:', d['author']
    try:
        assert len(changeset['data']) == 2 and 'master' in changeset['data'] and 'duplicates' in changeset['data']
    except:
        print d['changeset']
        raise
    master_key = changeset['data']['master']
    dup_keys = changeset['data']['duplicates']
    assert dup_keys
    print 'timestamp:', i['timestamp']
    print 'dups:', dup_keys
    print 'records to update:', len(d['result'])

    master = None
    obj_by_key = {}
    works = []
    editions_by_work = defaultdict(list)
    for obj in d['query']:
        obj_type = obj['type']['key']
        k = obj['key']
        if obj_type == '/type/work':
            works.append(obj['key'])
        elif obj_type == '/type/edition':
            if 'works' not in obj:
                continue
            for w in obj['works']:
                editions_by_work[w['key']].append(obj)
        obj_by_key[k] = obj
    master = obj_by_key.get(master_key)
    #print 'master:', master

    if len(d['result']) == 0:
        print i

    work_updates = []
    for wkey in works:
            #print 'editions_by_work:', editions_by_work
            work = obj_by_key[wkey]
            work['editions'] = editions_by_work[wkey]
            subjects = get_work_subjects(work)
            for subject_type, values in subjects.iteritems():
                subjects_to_update.update((subject_type, v) for v in values)
            try:
                ret = update_work(work, obj_cache=obj_by_key, debug=True)
            except AuthorRedirect:
                work = withKey(wkey)
                work['editions'] = editions_by_work[wkey]
                ret = update_work(work, debug=True, resolve_redirects=True)
            work_updates += ret
    if work_updates:
        solr_update(work_updates, debug=False, index='works')

    authors_to_update.append({ 'redirects': dup_keys, 'master_key': master_key, 'master': master})
    print 'authors to update:', len(authors_to_update)

    t1 = time() - t0
    update_times.append(t1)
    print 'update takes: %d seconds' % t1
    print