コード例 #1
0
    def testFourTypes(self):
        """Check that four_types() reports 'event' entries under 'subject'."""
        cases = [
            # 'event' entries are merged into an existing 'subject' dict
            (
                {'subject': {'Science': 2}, 'event': {'Party': 1}},
                {'subject': {'Science': 2, 'Party': 1}},
            ),
            # with no 'subject' in the input, one is expected in the output
            (
                {'event': {'Party': 1}},
                {'subject': {'Party': 1}},
            ),
        ]
        for given, wanted in cases:
            self.assertEqual(four_types(given), wanted)
コード例 #2
0
    def testFourTypes(self):
        """Verify four_types() moves 'event' counts into 'subject'."""

        def check(given, wanted):
            # One assertion per scenario, in the same order as listed below.
            self.assertEqual(four_types(given), wanted)

        # 'subject' already present: the event is merged into it.
        check(
            {'subject': {'Science': 2}, 'event': {'Party': 1}},
            {'subject': {'Science': 2, 'Party': 1}},
        )

        # Only 'event' present: the result carries it under 'subject'.
        check(
            {'event': {'Party': 1}},
            {'subject': {'Party': 1}},
        )
コード例 #3
0
    def testFourTypes(self):
        """Exercise four_types() and find_subjects() on small fixtures."""
        # (input, expected) pairs for four_types(): 'event' entries are
        # expected to come back under 'subject'.
        four_types_cases = [
            (
                {'subject': {'Science': 2}, 'event': {'Party': 1}},
                {'subject': {'Science': 2, 'Party': 1}},
            ),
            (
                {'event': {'Party': 1}},
                {'subject': {'Party': 1}},
            ),
        ]
        for given, wanted in four_types_cases:
            self.assertEqual(four_types(given), wanted)

        # (marc fields, expected) pairs for find_subjects(); each fixture is
        # a list holding one list of (tag, raw field) tuples.
        find_subjects_cases = [
            (
                [[
                    ('650', ' 0\x1faRhodes, Dan (Fictitious character)\x1fvFiction.\x1e'),
                    ('650', ' 0\x1faSheriffs\x1fvFiction.\x1e'),
                    ('651', ' 0\x1faTexas\x1fvFiction.\x1e'),
                ]],
                {
                    'place': {u'Texas': 1},
                    'subject': {
                        u'Dan Rhodes (Fictitious character)': 1,
                        u'Sheriffs': 1,
                        u'Sheriffs in fiction': 1,
                        u'Texas in fiction': 1,
                        u'Fiction': 3,
                    },
                },
            ),
            (
                [[
                    ('650', ' 0\x1faSpies\x1fzFrance\x1fzParis\x1fvFiction.\x1e'),
                    ('651', ' 0\x1faFrance\x1fxHistory\x1fyDirectory, 1795-1799\x1fvFiction.\x1e'),
                ]],
                {
                    'subject': {
                        u'History': 1,
                        u'France in fiction': 1,
                        u'Spies': 1,
                        u'Spies in fiction': 1,
                        u'Fiction': 2,
                    },
                    'place': {u'Paris': 1, u'France': 2},
                    'time': {u'Directory, 1795-1799': 1},
                },
            ),
        ]
        for marc, wanted in find_subjects_cases:
            self.assertEqual(find_subjects(marc), wanted)
コード例 #4
0
ファイル: work_post.py プロジェクト: RaceList/openlibrary
def build_doc(w):
    """Build the ``doc`` element that describes one work.

    Args:
        w: work dict.  Reads ``key``, ``title``, ``editions`` and ``authors``
            and, when present, ``subtitle`` and ``marc_subjects``.

    Returns:
        The populated ``doc`` element, or None when the work number is
        listed in ``long_subjects``.

    Raises:
        ValueError: re-raised from ``find_subjects`` after the raw
            ``marc_subjects`` value has been printed.
        AssertionError: when a multi-valued edition/author field is not a
            list, or a language code is not exactly three characters long.

    Side effects:
        Edition dicts are modified in place: ``pub_year`` is added when a
        year can be extracted from ``publish_date``, and ``publishers`` is
        replaced by a list (so the same edition cannot be processed twice).
    """
    wkey_num = int(re_work_key.match(w['key']).group(1))
    if wkey_num in long_subjects:
        return None

    def get_pub_year(e):
        # Year captured by re_year in the edition's publish_date, else None.
        pub_date = e.get('publish_date', None)
        if pub_date:
            m = re_year.search(pub_date)
            if m:
                return m.group(1)
        return None

    editions = []
    for e in w['editions']:
        pub_year = get_pub_year(e)
        if pub_year:
            e['pub_year'] = pub_year
        editions.append(e)

    # Editions without a year have key None, which Python 2 orders first.
    editions.sort(key=lambda e: e.get('pub_year', None))

    doc = Element("doc")
    add_field(doc, 'key', 'OL%dW' % wkey_num)
    add_field(doc, 'title', w['title'])

    has_fulltext = any(e.get('ia', None) for e in editions)
    add_field(doc, 'has_fulltext', has_fulltext)
    if w.get('subtitle', None):
        add_field(doc, 'subtitle', w['subtitle'])

    # Every edition title / work title / other title that differs from the
    # work's own title.
    alt_titles = set()
    for e in editions:
        if e.get('title', None):
            t = e['title']
            if t != w['title']:
                alt_titles.add(t)
        for f in 'work_titles', 'other_titles':
            if f not in e:
                continue
            assert isinstance(e[f], list)
            for t in e[f]:
                if t != w['title']:
                    alt_titles.add(t)
    add_field_list(doc, 'alternative_title', alt_titles)

    work_subtitle = w.get('subtitle', None)
    alt_subtitles = set(
        e['subtitle'] for e in editions
        if e.get('subtitle', None) and e['subtitle'] != work_subtitle
    )
    # Fix: the original handed the whole set to add_field(); every other
    # collection in this function is emitted value-by-value through
    # add_field_list(), so do the same here.
    add_field_list(doc, 'alternative_subtitle', alt_subtitles)

    add_field(doc, 'edition_count', len(editions))
    for e in editions:
        add_field(doc, 'edition_key', 'OL%dM' % e['ekey'])
    if wkey_num in covers:
        add_field(doc, 'cover_edition_key', 'OL%dM' % covers[wkey_num])

    # Single-valued edition attributes, de-duplicated across editions.
    name = 'by_statement'
    add_field_list(doc, name, set(e[name] for e in editions if e.get(name, None)))

    name = 'publish_date'
    add_field_list(doc, name, set(e[name] for e in editions if e.get(name, None)))

    pub_years = set(e['pub_year'] for e in editions if 'pub_year' in e)
    if pub_years:
        add_field_list(doc, 'publish_year', pub_years)
        add_field(doc, 'first_publish_year', min(int(i) for i in pub_years))

    name = 'first_sentence'
    add_field_list(doc, name, set(e[name] for e in editions if e.get(name, None)))

    # (edition key, output field name) for list-valued edition attributes.
    field_map = [
        ('lccn', 'lccn'),
        ('publishers', 'publisher'),
        ('publish_places', 'publish_place'),
        ('oclc_numbers', 'oclc'),
        ('contributions', 'contributor'),
    ]

    for db_key, search_key in field_map:
        values = set()
        for e in editions:
            if db_key not in e:
                continue
            if db_key == 'publishers':
                # Publishers arrive as one tab-separated string; the list
                # form is stored back on the edition, as before.
                e[db_key] = [
                    'Sine nomine' if is_sine_nomine(i) else i
                    for i in e[db_key].split('\t')
                ]
            assert isinstance(e[db_key], list)
            values.update(e[db_key])
        add_field_list(doc, search_key, values)

    isbn = set()
    for e in editions:
        for f in 'isbn_10', 'isbn_13':
            if f not in e:
                continue
            assert isinstance(e[f], list)
            for v in e[f]:
                isbn.add(v.replace('-', ''))
    add_field_list(doc, 'isbn', isbn)

    lang = set()
    for e in editions:
        if 'languages' not in e:
            continue
        assert isinstance(e['languages'], list)
        for entry in e['languages']:
            # Each entry may itself hold several tab-separated codes.
            for code in entry.split('\t'):
                if len(code) != 3:
                    # Show the offending value before the assert fires.
                    print(e['languages'])
                assert len(code) == 3
                lang.add(code)
    if lang:
        add_field_list(doc, 'language', lang)

    # Identifiers ending in 'goog' are listed after all the others.
    goog = set()
    non_goog = set()
    for e in editions:
        if 'ia' in e:
            assert isinstance(e['ia'], list)
            for i in e['ia']:
                i = i.strip()
                if i.endswith('goog'):
                    goog.add(i)
                else:
                    non_goog.add(i)
    add_field_list(doc, 'ia', list(non_goog) + list(goog))

    authors = w['authors']
    author_keys = ['OL%dA' % a['akey'] for a in authors]
    author_names = [a.get('name', '') or '' for a in authors]

    add_field_list(doc, 'author_key', author_keys)
    add_field_list(doc, 'author_name', author_names)

    alt_names = set()
    for a in authors:
        if 'alt_names' not in a:
            continue
        assert isinstance(a['alt_names'], list)
        alt_names.update(a['alt_names'])

    add_field_list(doc, 'author_alternative_name', alt_names)
    add_field_list(doc, 'author_facet',
                   (' '.join(v) for v in zip(author_keys, author_names)))

    if 'marc_subjects' in w:
        try:
            # NOTE(review): eval() executes whatever expression the record
            # holds.  If the value is always a plain literal,
            # ast.literal_eval would be the safe replacement -- confirm the
            # stored format before switching.
            marc_subjects = eval(w['marc_subjects'])
        except Exception:
            # Best effort: an unparsable value is reported and treated as
            # "no subjects".  Unlike the former bare except, this no longer
            # swallows KeyboardInterrupt / SystemExit.
            print('error parsing marc subjects (%d)' % len(w['marc_subjects']))
            marc_subjects = []
        try:
            subjects = find_subjects(marc_subjects)
        except ValueError:
            print(w['marc_subjects'])
            raise

        subjects = four_types(subjects)

        for kind in 'person', 'place', 'subject', 'time':
            if kind not in subjects:
                continue
            names = list(subjects[kind].keys())
            add_field_list(doc, kind, names)
            add_field_list(doc, kind + '_key', [str_to_key(s) for s in names])

    return doc
コード例 #5
0
ファイル: find_works.py プロジェクト: artmedlar/openlibrary
def find_works(akey, book_iter, existing=None):
    """Group editions into works and yield one dict per work.

    Args:
        akey: unused by this function; kept for interface compatibility.
        book_iter: iterable of edition dicts.  Reads ``key``, ``title`` and
            ``norm_title`` from every edition and, when present,
            ``norm_wt``, ``work_title``, ``lang``, ``subtitle`` and
            ``table_of_contents``.
        existing: optional mapping of title -> work title.  Both sides are
            normalised with ``mk_norm`` and override the computed title map.
            Defaults to no overrides.

    Yields:
        Work dicts with ``title`` and ``editions`` and, when available,
        ``subtitle``, ``toc`` and ``subjects``.  Works are yielded in
        ascending order of (edition count, normalised title).
    """
    if existing is None:
        existing = {}

    equiv = defaultdict(int)  # (normalized title, normalized work title) -> count
    norm_titles = defaultdict(int)  # normalized title -> count
    books_by_key = {}
    books = []
    # normalized work title -> {regular title: count}
    rev_wt = defaultdict(lambda: defaultdict(int))

    print('find_works')

    for book in book_iter:
        if 'norm_wt' in book:
            pair = (book['norm_title'], book['norm_wt'])
            equiv[pair] += 1
            rev_wt[book['norm_wt']][book['work_title']] += 1
        norm_titles[book['norm_title']] += 1
        books_by_key[book['key']] = book
        books.append(book)

    title_map = build_work_title_map(equiv, norm_titles)

    for a, b in existing.items():
        norm_a = mk_norm(a)
        norm_b = mk_norm(b)
        rev_wt[norm_b][norm_a] += 1
        title_map[norm_a] = norm_b

    # works: normalized title -> {display title: [edition keys]}
    works = defaultdict(lambda: defaultdict(list))
    # work_titles: normalized work title -> keys of editions that are not
    # marked 'eng' but carry a work title.
    work_titles = defaultdict(list)
    for b in books:
        if 'eng' not in b.get('lang', []) and 'norm_wt' in b:
            work_titles[b['norm_wt']].append(b['key'])
            continue
        n = b['norm_title']
        title = b['title']
        if n in title_map:
            n = title_map[n]
            title = top_rev_wt(rev_wt[n])
        works[n][title].append(b['key'])

    # list(...) keeps the concatenation valid where values() is a view.
    works = sorted(
        (sum(map(len, list(w.values()) + [work_titles[n]])), n, w)
        for n, w in works.items()
    )

    for work_count, norm, w in works:
        # Title backed by the most editions; max() returns the first of
        # several equally long entries, as the stable reverse sort did.
        first = max(w.items(), key=lambda i: len(i[1]))[0]

        # Copy so that the list stored in work_titles is not extended in
        # place (the original aliased it and then used +=).
        keys = list(work_titles[norm])
        for values in w.values():
            keys += values
        assert work_count == len(keys)

        toc = {}
        for k in keys:
            contents = books_by_key[k].get('table_of_contents', None)
            if contents:
                toc[k] = contents

        editions = [books_by_key[k] for k in keys]
        edition_count = len(editions)

        # normalized subtitle -> {subtitle as written: count}
        subtitles = defaultdict(lambda: defaultdict(int))
        with_subtitle_count = 0
        for e in editions:
            # .get(): an edition without the key counts as "no subtitle"
            # instead of raising KeyError.
            subtitle = e.get('subtitle') or ''
            if subtitle != '':
                with_subtitle_count += 1
            norm_subtitle = mk_norm(subtitle)
            if norm_subtitle != norm:
                subtitles[norm_subtitle][subtitle] += 1

        use_subtitle = None
        for k, v in subtitles.items():
            lc_k = k.strip(' .').lower()
            if lc_k in ('', 'roman') or 'edition' in lc_k:
                continue
            num = sum(v.values())
            overall = float(num) / float(edition_count)
            ratio = float(num) / float(with_subtitle_count)
            # Adopt a subtitle found on more than 20% of all editions and
            # more than half of the editions that have any subtitle.
            if overall > 0.2 and ratio > 0.5:
                use_subtitle = freq_dict_top(v)

        work = {'title': first, 'editions': editions}
        if use_subtitle:
            work['subtitle'] = use_subtitle
        if toc:
            work['toc'] = toc
        subjects = four_types(find_subjects(get_marc_subjects(work)))
        if subjects:
            work['subjects'] = subjects
        yield work
コード例 #6
0
ファイル: work_post.py プロジェクト: yzou/openlibrary
def build_doc(w):
    """Build the ``doc`` element that describes one work.

    Args:
        w: work dict.  Reads ``key``, ``title``, ``editions`` and ``authors``
            and, when present, ``subtitle`` and ``marc_subjects``.

    Returns:
        The populated ``doc`` element, or None when the work number is
        listed in ``long_subjects``.

    Raises:
        ValueError: re-raised from ``find_subjects`` after the raw
            ``marc_subjects`` value has been printed.
        AssertionError: when a multi-valued edition/author field is not a
            list, or a language code is not exactly three characters long.

    Side effects:
        Edition dicts are modified in place: ``pub_year`` is added when a
        year can be extracted from ``publish_date``, and ``publishers`` is
        replaced by a list (so the same edition cannot be processed twice).
    """
    wkey_num = int(re_work_key.match(w['key']).group(1))
    if wkey_num in long_subjects:
        return None

    def get_pub_year(e):
        # Year captured by re_year in the edition's publish_date, else None.
        pub_date = e.get('publish_date', None)
        if pub_date:
            m = re_year.search(pub_date)
            if m:
                return m.group(1)
        return None

    editions = []
    for e in w['editions']:
        pub_year = get_pub_year(e)
        if pub_year:
            e['pub_year'] = pub_year
        editions.append(e)

    # Editions without a year have key None, which Python 2 orders first.
    editions.sort(key=lambda e: e.get('pub_year', None))

    doc = Element("doc")
    add_field(doc, 'key', 'OL%dW' % wkey_num)
    add_field(doc, 'title', w['title'])

    has_fulltext = any(e.get('ia', None) for e in editions)
    add_field(doc, 'has_fulltext', has_fulltext)
    if w.get('subtitle', None):
        add_field(doc, 'subtitle', w['subtitle'])

    # Every edition title / work title / other title that differs from the
    # work's own title.
    alt_titles = set()
    for e in editions:
        if e.get('title', None):
            t = e['title']
            if t != w['title']:
                alt_titles.add(t)
        for f in 'work_titles', 'other_titles':
            if f not in e:
                continue
            assert isinstance(e[f], list)
            for t in e[f]:
                if t != w['title']:
                    alt_titles.add(t)
    add_field_list(doc, 'alternative_title', alt_titles)

    work_subtitle = w.get('subtitle', None)
    alt_subtitles = set(
        e['subtitle'] for e in editions
        if e.get('subtitle', None) and e['subtitle'] != work_subtitle
    )
    # Fix: the original handed the whole set to add_field(); every other
    # collection in this function is emitted value-by-value through
    # add_field_list(), so do the same here.
    add_field_list(doc, 'alternative_subtitle', alt_subtitles)

    add_field(doc, 'edition_count', len(editions))
    for e in editions:
        add_field(doc, 'edition_key', 'OL%dM' % e['ekey'])
    if wkey_num in covers:
        add_field(doc, 'cover_edition_key', 'OL%dM' % covers[wkey_num])

    # Single-valued edition attributes, de-duplicated across editions.
    name = 'by_statement'
    add_field_list(doc, name, set(e[name] for e in editions if e.get(name, None)))

    name = 'publish_date'
    add_field_list(doc, name, set(e[name] for e in editions if e.get(name, None)))

    pub_years = set(e['pub_year'] for e in editions if 'pub_year' in e)
    if pub_years:
        add_field_list(doc, 'publish_year', pub_years)
        add_field(doc, 'first_publish_year', min(int(i) for i in pub_years))

    name = 'first_sentence'
    add_field_list(doc, name, set(e[name] for e in editions if e.get(name, None)))

    # (edition key, output field name) for list-valued edition attributes.
    field_map = [
        ('lccn', 'lccn'),
        ('publishers', 'publisher'),
        ('publish_places', 'publish_place'),
        ('oclc_numbers', 'oclc'),
        ('contributions', 'contributor'),
    ]

    for db_key, search_key in field_map:
        values = set()
        for e in editions:
            if db_key not in e:
                continue
            if db_key == 'publishers':
                # Publishers arrive as one tab-separated string; the list
                # form is stored back on the edition, as before.
                e[db_key] = [
                    'Sine nomine' if is_sine_nomine(i) else i
                    for i in e[db_key].split('\t')
                ]
            assert isinstance(e[db_key], list)
            values.update(e[db_key])
        add_field_list(doc, search_key, values)

    isbn = set()
    for e in editions:
        for f in 'isbn_10', 'isbn_13':
            if f not in e:
                continue
            assert isinstance(e[f], list)
            for v in e[f]:
                isbn.add(v.replace('-', ''))
    add_field_list(doc, 'isbn', isbn)

    lang = set()
    for e in editions:
        if 'languages' not in e:
            continue
        assert isinstance(e['languages'], list)
        for entry in e['languages']:
            # Each entry may itself hold several tab-separated codes.
            for code in entry.split('\t'):
                if len(code) != 3:
                    # Show the offending value before the assert fires.
                    print(e['languages'])
                assert len(code) == 3
                lang.add(code)
    if lang:
        add_field_list(doc, 'language', lang)

    # Identifiers ending in 'goog' are listed after all the others.
    goog = set()
    non_goog = set()
    for e in editions:
        if 'ia' in e:
            assert isinstance(e['ia'], list)
            for i in e['ia']:
                i = i.strip()
                if i.endswith('goog'):
                    goog.add(i)
                else:
                    non_goog.add(i)
    add_field_list(doc, 'ia', list(non_goog) + list(goog))

    authors = w['authors']
    author_keys = ['OL%dA' % a['akey'] for a in authors]
    author_names = [a.get('name', '') or '' for a in authors]

    add_field_list(doc, 'author_key', author_keys)
    add_field_list(doc, 'author_name', author_names)

    alt_names = set()
    for a in authors:
        if 'alt_names' not in a:
            continue
        assert isinstance(a['alt_names'], list)
        alt_names.update(a['alt_names'])

    add_field_list(doc, 'author_alternative_name', alt_names)
    add_field_list(doc, 'author_facet',
                   (' '.join(v) for v in zip(author_keys, author_names)))

    if 'marc_subjects' in w:
        try:
            # NOTE(review): eval() executes whatever expression the record
            # holds.  If the value is always a plain literal,
            # ast.literal_eval would be the safe replacement -- confirm the
            # stored format before switching.
            marc_subjects = eval(w['marc_subjects'])
        except Exception:
            # Best effort: an unparsable value is reported and treated as
            # "no subjects".  Unlike the former bare except, this no longer
            # swallows KeyboardInterrupt / SystemExit.
            print('error parsing marc subjects (%d)' % len(w['marc_subjects']))
            marc_subjects = []
        try:
            subjects = find_subjects(marc_subjects)
        except ValueError:
            print(w['marc_subjects'])
            raise

        subjects = four_types(subjects)

        for kind in 'person', 'place', 'subject', 'time':
            if kind not in subjects:
                continue
            names = list(subjects[kind].keys())
            add_field_list(doc, kind, names)
            add_field_list(doc, kind + '_key', [str_to_key(s) for s in names])

    return doc
コード例 #7
0
    def testFourTypes(self):
        """Run four_types() and find_subjects() against known fixtures."""
        # four_types(): 'event' entries are expected under 'subject'.
        merged = four_types({
            'subject': {'Science': 2},
            'event': {'Party': 1},
        })
        self.assertEqual(merged, {'subject': {'Science': 2, 'Party': 1}})

        event_only = four_types({'event': {'Party': 1}})
        self.assertEqual(event_only, {'subject': {'Party': 1}})

        # find_subjects(): each fixture is a list holding one list of
        # (tag, raw field) tuples, paired with the expected counts.
        texas_fields = [
            ('650', ' 0\x1faRhodes, Dan (Fictitious character)\x1fvFiction.\x1e'),
            ('650', ' 0\x1faSheriffs\x1fvFiction.\x1e'),
            ('651', ' 0\x1faTexas\x1fvFiction.\x1e'),
        ]
        texas_expected = {
            'place': {u'Texas': 1},
            'subject': {
                u'Dan Rhodes (Fictitious character)': 1,
                u'Sheriffs': 1,
                u'Sheriffs in fiction': 1,
                u'Texas in fiction': 1,
                u'Fiction': 3,
            },
        }

        france_fields = [
            ('650', ' 0\x1faSpies\x1fzFrance\x1fzParis\x1fvFiction.\x1e'),
            ('651', ' 0\x1faFrance\x1fxHistory\x1fyDirectory, 1795-1799\x1fvFiction.\x1e'),
        ]
        france_expected = {
            'subject': {
                u'History': 1,
                u'France in fiction': 1,
                u'Spies': 1,
                u'Spies in fiction': 1,
                u'Fiction': 2,
            },
            'place': {u'Paris': 1, u'France': 2},
            'time': {u'Directory, 1795-1799': 1},
        }

        fixtures = (
            (texas_fields, texas_expected),
            (france_fields, france_expected),
        )
        for fields, wanted in fixtures:
            self.assertEqual(find_subjects([fields]), wanted)