def query(isbn):
    """Query the classify.oclc service for metadata.

    Returns the mapped records for *isbn*, or {} when the service
    yields no usable data.
    """
    xml = wquery(
        SERVICE_URL.format(isbn=isbn),
        user_agent=UA,
        data_checker=None,
        parser=noparser)
    data = parser_edit(xml)
    if not data:
        # no 'edition' branch in the response: fall back to the 'work' branch
        data = parser_work(xml)
        if not data:  # pragma: no cover
            return {}
        # prefer the 'hyr' (high year) field, falling back to 'lyr' (low year)
        data['year'] = data.get('hyr', u('')) or data.get('lyr', u(''))
        return _records(isbn, data)
    oclc = data.get('oclc', u(''))
    if oclc:
        # second query: fetch publisher and year for this OCLC number
        data2 = wquery(
            SERVICE_URL2.format(oclc=oclc),
            user_agent=UA,
            data_checker=None,
            parser=parser_pub)
        if not data2:  # pragma: no cover
            return {}
        try:
            # 'Publisher' is expected to look like 'City : Publisher, 2009.'
            buf = data2.get('Publisher', u('')).split(':')[1]
            publisher, year = buf.split(',')
            data['publisher'] = publisher.strip()
            match = RE_YEAR.search(year.strip('. '))
            if match:
                data['year'] = match.group(0)
        except (IndexError, ValueError):  # pragma: no cover
            # unexpected 'Publisher' format (no ':' or no single ',') --
            # keep whatever metadata we already have instead of crashing
            pass
    return _records(isbn, data)
def test_newfilename():
    """Test new filename generation"""
    metadata = {
        'Title': 'A Dictionary Of The Internet',
        'Authors': ['Darrel Ince', 'Oxford University Press'],
        'Publisher': 'Oxford University Press',
        'ISBN-13': '9780199571444',
        'Language': 'eng',
        'Year': '2009',
    }
    # (pattern, expected filename) pairs exercised against full metadata
    cases = [
        ('{authorsLastNames}_{year}_{title}_{isbn}.epub',
         'Ince,Press_2009_A Dictionary Of '
         'The Internet_9780199571444.epub'),
        ('{authorsFullNames}_{publisher}_{language}',
         'Darrel Ince,Oxford University Press_Oxford University Press_eng'),
        ('myfile_{year} {authorsLastNames}.pdf',
         'myfile_2009 Ince,Press.pdf'),
        # an unknown placeholder must yield None
        ('myfile_{nokey}', None),
        ('{authorsFullNames}: {title}',
         'Darrel Ince,Oxford University Press: A '
         'Dictionary Of The Internet'),
        # a pattern with no placeholders comes back unchanged
        ('myfile.pdf', 'myfile.pdf'),
    ]
    for fmt, expected in cases:
        assert_equals(newfilename(metadata, fmt), expected)
    # an empty publisher is replaced by the UNKNOWN placeholder
    metadata['Publisher'] = u('')
    assert_equals(
        newfilename(metadata,
                    pattern='{authorsFullNames}_{publisher}_{language}'),
        'Darrel Ince,Oxford University Press_UNKNOWN_eng')
    # without a title there is not enough metadata for any filename
    metadata['Title'] = u('')
    assert_equals(newfilename(metadata), None)
def newfilename(metadata, pattern=PATTERN):
    """Return a new file name created from book metadata.

    Returns None when the pattern references an unknown placeholder or
    when there is not enough metadata (no Title or no ISBN-13).
    """
    pattern = pattern if pattern else PATTERN
    # Work on a shallow copy so the caller's dict is NOT mutated when
    # empty values are replaced by the UNKNOWN placeholder (the original
    # wrote the placeholders back into the caller's metadata).
    metadata = dict(metadata)
    for key in metadata:
        if not metadata[key]:
            metadata[key] = u('UNKNOWN')
    d = {
        'authorsFullNames': ','.join(metadata['Authors']),
        'year': metadata['Year'],
        'publisher': metadata['Publisher'],
        'title': metadata['Title'],
        'language': metadata['Language'],
        'isbn': metadata['ISBN-13'],
    }
    if d['title'] == u('UNKNOWN') or d['isbn'] == u('UNKNOWN'):
        LOGGER.critical('Not enough metadata')
        return None
    d['title'] = cleannewname(d['title'])
    # keep the title reasonably short, cutting on token boundaries
    cutoff = min(len(d['title']), CUTOFF)
    d['title'] = ' '.join(cutoff_tokens(d['title'].split(' '), cutoff))
    authorslastnames = [
        last_first(authorname)['last'] for authorname in metadata['Authors']
    ]
    d['authorsLastNames'] = ','.join(authorslastnames)
    d['firstAuthorLastName'] = authorslastnames[0]
    try:
        formatted = u(pattern).format(**d)
        return cleannewname(formatted)
    except KeyError as e:
        LOGGER.warning('Error with placeholder: %s', e)
        return None
def test_newfilename():
    """Test new filename generation"""
    metadata = {
        'Title': 'A Dictionary Of The Internet',
        'Authors': ['Darrel Ince', 'Oxford University Press'],
        'Publisher': 'Oxford University Press',
        'ISBN-13': '9780199571444',
        'Language': 'eng',
        'Year': '2009',
    }
    # full pattern: last names, year, title and isbn
    result = newfilename(metadata,
                         '{authorsLastNames}_{year}_{title}_{isbn}.epub')
    assert_equals(result, 'Ince,Press_2009_A Dictionary Of '
                  'The Internet_9780199571444.epub')
    result = newfilename(metadata, '{authorsFullNames}_{publisher}_{language}')
    assert_equals(
        result,
        'Darrel Ince,Oxford University Press_Oxford University Press_eng')
    result = newfilename(metadata, 'myfile_{year} {authorsLastNames}.pdf')
    assert_equals(result, 'myfile_2009 Ince,Press.pdf')
    # an unknown placeholder must yield None
    assert_equals(newfilename(metadata, 'myfile_{nokey}'), None)
    result = newfilename(metadata, '{authorsFullNames}: {title}')
    assert_equals(result, 'Darrel Ince,Oxford University Press: A '
                  'Dictionary Of The Internet')
    # a pattern with no placeholders comes back unchanged
    assert_equals(newfilename(metadata, 'myfile.pdf'), 'myfile.pdf')
    # an empty publisher falls back to the UNKNOWN placeholder
    metadata['Publisher'] = u('')
    assert_equals(
        newfilename(metadata,
                    pattern='{authorsFullNames}_{publisher}_{language}'),
        'Darrel Ince,Oxford University Press_UNKNOWN_eng')
    # without a title there is not enough metadata for any filename
    metadata['Title'] = u('')
    assert_equals(newfilename(metadata), None)
def parser_mcues(data):
    """Parse the response from the MCU service.

    The input data is the result webpage in html from the search.
    Returns a dict with Authors/Publisher/Title/Year (or {} when the
    page yields no usable record).
    """
    data = re.split('\n', data)  # split into lines for loop
    recs = {}
    # this should be an array, otherwise stdmeta gives a NotValidMetadataError
    recs['Authors'] = []
    try:
        for line in data:
            line = line.replace('\n', ' ')  # remove carriage return
            if len(recs) == 4:  # skip the rest of the file if we have all recs
                break
            # Author:
            # <strong>Garc<ED>a M<E1>rquez, Gabriel (1928- )</strong>
            elif re.search(r"\s{10}<strong>.+</strong>", line):
                authors = re.findall('>.+<', line)[0]
                # strip the tag brackets and the trailing '(birth year)' part
                authors = u(
                    authors.replace('>', '').replace('<', '').split('(')[0])
                recs['Authors'].append(authors)
            # Publisher: the link with tabindex="107", e.g.
            # <a href="/webISBN/editorialDetalle.do?..." tabindex="107">Ediciones C<E1>tedra, S.A.</a>
            elif re.search('tabindex=\"107\">', line):
                publisher = re.findall('>.+<', line)[0]
                recs['Publisher'] = u(
                    publisher.replace('>', '').replace('<', ''))
            # Title: the link with tabindex="106", e.g.
            # <a href="/webISBN/tituloDetalle.do?..." tabindex="106">Cien a<F1>os de soledad</a>
            elif re.search('tabindex=\"106\">', line):
                title = re.findall('>.+<', line)[0]
                recs['Title'] = u(title.replace('>', '').replace('<', ''))
            # Publication year, rendered like:
            # (1987) </strong>
            elif re.search(r'\(\d{4}\)', line):
                recs['Year'] = u(re.findall(r'\d{4}', line)[0])
            elif line == '':
                continue
    except IndexError:
        LOGGER.debug('Check the parsing for Spanish MCU (possible error!)')
    try:
        # delete almost empty records
        if not recs['Title'] and not recs['Authors']:
            recs = {}
    except KeyError:
        recs = {}
    return recs
def _mapper(isbn, records):
    """Return the canonical form of the given records.

    Canonical keys: ISBN-13, Title, Authors, Publisher, Year, Language.
    """
    if records:
        # stamp the queried ISBN onto the record, then let stdmeta do the
        # final cleaning and validation
        records['ISBN-13'] = u(isbn)
        return stdmeta(records)
    return {}  # pragma: no cover
def parser_pub(htmlthing):
    """RE parser for classify.oclc service (publisher and year)."""
    found = RE_PUB.search(u(htmlthing))
    if not found:
        return None
    try:
        # pair up field names with their values inside the matched span
        snippet = found.group()
        return dict(zip(RE_FP.findall(snippet), RE_VP.findall(snippet)))
    except Exception:  # pragma: no cover
        return None
def parser_edit(xmlthing):
    """RE parser for classify.oclc service (edition branch)."""
    found = RE_EDIT.search(u(xmlthing))
    if not found:
        return None
    try:
        # pair up field names with their values inside the matched span
        snippet = found.group()
        return dict(zip(RE_FLDS.findall(snippet), RE_VALS.findall(snippet)))
    except Exception:  # pragma: no cover
        return None
def newfilename(metadata, pattern=PATTERN):
    """Return a new file name created from book metadata.

    Returns None when the pattern references an unknown placeholder or
    when there is not enough metadata (no Title or no ISBN-13).
    """
    pattern = pattern if pattern else PATTERN
    # Work on a shallow copy so the caller's dict is NOT mutated when
    # empty values are replaced by the UNKNOWN placeholder (the original
    # wrote the placeholders back into the caller's metadata).
    metadata = dict(metadata)
    for key in metadata:
        if not metadata[key]:
            metadata[key] = u('UNKNOWN')
    d = {
        'authorsFullNames': ','.join(metadata['Authors']),
        'year': metadata['Year'],
        'publisher': metadata['Publisher'],
        'title': metadata['Title'],
        'language': metadata['Language'],
        'isbn': metadata['ISBN-13'],
    }
    if d['title'] == u('UNKNOWN') or d['isbn'] == u('UNKNOWN'):
        LOGGER.critical('Not enough metadata')
        return None
    # Drop subtitle from title (if available)
    d['title'] = cleannewname(d['title']).split(' - ')[0]
    # keep the title reasonably short, cutting on token boundaries
    cutoff = min(len(d['title']), CUTOFF)
    d['title'] = ' '.join(cutoff_tokens(d['title'].split(' '), cutoff))
    authorslastnames = [
        last_first(authorname)['last'] for authorname in metadata['Authors']
    ]
    d['authorsLastNames'] = ','.join(authorslastnames)
    d['firstAuthorLastName'] = authorslastnames[0]
    try:
        formatted = u(pattern).format(**d)
        return cleannewname(formatted)
    except KeyError as e:
        LOGGER.warning('Error with placeholder: %s', e)
        return None
def _mapper(isbn, records):
    """Map a raw service record onto the canonical metadata keys.

    Canonical keys: ISBN-13, Title, Authors, Publisher, Year, Language.
    """
    canonical = {}
    try:
        canonical['ISBN-13'] = u(isbn)
        canonical['Title'] = records.get('title', u('')).replace(' :', ':')
        # authors arrive as a single '|'-separated string
        raw_authors = records.get('author', u(''))
        canonical['Authors'] = [_clean(name) for name in raw_authors.split('|')]
        for target, source in (('Publisher', 'publisher'), ('Year', 'year'),
                               ('Language', 'lang')):
            canonical[target] = records.get(source, u(''))
    except IndexError:  # pragma: no cover
        LOGGER.debug("RecordMappingError for %s with data %s", isbn, records)
        return canonical
    # stdmeta does the final cleaning and validation
    return stdmeta(canonical)
def parser_dnb(data):
    """Parse the response from the DNB service.

    The input data is the result webpage in html from the search.
    Returns a dict with Authors/Publisher/Title/Year/Language (or {}
    when the page yields no usable record).
    """
    data = re.split('<tr>', data)  # split rows in table into lines for loop
    recs = {}
    recs['Authors'] = []
    try:
        for line in data:
            line = line.replace('\n', ' ').replace('\t', '')
            if len(recs) == 5:  # skip the rest of the file if we have all recs
                break
            # Author:
            # <td width="25%" ><strong>Person(en)</strong></td>
            # <td >Bayerl, Linda (Verfasser)<br/>Dengl, Sabine (Illustrator)</td></tr>
            # Sometimes they also contain an href on the name and sometimes
            # they start with <td class='yellow'>
            elif re.search(r"<strong>Person.+</strong>", line):
                authors = re.findall('</td>(.*)</td', line)[0]
                authors = authors.replace('<td >', '')
                authors = re.split('<br/>', authors)  # several authors?
                for auth in authors:
                    if 'href' in auth:  # name contains link
                        auth = re.findall(r'<a href=".*" >(.*)</a>', auth)[0]
                    # Remove job description in brackets after the name:
                    auth = u(re.sub(r'\(.*?\)', '', auth))
                    recs['Authors'].append(auth)
            # Publisher:
            # <strong>Verlag</strong></td><td >Hamburg : Carlsen</td>
            # </tr><tr><td width="25%" class='yellow'><strong>...
            elif re.search(r"<strong>Verlag</strong>", line):
                publisher = re.findall('td .*>(.*)</td', line)[0]
                # get only the publisher's name (after the 'City :' prefix)
                if ':' in publisher:
                    publisher = publisher.split(':')[1].strip()
                recs['Publisher'] = u(publisher)
            # Title:
            # <td width="25%" class='yellow'><strong>Titel</strong>
            # </td><td class='yellow'>Kindergartenblock - Verbinden, vergleichen, Fehler finden ab 4 Jahre / Linda Bayerl</td></tr>
            elif re.search(r"<strong>Titel</strong", line):
                # the part before '/' is the title, after it the author
                title = re.findall('td .*>(.*)/.*</td', line)[0]
                title = u(title.replace('td >', '').replace('</td', ''))
                recs['Title'] = u(title)
            # Publication year:
            # <td width="25%" class='yellow'><strong>Zeitliche Einordnung</strong>
            # </td><td class='yellow'>Erscheinungsdatum: 2015</td></tr>
            elif re.search(r"<strong>Zeitliche Einordnung</strong", line):
                recs['Year'] = u(re.findall(r'\d{4}', line)[0])
            # Language:
            # <tr><td class="yellow" width="25%"> <strong>Sprache(n)</strong>
            # </td> <td class="yellow"> Deutsch (ger) </td></tr>
            # </td> <td class="yellow"> Englisch (eng), Neugriechisch (gre) </td></tr>
            elif re.search(r"<strong>Sprache\(n\)</strong", line):
                # There can be more than one language, so match all possible
                # cases and join the language codes with commas:
                langs = re.findall(r'>* \((.*?)\)', line)  # list of matches
                language = ','.join(langs)
                recs['Language'] = u(language)
            elif line == '':
                continue
    except IndexError:
        LOGGER.debug('Check the parsing for German DNB (possible error!)')
    try:
        # delete almost empty records
        if not recs['Title'] and not recs['Authors']:
            recs = {}
    except KeyError:
        recs = {}
    return recs
def setup_module():
    """Create the scratch files needed by the tests and cd to their folder."""
    scratch = [u(TESTFILE), u('./a-deleteme-PLEASE.pdf')]
    create_files(scratch)
    os.chdir(os.path.dirname(TESTFILE))
    create_files(FISBN + [F11])
""" nose tests """ WINDOWS = os.name == 'nt' TESTFILE = './a-deleteme.pdf' NEW_BASENAME = 'a-deleteme-PLEASE.pdf' F1 = '9780321534965.pdf' F2 = '9781597499644.pdf' F3 = '9781852330729.pdf' F4 = '9787500117018.pdf' F5 = '9789727576807.pdf' F6 = u('Campos2011_Emergências obstétricas_9789727576807.pdf') F7 = u('Knuth2008_The Art Of Computer Programming_9780321534965.pdf') F8 = u('Man2001_Genetic Algorithms Concepts And Designs_9781852330729.pdf') F9 = u("O'Connor2012_Violent Python A Cookbook for Hackers, Forensic Analysts, Penetra_9781597499644.pdf") F10 = u('海明威2007_Lao ren yu hai_9787500117018.pdf') F11 = 'myfile.pdf' FISBN = [F1, F2, F3, F4, F5] FFT = [F6, F7, F8, F9, F10] FILES = FISBN + FFT + [F11] PATT0 = "{firstAuthorLastName}{year}_{title}_{isbn}" PATT1 = "{year}_{title}_{isbn}" PATT2 = "{isbn}"
def setup_module():
    """Prepare the on-disk fixtures and switch to their directory."""
    create_files([u(TESTFILE), u('./a-deleteme-PLEASE.pdf')])
    os.chdir(os.path.dirname(TESTFILE))
    files_needed = FISBN + [F11]
    create_files(files_needed)
""" nose tests """ WINDOWS = os.name == 'nt' TESTFILE = './a-deleteme.pdf' NEW_BASENAME = 'a-deleteme-PLEASE.pdf' F1 = '9780321534965.pdf' F2 = '9781597499644.pdf' F3 = '9781852330729.pdf' F4 = '9787500117018.pdf' F5 = '9789727576807.pdf' F6 = u('Campos2011_Emergências obstétricas_9789727576807.pdf') F7 = u('Knuth2008_The Art Of Computer Programming_9780321534965.pdf') F8 = u('Man2001_Genetic Algorithms Concepts And Designs_9781852330729.pdf') F9 = u( "O'Connor2012_Violent Python A Cookbook for Hackers, Forensic Analysts, Penetra_9781597499644.pdf" ) F10 = u('海明威2007_Lao ren yu hai_9787500117018.pdf') F11 = 'myfile.pdf' FISBN = [F1, F2, F3, F4, F5] FFT = [F6, F7, F8, F9, F10] FILES = FISBN + FFT + [F11] PATT0 = "{firstAuthorLastName}{year}_{title}_{isbn}" PATT1 = "{year}_{title}_{isbn}"
def parser_sbn(data):
    """Parse the response from the SBN service.

    The input data is the result webpage in html from the search.
    We request the Unimarc record, which contains html entities (accents
    such as &#242;). We need to use the above dictionary to convert the
    html entity to an iso-8859-1 character. The Unimarc entry tends to be
    more complete than the MARC21 result in the tests we ran on SBN,
    that is why we chose it. The document link below gives the Unimarc
    architecture:
    https://archive.ifla.org/VI/8/unimarc-concise-bibliographic-format-2008.pdf
    """
    recs = {}
    recs['Authors'] = []
    try:
        data = data.replace('\n', ' ').replace('\t', '')
        # keep only the Unimarc record body, between LEADER and </ul
        data = re.findall('<li>LEADER(.*)</ul', data)[0]
        data = re.split('<li>', data)  # split into lines for loop
        for line in data:
            # Convert html entities (like accents) to iso-8859-1:
            for isoent, htmlent in DICT_ISO8859_TO_HTML.items():
                line = line.replace(htmlent, isoent)
            # Author (Unimarc 70X fields):
            # <li>700 1$aDi Matteo$b, Nino$3IT\ICCU\CAGV\748340</li>
            # <li>701 1$aLodato$b, Saverio$3IT\ICCU\CFIV\025147</li>
            if (re.search(r"^70", line) and len(recs['Authors']) == 0):
                # TODO: remove the len()==0, and deal with duplicate entries in 461
                # do a lazy match from $a until the first $ sign:
                surname = re.findall(r'\$a(.+?)\$', line)[0]
                name = re.findall(r'\$b(.+?)\$', line)[0]
                author = u(surname + name)
                recs['Authors'].append(author)
            # Publisher and Publication year (Unimarc 210):
            # <li>210 $aMilano$cChiarelettere$d2018</li>
            elif re.search(r"^210", line):
                publisher = re.findall(r'\$c(.+?)\$', line)[0]
                recs['Publisher'] = u(publisher)
                # sometimes there is a space between $d and the year:
                year = re.findall(r'\$d.*(\d{4})', line)[0]
                recs['Year'] = u(year)
            # Title (Unimarc 200):
            # 200 1 $aGiuro che non avrò piu fame$el'Italia della ricostruzione$fAldo Cazzullo
            # $a is the main title, $e is a subtitle and $f is author
            elif re.search(r"^200", line):
                title = re.findall(r'\$a(.*)\$f', line)[0]
                recs['Title'] = u(cleanup_title(title))
            # When the book is part of a bigger opus, the main title appears in
            # 461, not in 200
            # 461 1$1001IT\ICCU\UBO\0079398$12001 $aIstituzioni di diritto romano$fEnzo Nardi$v1$
            elif re.search(r"^461", line):
                mtitle = re.findall(r'\$a(.+?)\$f', line)[0]
                # prepend the opus title to the title found in 200
                recs['Title'] = u(cleanup_title(mtitle) + '. ' + recs['Title'])
                # Sometimes there is no author in 70X, but in 461:
                # 461 1$1001IT\ICCU\CFI\0053061$12001 $aCommedia$fDante Alighieri$ga cura di Emilio Pasquini e Antonio Quaglio$v1$1700 1$aAlighieri$b, Dante$3IT\ICCU\CFIV\008732$4070$1702 1$aPasquini$b, Emilio$f <1935- >$3IT\ICCU\CFIV\011735$1702 1$aQuaglio$b, Antonio Enzo$3IT\ICCU\CFIV\033998
                if (len(recs['Authors']) == 0 and re.search(r"700 1\$a", line)):
                    surname = re.findall(r'1\$a(.+?)\$b', line)
                    name = re.findall(r'\$b(.+?)\$', line)
                    for s, n in zip(surname, name):
                        recs['Authors'].append(u(s + n))
            # Language (Unimarc 101):
            # <li>101 $aita</li>
            # Sometimes there are two main languages: 101 $alat$aita
            elif re.search(r"^101", line):
                langs = re.findall(r'\$a\D\D\D', line)
                lang = ''
                # join all found language codes with commas
                for l in langs:
                    lang = l if l == langs[0] else lang + ',' + l
                lang = lang.replace('$a', '')
                recs['Language'] = u(lang)
            elif line == '':
                continue
    except IndexError:
        LOGGER.debug('Check the parsing for Italian SBN (possible error!)')
    try:
        # delete almost empty records
        if not recs['Title'] and not recs['Authors']:
            recs = {}
    except KeyError:
        recs = {}
    return recs
""" nose tests """ WINDOWS = os.name == 'nt' TESTFILE = './a-deleteme.pdf' NEW_BASENAME = 'a-deleteme-PLEASE.pdf' F1 = '9780321534965.pdf' F2 = '9781597499644.pdf' #F3 = '9781852330729.pdf' #F4 = '9787500117018.pdf' F5 = '9789727576807.pdf' F6 = u('Campos2011_Emergências obstétricas_9789727576807.pdf') F7 = u('Knuth2008_The Art Of Computer Programming_9780321534965.pdf') #F8 = u('Man2001_Genetic Algorithms Concepts And Designs_9781852330729.pdf') F9 = u( "O'Connor2012_Violent Python A Cookbook for Hackers, Forensic Analysts, Penetra_9781597499644.pdf" ) #F10 = u('海明威2007_Lao ren yu hai_9787500117018.pdf') F11 = 'myfile.pdf' #FISBN = [F1, F2, F3, F4, F5] FISBN = [F1, F2, F5] #FFT = [F6, F7, F8, F9, F10] FFT = [F6, F7, F9] FILES = FISBN + FFT + [F11]
    renfile,
)
from nose.tools import assert_equals

# some filename characters behave differently on Windows
WINDOWS = os.name == 'nt'
TESTFILE = './a-deleteme.pdf'
NEW_BASENAME = 'a-deleteme-PLEASE.pdf'
# fixture files named by bare ISBN (commented-out entries are disabled)
F1 = '9780321534965.pdf'
F2 = '9780743258074.pdf'
# F3 = '9781852330729.pdf'
# F4 = '9787500117018.pdf'
F5 = '9789727576807.pdf'
# fixture files whose names already follow the AuthorYear_Title_ISBN pattern
F6 = u('Campos2011_Emergências obstétricas_9789727576807.pdf')
F7 = u('Knuth2008_The Art Of Computer Programming_9780321534965.pdf')
# F8 = u('Man2001_Genetic Algorithms Concepts And Designs_9781852330729.pdf')
F9 = u('Isaacson2004_Benjamin Franklin_9780743258074.pdf')
# F10 = u('海明威2007_Lao ren yu hai_9787500117018.pdf')
# a file with no ISBN in its name
F11 = 'myfile.pdf'
# FISBN = [F1, F2, F3, F4, F5]
# FISBN = [F1, F2, F5]
FISBN = [F1, F2]
# FFT = [F6, F7, F8, F9, F10]
# FFT = [F6, F7, F9]
FFT = [F7, F9]
FILES = FISBN + FFT + [F11]