def query_service(self, index, value, results):
    """
    A generalised query for ISBNdb.

    :Parameters:
        index : string
            The index to search in ISBNdb.
        value : string
            The value to search for in the index. When the index is
            ``'isbn'`` the value is normalised with
            ``utils.normalize_isbn`` before submission.
        results : iterable
            The names of the data sections to include in the response.
            A single name may also be given as a plain string. Empty or
            ``None`` requests no extra sections.

    :Returns:
        The response received from the service, i.e. whatever
        ``self.request`` returns for the assembled query string.

    This serves as a general way of accessing all the methods available
    for ISBNdb. Note that the combination of parameters is not validated,
    so it is possible to form a bad query.
    """
    ## Preconditions & preparation:
    if index == 'isbn':
        value = utils.normalize_isbn(value)

    ## Main:
    sub_url = 'index1=%(indx)s&value1=%(val)s' % {
        'indx': index,
        'val': value,
    }
    if results:
        # A bare string is itself iterable; joining it directly would
        # insert a comma between every character ('texts' -> 't,e,x,t,s').
        if isinstance(results, str):
            results = [results]
        sub_url += '&results=' + ','.join(results)
    return self.request(sub_url)
def query_service(self, isbn, method, format, fields=('*',)):
    """
    A generalised query for xISBN.

    :Parameters:
        isbn : string
            An ISBN-10 or -13. It is passed through
            ``utils.normalize_isbn`` before being placed in the URL.
        method : string
            The request type to make of xISBN.
        format : string
            The form for the response. Must be a member of ``FORMATS``.
        fields : iterable
            The names of the fields to include in the response. A single
            name may also be given as a plain string. Defaults to all
            fields (``'*'``).

    :Returns:
        The response received from the service, i.e. whatever
        ``self.request`` returns for the assembled query string.

    This serves as a general way of accessing all the methods available
    for xISBN. It also normalises the ISBN to a suitable form for
    submission.
    """
    ## Preconditions & preparation:
    # NOTE: kept as an assert so callers still see AssertionError; be aware
    # that this check disappears when Python runs with -O.
    assert format in FORMATS, \
        "unrecognised format '%s', must be one of %s" % (format, FORMATS)
    # A bare string is itself iterable; joining it directly would insert a
    # comma between every character.
    if isinstance(fields, str):
        fields = (fields,)

    ## Main:
    sub_url = "%(isbn)s?method=%(mthd)s&format=%(fmt)s&fl=%(flds)s" % {
        'mthd': method,
        'fmt': format,
        'isbn': utils.normalize_isbn(isbn),
        'flds': ','.join(fields),
    }
    return self.request(sub_url)
def info(epub):
    """Collect metadata for an EPUB file.

    The first archive member whose name ends in ``opf`` is parsed and the
    children of its OPF ``<metadata>`` element are copied into the result,
    keyed by their namespace-stripped tag name (``creator`` is stored as
    ``author``). Elements whose text is empty, ``'unknown'`` or ``'none'``
    are skipped.

    Args:
        epub: Path to the EPUB file (anything ``zipfile.ZipFile`` accepts).

    Returns:
        dict: The collected metadata. Notable keys:

        * ``isbn`` - single-item list, taken from a valid ``identifier``
          element or, failing that, from ``extract_isbn`` on the book text.
        * ``author`` - list obtained by splitting the creator on ``', '``.
        * ``description`` - passed through ``decode_html`` and
          ``strip_tags``.
        * ``date`` - truncated at the ``'T'`` of a timestamp.
        * ``language`` - string values are mapped through ``get_language``.
        * ``textsize`` - length of the text returned by ``extract_text``.

        An empty dict is returned when the file is not a valid zip archive.
    """
    data = {}
    try:
        archive = zipfile.ZipFile(epub)
    except zipfile.BadZipFile:
        logger.debug('invalid epub file %s', epub)
        return data

    metadata = None
    # The archive is only needed to read the package document; the context
    # manager makes sure the file handle is released afterwards.
    with archive:
        opf = [f.filename for f in archive.filelist if f.filename.endswith('opf')]
        if opf:
            package = ET.fromstring(archive.read(opf[0]))
            metadata = package.find('{http://www.idpf.org/2007/opf}metadata')

    if metadata is not None:
        # Iterate the element directly: Element.getchildren() was removed
        # in Python 3.9.
        for e in metadata:
            if not (e.text and e.text.strip() and e.text not in ('unknown', 'none')):
                continue
            key = e.tag.split('}')[-1]
            key = {
                'creator': 'author',
            }.get(key, key)
            value = e.text.strip()
            if key == 'identifier':
                # Identifiers are only kept when they are valid ISBNs.
                value = normalize_isbn(value)
                if stdnum.isbn.is_valid(value):
                    data['isbn'] = [value]
            elif key == 'author':
                data[key] = value.split(', ')
            else:
                data[key] = value

    if 'description' in data:
        data['description'] = strip_tags(decode_html(data['description']))
    text = extract_text(epub)
    data['textsize'] = len(text)
    if 'isbn' not in data:
        # Fall back to looking for an ISBN in the book text itself.
        isbn = extract_isbn(text)
        if isbn:
            data['isbn'] = [isbn]
    if 'date' in data and 'T' in data['date']:
        # Keep only the date part of an ISO timestamp.
        data['date'] = data['date'].split('T')[0]
    if 'language' in data and isinstance(data['language'], str):
        data['language'] = get_language(data['language'])
    return data
def getMetadata(data):
    '''
    Look up metadata for a single identifier.

    takes {
        key: value
        includeEdits: boolean
    }
    key can be one of the supported identifiers: isbn, oclc, olid,...

    Returns the result of meta.lookup(key, value). When that result is
    truthy it additionally gets a 'primaryid' entry holding [key, value].
    ISBN values are normalised with utils.normalize_isbn before the lookup.
    An empty dict is returned when no identifier was supplied.
    '''
    logger.debug('getMetadata %s', data)
    # Work on a copy so the caller's dict is left untouched.
    query = dict(data)
    # includeEdits is accepted but not acted on here; drop it so it cannot
    # be mistaken for the identifier below.
    query.pop('includeEdits', None)
    if not query:
        # Nothing to look up; avoid leaking a bare StopIteration.
        return {}
    # Only the first remaining key/value pair is used as the identifier.
    key, value = next(iter(query.items()))
    if key == 'isbn':
        value = utils.normalize_isbn(value)
    logger.debug('getMetadata key=%s value=%s', key, value)
    response = meta.lookup(key, value)
    if response:
        response['primaryid'] = [key, value]
    return response
def info(opf):
    """Extract book metadata from a standalone OPF package file.

    Args:
        opf: Path to the ``.opf`` file.

    Returns:
        dict: Metadata keyed by the namespace-stripped tag name of each
        child of the OPF ``<metadata>`` element that has text (``creator``
        is stored as ``author``); values are passed through ``strip_tags``.
        ``identifier`` elements are not stored verbatim: valid ISBNs are
        collected in the ``isbn`` list and identifiers whose OPF ``scheme``
        attribute is ``AMAZON`` in the ``asin`` list. ``date`` is cut to
        its first 10 characters and ``language`` is mapped through
        ``get_language``. An empty dict is returned when the file cannot
        be read or parsed, or when it has no ``<metadata>`` element.
    """
    data = {}
    try:
        with open(opf, 'rb') as fd:
            package = ET.fromstring(fd.read().decode())
    except Exception:
        # Best effort: an unreadable or malformed file yields no metadata.
        logger.debug('failed to load opf %s', opf, exc_info=1)
        return data

    ns = '{http://www.idpf.org/2007/opf}'
    metadata = package.find(ns + 'metadata')
    if metadata is None:
        return data

    # Iterate the element directly: Element.getchildren() was removed in
    # Python 3.9.
    for e in metadata:
        if not e.text:
            continue
        key = e.tag.split('}')[-1]
        key = {
            'creator': 'author',
        }.get(key, key)
        value = e.text
        if key == 'identifier':
            isbn = normalize_isbn(value)
            if stdnum.isbn.is_valid(isbn):
                data.setdefault('isbn', []).append(isbn)
            if e.attrib.get(ns + 'scheme') == 'AMAZON':
                data.setdefault('asin', []).append(value)
        else:
            data[key] = strip_tags(value)

    # Keep only the YYYY-MM-DD part of longer timestamps.
    if 'date' in data and len(data['date']) > 10:
        data['date'] = data['date'][:10]
    if 'language' in data:
        data['language'] = get_language(data['language'])
    return data
def info(opf):
    """Extract book metadata from a standalone OPF package file.

    Args:
        opf: Path to the ``.opf`` file.

    Returns:
        dict: Metadata keyed by the namespace-stripped tag name of each
        child of the OPF ``<metadata>`` element that has text (``creator``
        is stored as ``author``); values are passed through ``strip_tags``.
        ``identifier`` elements are not stored verbatim: valid ISBNs are
        collected in the ``isbn`` list and identifiers whose OPF ``scheme``
        attribute is ``AMAZON`` in the ``asin`` list. ``date`` is cut to
        its first 10 characters and ``language`` is mapped through
        ``get_language``. An empty dict is returned when the file cannot
        be read or parsed, or when it has no ``<metadata>`` element.
    """
    data = {}
    try:
        with open(opf, 'rb') as fd:
            package = ET.fromstring(fd.read().decode())
    except Exception:
        # Best effort: an unreadable or malformed file yields no metadata.
        logger.debug('failed to load opf %s', opf, exc_info=1)
        return data

    ns = '{http://www.idpf.org/2007/opf}'
    metadata = package.find(ns + 'metadata')
    if metadata is None:
        return data

    # Iterate the element directly: Element.getchildren() was removed in
    # Python 3.9.
    for e in metadata:
        if not e.text:
            continue
        key = e.tag.split('}')[-1]
        key = {
            'creator': 'author',
        }.get(key, key)
        value = e.text
        if key == 'identifier':
            isbn = normalize_isbn(value)
            if stdnum.isbn.is_valid(isbn):
                data.setdefault('isbn', []).append(isbn)
            if e.attrib.get(ns + 'scheme') == 'AMAZON':
                data.setdefault('asin', []).append(value)
        else:
            data[key] = strip_tags(value)

    # Keep only the YYYY-MM-DD part of longer timestamps.
    if 'date' in data and len(data['date']) > 10:
        data['date'] = data['date'][:10]
    if 'language' in data:
        data['language'] = get_language(data['language'])
    return data
def info(pdf):
    """Collect metadata for a PDF file.

    Reads the page count, the document information dictionary and the
    ``dc_*`` attributes of the XMP metadata through ``PdfFileReader``, then
    normalises the result. Parsing is best effort: if the reader fails,
    the failure is logged and whatever was gathered so far is kept.

    Args:
        pdf: Path to the PDF file.

    Returns:
        dict: The collected metadata. Document-info keys are stored
        lower-cased without their first character; XMP ``dc_*`` values are
        stored under the name after the ``dc_`` prefix and never overwrite
        a document-info value. Notable keys:

        * ``pages`` - page count reported by the reader.
        * ``isbn`` - list; taken from a valid ``identifier`` or, when
          ``settings.server['extract_text']`` is set, from ``extract_isbn``
          on the document text.
        * ``textsize`` - length of the text returned by ``extract_text``.
        * ``date`` - 8-digit values are rewritten as ``YYYY-MM-DD``.
        * ``author`` - string values are split on ``', '`` into a list.
        * ``language`` - mapped through ``get_language``.

        Entries whose value is ``'Unknown'`` are dropped.
    """
    data = {}
    with open(pdf, 'rb') as fd:
        try:
            pdfreader = PdfFileReader(fd)
            data['pages'] = pdfreader.numPages
            if pdfreader.getIsEncrypted():
                # Try the empty password.
                pdfreader.decrypt('')

            docinfo = pdfreader.getDocumentInfo()
            if docinfo:
                for key in docinfo:
                    if not docinfo[key]:
                        continue
                    try:
                        value = docinfo[key]
                        if len(value) == 1:
                            value = value[0]
                        if isinstance(value, bytes):
                            value = value.decode('utf-16')
                        # Drop the first character of the key and lower-case it.
                        data[key[1:].lower()] = value
                    except Exception:
                        # A single unreadable field must not discard the rest.
                        logger.debug('failed to read info field %s of %s', key, pdf, exc_info=1)

            xmp = pdfreader.getXmpMetadata()
            if xmp:
                for key in dir(xmp):
                    if not key.startswith('dc_'):
                        continue
                    value = getattr(xmp, key)
                    if isinstance(value, dict) and 'x-default' in value:
                        value = value['x-default']
                    elif isinstance(value, list):
                        value = [v.strip() if isinstance(v, str) else v for v in value if v]
                        value = [v.strftime('%Y-%m-%d') if isinstance(v, datetime) else v for v in value]
                        if len(value) == 1:
                            value = value[0]
                    _key = key[3:]
                    # Document-info values take precedence over XMP values.
                    if value and _key not in data:
                        data[_key] = value
        except Exception:
            logger.debug('FAILED TO PARSE %s', pdf, exc_info=1)

    if 'identifier' in data:
        value = normalize_isbn(data['identifier'])
        if stdnum.isbn.is_valid(value):
            data['isbn'] = [value]
            del data['identifier']

    # Flatten remaining dict values into a single string. Only existing
    # keys are reassigned, so iterating the dict here is safe.
    for key, value in data.items():
        if isinstance(value, dict):
            data[key] = ' '.join(value.values()).strip()

    for key in list(data):
        if data[key] in ('Unknown',):
            del data[key]
        elif key == 'language':
            # elif: a language of 'Unknown' has just been removed and must
            # not be looked up again.
            data[key] = get_language(data[key])

    text = extract_text(pdf)
    data['textsize'] = len(text)
    if settings.server['extract_text']:
        if 'isbn' not in data:
            isbn = extract_isbn(text)
            if isbn:
                data['isbn'] = [isbn]
    if 'isbn' in data and isinstance(data['isbn'], str):
        data['isbn'] = [data['isbn']]

    d = data.get('date')
    # Only plain 8-digit strings are reformatted; lists or other value
    # types are left alone.
    if isinstance(d, str) and len(d) == 8 and d.isdigit():
        data['date'] = '%s-%s-%s' % (d[:4], d[4:6], d[6:])
    if 'author' in data and isinstance(data['author'], str):
        data['author'] = data['author'].split(', ')
    return data
def info(pdf):
    """Collect metadata for a PDF file.

    Reads the page count, the document information dictionary and the
    ``dc_*`` attributes of the XMP metadata through ``PdfFileReader``, then
    normalises the result. Parsing is best effort: if the reader fails,
    the failure is logged and whatever was gathered so far is kept.

    Args:
        pdf: Path to the PDF file.

    Returns:
        dict: The collected metadata. Document-info keys are stored
        lower-cased without their first character; XMP ``dc_*`` values are
        stored under the name after the ``dc_`` prefix and never overwrite
        a document-info value. Notable keys:

        * ``pages`` - page count reported by the reader.
        * ``isbn`` - list; taken from a valid ``identifier`` or, when
          ``settings.server['extract_text']`` is set, from ``extract_isbn``
          on the document text.
        * ``textsize`` - length of the text returned by ``extract_text``.
        * ``date`` - 8-digit values are rewritten as ``YYYY-MM-DD``.
        * ``author`` - string values are split on ``', '`` into a list.
        * ``language`` - mapped through ``get_language``.

        Entries whose value is ``'Unknown'`` are dropped.
    """
    data = {}
    with open(pdf, 'rb') as fd:
        try:
            pdfreader = PdfFileReader(fd)
            data['pages'] = pdfreader.numPages
            if pdfreader.getIsEncrypted():
                # Try the empty password.
                pdfreader.decrypt('')

            docinfo = pdfreader.getDocumentInfo()
            if docinfo:
                for key in docinfo:
                    if not docinfo[key]:
                        continue
                    try:
                        value = docinfo[key]
                        if len(value) == 1:
                            value = value[0]
                        if isinstance(value, bytes):
                            value = value.decode('utf-16')
                        # Drop the first character of the key and lower-case it.
                        data[key[1:].lower()] = value
                    except Exception:
                        # A single unreadable field must not discard the rest.
                        logger.debug('failed to read info field %s of %s', key, pdf, exc_info=1)

            xmp = pdfreader.getXmpMetadata()
            if xmp:
                for key in dir(xmp):
                    if not key.startswith('dc_'):
                        continue
                    value = getattr(xmp, key)
                    if isinstance(value, dict) and 'x-default' in value:
                        value = value['x-default']
                    elif isinstance(value, list):
                        value = [v.strip() if isinstance(v, str) else v for v in value if v]
                        value = [v.strftime('%Y-%m-%d') if isinstance(v, datetime) else v for v in value]
                        if len(value) == 1:
                            value = value[0]
                    _key = key[3:]
                    # Document-info values take precedence over XMP values.
                    if value and _key not in data:
                        data[_key] = value
        except Exception:
            logger.debug('FAILED TO PARSE %s', pdf, exc_info=1)

    if 'identifier' in data:
        value = normalize_isbn(data['identifier'])
        if stdnum.isbn.is_valid(value):
            data['isbn'] = [value]
            del data['identifier']

    # Flatten remaining dict values into a single string. Only existing
    # keys are reassigned, so iterating the dict here is safe.
    for key, value in data.items():
        if isinstance(value, dict):
            data[key] = ' '.join(value.values()).strip()

    for key in list(data):
        if data[key] in ('Unknown',):
            del data[key]
        elif key == 'language':
            # elif: a language of 'Unknown' has just been removed and must
            # not be looked up again.
            data[key] = get_language(data[key])

    text = extract_text(pdf)
    data['textsize'] = len(text)
    if settings.server['extract_text']:
        if 'isbn' not in data:
            isbn = extract_isbn(text)
            if isbn:
                data['isbn'] = [isbn]
    if 'isbn' in data and isinstance(data['isbn'], str):
        data['isbn'] = [data['isbn']]

    d = data.get('date')
    # Only plain 8-digit strings are reformatted; lists or other value
    # types are left alone.
    if isinstance(d, str) and len(d) == 8 and d.isdigit():
        data['date'] = '%s-%s-%s' % (d[:4], d[4:6], d[6:])
    if 'author' in data and isinstance(data['author'], str):
        data['author'] = data['author'].split(', ')
    return data