Beispiel #1
0
	def query_service (self, index, value, results):
		"""
		A generalised query for ISBNdb.
		
		:Parameters:
			index : string
				The index to search in ISBNdb.
			value : string
				The value to search for in the index..
			results : iterable
				A list of the data to include in the response.
		
		:Returns:
			The response received from the service.
		
		This serves a general way of accessing all the methods available for
		ISBNdb. It also normalises the ISBN to a suitable form for submission.
		Note that it is probably possible to form a bad query with the wrong
		combination of parameters.
		
		"""
		## Preconditions & preparation:
		if (index == 'isbn'):
			value = utils.normalize_isbn (value)
		## Main:
		sub_url = 'index1=%(indx)s&value1=%(val)s' % {
			'indx': index,
			'val': value,
		}
		if (results):
			res_str = ','.join (list(results))
			sub_url += '&results=' + res_str
		return self.request (sub_url)
Beispiel #2
0
	def query_service (self, isbn, method, format, fields=['*']):
		"""
		A generalised query for xISBN.
		
		:Parameters:
			isbn : string
				A normalised ISBN-10 or -13.
			method : string
				The request type to make of xISBN.
			format : string
				The form for the response.
			fields : iterable
				A list of the fields to include in the response.
		
		:Returns:
			The response received from the service.
		
		This serves a general way of accessing all the methods available for
		xISBN. It also normalises the ISBn to a suitable form for submission.
		
		"""
		## Preconditions & preparation:
		assert (format in FORMATS), \
			"unrecognised format '%s', must be one of %s" % (format, FORMATS)
		## Main:
		sub_url = "%(isbn)s?method=%(mthd)s&format=%(fmt)s&fl=%(flds)s" % {
			'mthd': method,
			'fmt': format,
			'isbn': utils.normalize_isbn (isbn),
			'flds': ','.join (fields),
		}
		return self.request (sub_url)
Beispiel #3
0
def info(epub):
    data = {}
    try:
        z = zipfile.ZipFile(epub)
    except zipfile.BadZipFile:
        logger.debug('invalid epub file %s', epub)
        return data
    opf = [f.filename for f in z.filelist if f.filename.endswith('opf')]
    if opf:
        info = ET.fromstring(z.read(opf[0]))
        metadata = info.findall('{http://www.idpf.org/2007/opf}metadata')
        if metadata:
            metadata = metadata[0]
            for e in metadata.getchildren():
                if e.text and e.text.strip() and e.text not in ('unknown', 'none'):
                    key = e.tag.split('}')[-1]
                    key = {
                        'creator': 'author',
                    }.get(key, key)
                    value = e.text.strip()
                    if key == 'identifier':
                        value = normalize_isbn(value)
                        if stdnum.isbn.is_valid(value):
                            data['isbn'] = [value]
                    elif key == 'author':
                        data[key] = value.split(', ')
                    else:
                        data[key] = value
    if 'description' in data:
        data['description'] = strip_tags(decode_html(data['description']))
    text = extract_text(epub)
    data['textsize'] = len(text)
    if not 'isbn' in data:
        isbn = extract_isbn(text)
        if isbn:
            data['isbn'] = [isbn]
    if 'date' in data and 'T' in data['date']:
        data['date'] = data['date'].split('T')[0]
    if 'language' in data and isinstance(data['language'], str):
        data['language'] = get_language(data['language'])
    return data
Beispiel #4
0
def getMetadata(data):
    '''
        takes {
            key: value
            includeEdits: boolean
        }
        key can be one of the supported identifiers: isbn, oclc, olid,...
    '''
    logger.debug('getMetadata %s', data)
    if 'includeEdits' in data:
        include_edits = data.pop('includeEdits')
    else:
        include_edits = False
    key, value = next(iter(data.items()))
    if key == 'isbn':
        value = utils.normalize_isbn(value)
    logger.debug('getMetadata key=%s value=%s', key, value)
    response = meta.lookup(key, value)
    if response:
        response['primaryid'] = [key, value]
    return response
def getMetadata(data):
    '''
        takes {
            key: value
            includeEdits: boolean
        }
        key can be one of the supported identifiers: isbn, oclc, olid,...
    '''
    logger.debug('getMetadata %s', data)
    if 'includeEdits' in data:
        include_edits = data.pop('includeEdits')
    else:
        include_edits = False
    key, value = next(iter(data.items()))
    if key == 'isbn':
        value = utils.normalize_isbn(value)
    logger.debug('getMetadata key=%s value=%s', key, value)
    response = meta.lookup(key, value)
    if response:
        response['primaryid'] = [key, value]
    return response
Beispiel #6
0
def info(opf):
    data = {}
    try:
        with open(opf, 'rb') as fd:
            opf = ET.fromstring(fd.read().decode())
    except:
        logger.debug('failed to load opf %s', opf, exc_info=1)
        return data
    ns = '{http://www.idpf.org/2007/opf}'
    metadata = opf.findall(ns + 'metadata')[0]
    for e in metadata.getchildren():
        if e.text:
            key = e.tag.split('}')[-1]
            key = {
                'creator': 'author',
            }.get(key, key)
            value = e.text
            if key == 'identifier':
                isbn = normalize_isbn(value)
                if stdnum.isbn.is_valid(isbn):
                    if not 'isbn' in data:
                        data['isbn'] = [isbn]
                    else:
                        data['isbn'].append(isbn)
                if e.attrib.get(ns + 'scheme') == 'AMAZON':
                    if not 'asin' in data:
                        data['asin'] = [value]
                    else:
                        data['asin'].append(value)
            else:
                data[key] = strip_tags(e.text)
    #YYY-MM-DD
    if 'date' in data and len(data['date']) > 10:
        data['date'] =data['date'][:10]
    if 'language' in data:
        data['language'] = get_language(data['language'])
    return data
Beispiel #7
0
def info(opf):
    data = {}
    try:
        with open(opf, 'rb') as fd:
            opf = ET.fromstring(fd.read().decode())
    except:
        logger.debug('failed to load opf %s', opf, exc_info=1)
        return data
    ns = '{http://www.idpf.org/2007/opf}'
    metadata = opf.findall(ns + 'metadata')[0]
    for e in metadata.getchildren():
        if e.text:
            key = e.tag.split('}')[-1]
            key = {
                'creator': 'author',
            }.get(key, key)
            value = e.text
            if key == 'identifier':
                isbn = normalize_isbn(value)
                if stdnum.isbn.is_valid(isbn):
                    if not 'isbn' in data:
                        data['isbn'] = [isbn]
                    else:
                        data['isbn'].append(isbn)
                if e.attrib.get(ns + 'scheme') == 'AMAZON':
                    if not 'asin' in data:
                        data['asin'] = [value]
                    else:
                        data['asin'].append(value)
            else:
                data[key] = strip_tags(e.text)
    #YYY-MM-DD
    if 'date' in data and len(data['date']) > 10:
        data['date'] = data['date'][:10]
    if 'language' in data:
        data['language'] = get_language(data['language'])
    return data
Beispiel #8
0
def info(pdf):
    data = {}
    with open(pdf, 'rb') as fd:
        try:
            pdfreader = PdfFileReader(fd)
            data['pages'] = pdfreader.numPages
            if pdfreader.getIsEncrypted():
                pdfreader.decrypt('')
            info = pdfreader.getDocumentInfo()
            if info:
                for key in info:
                    if info[key]:
                        try:
                            value = info[key]
                            if len(value) == 1:
                                value = value[0]
                            if isinstance(value, bytes):
                                value = value.decode('utf-16')
                            data[key[1:].lower()] = value
                        except:
                            pass

            xmp = pdfreader.getXmpMetadata()
            if xmp:
                for key in dir(xmp):
                    if key.startswith('dc_'):
                        value = getattr(xmp, key)
                        if isinstance(value, dict) and 'x-default' in value:
                            value = value['x-default']
                        elif isinstance(value, list):
                            value = [
                                v.strip() if isinstance(v, str) else v
                                for v in value if v
                            ]
                            value = [
                                v.strftime('%Y-%m-%d') if isinstance(
                                    v, datetime) else v for v in value
                            ]
                            if len(value) == 1:
                                value = value[0]
                        _key = key[3:]
                        if value and _key not in data:
                            data[_key] = value
        except:
            logger.debug('FAILED TO PARSE %s', pdf, exc_info=1)
    '''
    cmd = ['pdfinfo', pdf]
    p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, close_fds=True)
    stdout, stderr = p.communicate()
    for line in stdout.strip().split('\n'):
        parts = line.split(':')
        key = parts[0].lower().strip()
        if key:
            data[key] = ':'.join(parts[1:]).strip()
    for key in data.keys():
        if not data[key]:
            del data[key]
    '''
    if 'identifier' in data:
        value = normalize_isbn(data['identifier'])
        if stdnum.isbn.is_valid(value):
            data['isbn'] = [value]
            del data['identifier']
    for key, value in data.items():
        if isinstance(value, dict):
            value = ' '.join(list(value.values()))
            data[key] = value.strip()
    for key in list(data):
        if data[key] in ('Unknown', ):
            del data[key]
        if key == 'language':
            data[key] = get_language(data[key])
    text = extract_text(pdf)
    data['textsize'] = len(text)
    if settings.server['extract_text']:
        if not 'isbn' in data:
            isbn = extract_isbn(text)
            if isbn:
                data['isbn'] = [isbn]
    if 'isbn' in data and isinstance(data['isbn'], str):
        data['isbn'] = [data['isbn']]
    if 'date' in data and len(data['date']) == 8 and data['date'].isdigit():
        d = data['date']
        data['date'] = '%s-%s-%s' % (d[:4], d[4:6], d[6:])
    if 'author' in data and isinstance(data['author'], str):
        data['author'] = data['author'].split(', ')
    return data
Beispiel #9
0
def info(pdf):
    data = {}
    with open(pdf, 'rb') as fd:
        try:
            pdfreader = PdfFileReader(fd)
            data['pages'] = pdfreader.numPages
            if pdfreader.getIsEncrypted():
                pdfreader.decrypt('')
            info = pdfreader.getDocumentInfo()
            if info:
                for key in info:
                    if info[key]:
                        try:
                            value = info[key]
                            if len(value) == 1:
                                value = value[0]
                            if isinstance(value, bytes):
                                value = value.decode('utf-16')
                            data[key[1:].lower()] = value
                        except:
                            pass

            xmp = pdfreader.getXmpMetadata()
            if xmp:
                for key in dir(xmp):
                    if key.startswith('dc_'):
                        value = getattr(xmp, key)
                        if isinstance(value, dict) and 'x-default' in value:
                            value = value['x-default']
                        elif isinstance(value, list):
                            value = [v.strip() if isinstance(v, str) else v for v in value if v]
                            value = [v.strftime('%Y-%m-%d') if isinstance(v, datetime) else v for v in value]
                            if len(value) == 1:
                                value = value[0]
                        _key = key[3:]
                        if value and _key not in data:
                            data[_key] = value
        except:
            logger.debug('FAILED TO PARSE %s', pdf, exc_info=1)

    '''
    cmd = ['pdfinfo', pdf]
    p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, close_fds=True)
    stdout, stderr = p.communicate()
    for line in stdout.strip().split('\n'):
        parts = line.split(':')
        key = parts[0].lower().strip()
        if key:
            data[key] = ':'.join(parts[1:]).strip()
    for key in data.keys():
        if not data[key]:
            del data[key]
    '''
    if 'identifier' in data:
        value = normalize_isbn(data['identifier'])
        if stdnum.isbn.is_valid(value):
            data['isbn'] = [value]
            del data['identifier']
    for key, value in data.items():
        if isinstance(value, dict):
            value = ' '.join(list(value.values()))
            data[key] = value.strip()
    for key in list(data):
        if data[key] in ('Unknown',):
            del data[key]
        if key == 'language':
            data[key] = get_language(data[key])
    text = extract_text(pdf)
    data['textsize'] = len(text)
    if settings.server['extract_text']:
        if not 'isbn' in data:
            isbn = extract_isbn(text)
            if isbn:
                data['isbn'] = [isbn]
    if 'isbn' in data and isinstance(data['isbn'], str):
        data['isbn'] = [data['isbn']]
    if 'date' in data and len(data['date']) == 8 and data['date'].isdigit():
        d = data['date']
        data['date'] = '%s-%s-%s' % (d[:4], d[4:6], d[6:])
    if 'author' in data and isinstance(data['author'], str):
        data['author'] = data['author'].split(', ')
    return data