def metadata(self):
    # Build a Metadata object from the DOCX package: core properties,
    # a language fallback from the default style, and the extended
    # (app) properties.
    mi = Metadata(_('Unknown'))
    dp_name, ap_name = self.get_document_properties_names()
    if dp_name:
        try:
            raw = self.read(dp_name)
        except KeyError:
            pass  # The named part is missing from the package
        else:
            read_doc_props(raw, mi, self.namespace.XPath)
    if mi.is_null('language'):
        try:
            raw = self.read('word/styles.xml')
        except KeyError:
            pass
        else:
            read_default_style_language(raw, mi, self.namespace.XPath)
    ap_name = self.relationships.get(self.namespace.names['APPPROPS'], None)
    if ap_name:
        try:
            raw = self.read(ap_name)
        except KeyError:
            pass
        else:
            read_app_props(raw, mi)
    return mi
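# --- Illustrative sketch (not part of the source above) -------------------
# read_doc_props() is called above but not shown. Assuming it parses the
# OPC core-properties part (docProps/core.xml), a minimal standalone
# version might look like this. The Dublin Core namespace is standard
# OOXML; the function name and exact behavior here are hypothetical.
from lxml import etree

DC_NS = 'http://purl.org/dc/elements/1.1/'

def sketch_read_doc_props(raw, mi):
    root = etree.fromstring(raw)
    title = root.findtext('{%s}title' % DC_NS)
    if title:
        mi.title = title
    creator = root.findtext('{%s}creator' % DC_NS)
    if creator:
        # Core properties store the creator as a single string
        mi.authors = [creator.strip()]
    lang = root.findtext('{%s}language' % DC_NS)
    if lang:
        mi.language = lang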
def get_metadata(book_id):
    # Return both the current database metadata and the freshly
    # downloaded metadata for book_id, so the two can be compared.
    oldmi = db.get_metadata(book_id, index_is_id=True, get_cover=True,
                            cover_as_data=True)
    opf, cov = id_map[book_id]
    if opf is None:
        newmi = Metadata(oldmi.title, authors=tuple(oldmi.authors))
    else:
        with open(opf, 'rb') as f:
            newmi = OPF(f, basedir=os.path.dirname(opf),
                        populate_spine=False).to_book_metadata()
            newmi.cover, newmi.cover_data = None, (None, None)
            for x in ('title', 'authors'):
                if newmi.is_null(x):
                    # Title and author are set to null, as an
                    # optimization, when they are the same as the
                    # originals. We undo that here, as it is confusing.
                    newmi.set(x, copy.copy(oldmi.get(x)))
    if cov:
        with open(cov, 'rb') as f:
            newmi.cover_data = ('jpg', f.read())
    return oldmi, newmi
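# --- Illustrative sketch (not part of the source above) -------------------
# Hypothetical usage of get_metadata(): compare the stored record with the
# downloaded one before applying it. book_id, db and id_map are assumed to
# be in scope exactly as in the function above.
oldmi, newmi = get_metadata(book_id)
for field in ('title', 'authors', 'publisher'):
    old, new = oldmi.get(field), newmi.get(field)
    if old != new:
        print('%s will change: %r -> %r' % (field, old, new))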
def metadata(self):
    # Resolve the core (docProps/core.xml) and app (docProps/app.xml)
    # properties parts via the package relationships, falling back to a
    # case-insensitive search of the part names.
    mi = Metadata(_('Unknown'))
    name = self.relationships.get(DOCPROPS, None)
    if name is None:
        names = tuple(n for n in self.names if n.lower() == 'docprops/core.xml')
        if names:
            name = names[0]
    if name:
        try:
            raw = self.read(name)
        except KeyError:
            pass
        else:
            read_doc_props(raw, mi)
    if mi.is_null('language'):
        try:
            raw = self.read('word/styles.xml')
        except KeyError:
            pass
        else:
            read_default_style_language(raw, mi)
    name = self.relationships.get(APPPROPS, None)
    if name is None:
        names = tuple(n for n in self.names if n.lower() == 'docprops/app.xml')
        if names:
            name = names[0]
    if name:
        try:
            raw = self.read(name)
        except KeyError:
            pass
        else:
            read_app_props(raw, mi)
    return mi
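# --- Illustrative sketch (not part of the source above) -------------------
# DOCPROPS and APPPROPS above are relationship-type URIs from the OPC
# spec. A map like self.relationships could be built from the package-level
# _rels/.rels part roughly as follows; this is a hypothetical helper, not
# the implementation backing the code above.
from lxml import etree

RELS_NS = 'http://schemas.openxmlformats.org/package/2006/relationships'
DOCPROPS = RELS_NS + '/metadata/core-properties'
APPPROPS = ('http://schemas.openxmlformats.org/officeDocument/2006/'
            'relationships/extended-properties')

def sketch_read_relationships(raw):
    rels = {}
    for rel in etree.fromstring(raw).iter('{%s}Relationship' % RELS_NS):
        rels[rel.get('Type')] = rel.get('Target').lstrip('/')
    return rels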
def merge(self, results, min_year, do_asr=True):
    ans = Metadata(_('Unknown'))

    # We assume the shortest title has the least cruft in it
    ans.title = self.length_merge('title', results, null_value=ans.title)

    # No harm in having extra authors, maybe something useful like an
    # editor or translator
    ans.authors = self.length_merge('authors', results,
                                    null_value=ans.authors, shortest=False)

    # We assume the shortest publisher has the least cruft in it
    ans.publisher = self.length_merge('publisher', results,
                                      null_value=ans.publisher)

    # We assume the smallest set of tags has the least cruft in it
    ans.tags = self.length_merge('tags', results, null_value=ans.tags,
                                 shortest=msprefs['fewer_tags'])

    # We assume the longest series has the most info in it
    ans.series = self.length_merge('series', results,
                                   null_value=ans.series, shortest=False)
    for r in results:
        if r.series and r.series == ans.series:
            ans.series_index = r.series_index
            break

    # Average the rating over all sources
    ratings = []
    for r in results:
        rating = r.rating
        if rating and rating > 0 and rating <= 5:
            ratings.append(rating)
    if ratings:
        ans.rating = int(round(sum(ratings)/len(ratings)))

    # Smallest language is likely to be valid
    ans.language = self.length_merge('language', results,
                                     null_value=ans.language)

    # Choose longest comments
    ans.comments = self.length_merge('comments', results,
                                     null_value=ans.comments, shortest=False)

    # Published date
    if min_year:
        for r in results:
            year = getattr(r.pubdate, 'year', None)
            if year == min_year:
                ans.pubdate = r.pubdate
                break
        if getattr(ans.pubdate, 'year', None) == min_year:
            min_date = datetime(min_year, ans.pubdate.month,
                                ans.pubdate.day, tzinfo=utc_tz)
        else:
            # Jan 2 rather than Jan 1, presumably so the date cannot
            # slip into the previous year in western timezones
            min_date = datetime(min_year, 1, 2, tzinfo=utc_tz)
        ans.pubdate = min_date
    else:
        min_date = datetime(3001, 1, 1, tzinfo=utc_tz)
        for r in results:
            if r.pubdate is not None:
                candidate = as_utc(r.pubdate)
                if candidate < min_date:
                    min_date = candidate
        if min_date.year < 3000:
            ans.pubdate = min_date

    # Identifiers
    for r in results:
        ans.identifiers.update(r.identifiers)

    # Cover URL
    ans.has_cached_cover_url = bool(
        [r for r in results if getattr(r, 'has_cached_cover_url', False)])

    # Merge any other fields with no special handling (random merge)
    touched_fields = set()
    for r in results:
        if hasattr(r, 'identify_plugin'):
            touched_fields |= r.identify_plugin.touched_fields
    for f in touched_fields:
        if f.startswith('identifier:') or not ans.is_null(f):
            continue
        setattr(ans, f, self.random_merge(f, results,
                                          null_value=getattr(ans, f)))

    if do_asr:
        avg = [x.relevance_in_source for x in results]
        avg = sum(avg)/len(avg)
        ans.average_source_relevance = avg

    return ans
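# --- Illustrative sketch (not part of the source above) -------------------
# merge() leans on two helpers that are not shown: length_merge(), which
# picks the shortest (or longest) non-null value across results, and
# random_merge(), which just takes the first available value. Plausible
# versions, assuming result objects expose is_null() like Metadata does:
def length_merge(self, attr, results, null_value=None, shortest=True):
    values = [getattr(r, attr) for r in results if not r.is_null(attr)]
    values = [v for v in values if len(v) > 0]
    if not values:
        return null_value
    # Sort by length; shortest=False selects the longest value instead
    values.sort(key=len, reverse=not shortest)
    return values[0]

def random_merge(self, attr, results, null_value=None):
    values = [getattr(r, attr) for r in results if not r.is_null(attr)]
    return values[0] if values else null_value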