def importbib(db, bibfile, tags=None, overwrite=False):
    """Import every entry of a bibtex file into the database.

    Entries are processed in bibkey order.  For each entry an existing
    document is looked up, first by bibkey and then by every source id
    found in the entry; if nothing matches, a new Document is created.
    The entry, its attached file (if the entry names one) and 'tags'
    are added to the document, which is then synced.

    db        -- database object; must provide doc_for_bib() and
                 doc_for_source().
    bibfile   -- bibtex input, passed straight to Bibtex().
    tags      -- optional iterable of tags added to every document.
    overwrite -- accepted for interface compatibility; currently unused.

    Progress and errors are reported on stderr.  This function does
    not return: it calls sys.exit(1) if any entry raised BibtexError,
    and sys.exit(0) otherwise.
    """
    # Avoid a shared mutable default argument.
    if tags is None:
        tags = []

    errors = []
    sources = Sources()

    for entry in sorted(Bibtex(bibfile), key=lambda entry: entry.key):
        print >> sys.stderr, entry.key

        try:
            docs = []

            # check for doc with this bibkey
            bdoc = db.doc_for_bib(entry.key)
            if bdoc:
                docs.append(bdoc)

            # check for known sids
            for source in sources.scan_bibentry(entry):
                sdoc = db.doc_for_source(source.sid)
                # FIXME: why can't we match docs in list?
                # Documents are compared by docid to avoid duplicates.
                if sdoc and sdoc.docid not in [doc.docid for doc in docs]:
                    docs.append(sdoc)

            if not docs:
                doc = Document(db)
            else:
                if len(docs) > 1:
                    print >> sys.stderr, " Multiple distinct docs found for entry. Using first found."
                doc = docs[0]
                print >> sys.stderr, " Updating id:%d..." % (doc.docid)

            doc.add_bibentry(entry)

            filepath = entry.get_file()
            if filepath:
                print >> sys.stderr, " Adding file: %s" % filepath
                doc.add_file(filepath)

            doc.add_tags(tags)

            doc.sync()

        except BibtexError as e:
            # Record the failure and carry on with the remaining entries.
            print >> sys.stderr, " Error processing entry %s: %s" % (
                entry.key, e)
            print >> sys.stderr
            errors.append(entry.key)

    if errors:
        noun = "entry" if len(errors) == 1 else "entries"
        print >> sys.stderr
        print >> sys.stderr, "Failed to import %d %s from bibtex:" % (
            len(errors), noun)
        for error in errors:
            print >> sys.stderr, " %s" % (error)
        sys.exit(1)
    else:
        sys.exit(0)
class Document(object):
    """Represents a single Xapers document.

    A Document pairs a Xapian document with a directory (docdir) under
    the database root that holds the document's files, its bibtex file
    and its tag file.  Changes made through this object stay in memory
    until sync() is called.
    """

    def __init__(self, db, xapian_doc=None, docid=None):
        """Wrap an existing Xapian document or create a new empty one.

        db         -- Xapers database object.
        xapian_doc -- existing Xapian document to wrap; its docid is used.
        docid      -- docid to use for a new document; if omitted one is
                      generated by the database.

        Raises DocumentError if 'docid' is given and already present in
        the database.
        """
        # Xapers db
        self.db = db

        # if Xapian doc provided, initiate for that document
        if xapian_doc:
            self.xapian_doc = xapian_doc
            self.docid = xapian_doc.get_docid()

        # else, create a new empty document
        # document won't be added to database until sync is called
        else:
            self.xapian_doc = xapian.Document()
            # use specified docid if provided
            if docid:
                if docid in self.db:
                    raise DocumentError(
                        'Document already exists for id %d.' % docid)
                self.docid = docid
            else:
                self.docid = self.db._generate_docid()
            self._add_term(self.db._find_prefix('id'), self.docid)

        # specify a directory in the Xapers root for document data
        self.docdir = os.path.join(self.db.root, '%010d' % self.docid)

        # bibentry is loaded lazily by _load_bib()
        self.bibentry = None

        # cache of {file name: file data} written to docdir by sync()
        self._infiles = {}

    def get_docid(self):
        """Return document id of document."""
        return self.docid

    def _make_docdir(self):
        """Create docdir if missing.

        Raises DocumentError if a non-directory exists at that path.
        """
        if os.path.exists(self.docdir):
            if not os.path.isdir(self.docdir):
                raise DocumentError(
                    'File exists at intended docdir location: %s'
                    % self.docdir)
        else:
            os.makedirs(self.docdir)

    def _write_files(self):
        """Write all cached file data into docdir."""
        for name, data in self._infiles.items():
            path = os.path.join(self.docdir, name)
            # Binary mode: the data is arbitrary file content (see
            # add_file) and must be written back unmodified.
            with open(path, 'wb') as f:
                f.write(data)

    def _write_bibfile(self):
        """Write the bibentry, if there is one, to the docdir bibtex file."""
        bibpath = self.get_bibpath()
        # reload bibtex only if we have new files
        paths = self.get_fullpaths()
        if paths:
            self._load_bib()
        if self.bibentry:
            # we put only the first file in the bibtex
            # FIXME: does jabref/mendeley spec allow for multiple files?
            if paths and not self.bibentry.get_file():
                self.bibentry.set_file(paths[0])
            self.bibentry.to_file(bibpath)

    def _write_tagfile(self):
        """Write the document tags, one per line, to docdir/tags."""
        with open(os.path.join(self.docdir, 'tags'), 'w') as f:
            for tag in self.get_tags():
                f.write(tag)
                f.write('\n')

    def _rm_docdir(self):
        """Remove docdir and everything in it, if it is a directory."""
        if os.path.exists(self.docdir) and os.path.isdir(self.docdir):
            shutil.rmtree(self.docdir)

    def sync(self):
        """Sync document to database.

        Writes cached files, the bibtex file and the tag file into
        docdir, then replaces the document in the database.  Any
        exception is re-raised.  A docdir that this call created is
        removed again on failure; a docdir that already existed is left
        in place so that a failed update cannot destroy the stored data
        of an existing document.
        """
        # FIXME: add value for modification time
        # FIXME: catch db not writable errors
        docdir_existed = os.path.isdir(self.docdir)
        try:
            self._make_docdir()
            self._write_files()
            self._write_bibfile()
            self._write_tagfile()
            self.db.replace_document(self.docid, self.xapian_doc)
        except:
            # Deliberately catches everything: clean up, then re-raise.
            if not docdir_existed:
                self._rm_docdir()
            raise

    def purge(self):
        """Purge document from database and root."""
        # FIXME: catch db not writable errors
        try:
            self.db.delete_document(self.docid)
        except xapian.DocNotFoundError:
            # never synced to the database; nothing to delete there
            pass
        self._rm_docdir()
        self.docid = None

    ########################################
    # internal stuff

    def _add_term(self, prefix, value):
        """Add an individual prefix'd term to the document."""
        term = '%s%s' % (prefix, value)
        self.xapian_doc.add_term(term)

    def _remove_term(self, prefix, value):
        """Remove an individual prefix'd term from the document.

        An InvalidArgumentError from Xapian is ignored.
        """
        term = '%s%s' % (prefix, value)
        try:
            self.xapian_doc.remove_term(term)
        except xapian.InvalidArgumentError:
            pass

    # Parse 'text' and add a term to the document for each parsed
    # word.  Each term will be added both prefixed (if prefix is not
    # None) and non-prefixed.
    # http://xapian.org/docs/bindings/python/
    # http://xapian.org/docs/quickstart.html
    # http://www.flax.co.uk/blog/2009/04/02/xapian-search-architecture/
    def _gen_terms(self, prefix, text):
        """Index 'text' into the document using the db term generator."""
        term_gen = self.db.term_gen
        term_gen.set_document(self.xapian_doc)
        if prefix:
            term_gen.index_text(text, 1, prefix)
        term_gen.index_text(text)

    def _term_iter(self, prefix=None):
        """Generate the document's terms.

        Without 'prefix' every term is yielded unchanged.  With
        'prefix' only the terms starting with it are yielded, with the
        prefix stripped.
        """
        term_iter = iter(self.xapian_doc)

        if not prefix:
            for term in term_iter:
                yield term.term
            return

        plen = len(prefix)
        try:
            term = term_iter.skip_to(prefix)
        except StopIteration:
            # No term at or after the prefix.  Caught explicitly: a
            # StopIteration escaping a generator body is an error under
            # PEP 479.
            return
        if not term.term.startswith(prefix):
            return
        yield term.term[plen:]

        # Stop at the first term that no longer carries the prefix.
        for term in term_iter:
            if not term.term.startswith(prefix):
                break
            yield term.term[plen:]

    def term_iter(self, name=None):
        """Iterator over all terms in the document.

        If 'name' is provided it is looked up as a database prefix name;
        if no such prefix is found, 'name' itself is used as the prefix.
        Only the prefixed terms are then iterated, and the prefix is
        removed from the returned terms.
        """
        prefix = None
        if name:
            prefix = self.db._find_prefix(name)
            if not prefix:
                prefix = name
        return self._term_iter(prefix)

    def _set_data(self, text):
        """Set the data object for the document."""
        self.xapian_doc.set_data(text)

    def get_data(self):
        """Get data object for document."""
        return self.xapian_doc.get_data()

    ########################################
    # files

    def add_file_data(self, name, data):
        """Add a file data to document.

        'name' is the name of the file, 'data' is the file data.  The
        data is parsed to text and indexed, and a sample of the text
        becomes the document data.  File will not be copied in to docdir
        until sync().
        """
        # FIXME: set mime type term

        # parse the file data into text
        text = parse_data(data)

        # generate terms from the text
        self._gen_terms(None, text)

        # set data to be text sample
        # FIXME: is this the right thing to put in the data?
        summary = text[0:997] + '...'
        self._set_data(summary)

        # FIXME: should files be renamed to something generic (0.pdf)?
        prefix = self.db._find_prefix('file')
        self._add_term(prefix, name)

        # add it to the cache to be written at sync()
        self._infiles[name] = data

    def add_file(self, infile):
        """Add a file to document.

        Added file will have the same name.  File will not be copied in
        to docdir until sync().
        """
        # Binary mode so the content is read (and later written) verbatim.
        with open(infile, 'rb') as f:
            data = f.read()
        name = os.path.basename(infile)
        self.add_file_data(name, data)

    def get_files(self):
        """Return files associated with document."""
        return list(self.term_iter('file'))

    def get_fullpaths(self):
        """Return fullpaths of files associated with document."""
        # FIXME: the basename() is a hack for old path specifications
        # that included the docdir
        return [os.path.join(self.docdir, os.path.basename(path))
                for path in self.get_files()]

    ########################################

    # SOURCES
    def _purge_sources_prefix(self, source):
        """Remove every term stored for 'source', and the source term."""
        # purge all terms for a given source prefix
        prefix = self.db._make_source_prefix(source)
        # Materialize first: terms are removed while we go.
        for oid in list(self._term_iter(prefix)):
            self._remove_term(prefix, oid)
        self._remove_term(self.db._find_prefix('source'), source)

    def add_sid(self, sid):
        """Add source sid to document.

        'sid' has the form 'source:oid'.  Any terms already stored for
        the same source are replaced.
        """
        source, oid = sid.split(':', 1)
        source = source.lower()
        # remove any existing terms for this source
        self._purge_sources_prefix(source)
        # add a term for the source
        self._add_term(self.db._find_prefix('source'), source)
        # add a term for the sid, with source as prefix
        self._add_term(self.db._make_source_prefix(source), oid)

    def get_sids(self):
        """Return a list of sids for document."""
        sids = []
        for source in self.term_iter('source'):
            for oid in self._term_iter(self.db._make_source_prefix(source)):
                sids.append('%s:%s' % (source, oid))
        return sids

    # TAGS
    def add_tags(self, tags):
        """Add tags from list to document."""
        prefix = self.db._find_prefix('tag')
        for tag in tags:
            self._add_term(prefix, tag)

    def get_tags(self):
        """Return a list of tags associated with document."""
        return list(self.term_iter('tag'))

    def remove_tags(self, tags):
        """Remove tags from a document."""
        prefix = self.db._find_prefix('tag')
        for tag in tags:
            self._remove_term(prefix, tag)

    # TITLE
    def _set_title(self, title):
        """Replace the indexed title terms with those of 'title'."""
        pt = self.db._find_prefix('title')
        for term in list(self._term_iter(pt)):
            self._remove_term(pt, term)
        # FIXME: what's the clean way to get these prefixes?
        for term in list(self._term_iter('ZS')):
            self._remove_term('ZS', term)
        self._gen_terms(pt, title)

    # AUTHOR
    def _set_authors(self, authors):
        """Replace the indexed author terms with those of 'authors'."""
        pa = self.db._find_prefix('author')
        for term in list(self._term_iter(pa)):
            self._remove_term(pa, term)
        # FIXME: what's the clean way to get these prefixes?
        for term in list(self._term_iter('ZA')):
            self._remove_term('ZA', term)
        self._gen_terms(pa, authors)

    # YEAR
    def _set_year(self, year):
        """Replace the year term and, for numeric years, the year value.

        A year that cannot be converted to int is still stored as a
        term, but no sortable value is set for it.
        """
        try:
            year = int(year)
            numeric = True
        except ValueError:
            numeric = False
        prefix = self.db._find_prefix('year')
        # Remove every existing year term (not the new one).
        for term in list(self._term_iter(prefix)):
            self._remove_term(prefix, term)
        self._add_term(prefix, year)
        if numeric:
            facet = self.db._find_facet('year')
            self.xapian_doc.add_value(facet, xapian.sortable_serialise(year))

    ########################################
    # bibtex

    def get_bibpath(self):
        """Return path to document bibtex file."""
        return os.path.join(self.docdir, 'bibtex')

    def _set_bibkey(self, key):
        """Replace the document's bibtex key term with 'key'."""
        prefix = self.db._find_prefix('key')
        for term in list(self._term_iter(prefix)):
            self._remove_term(prefix, term)
        self._add_term(prefix, key)

    def _index_bibentry(self, bibentry):
        """Index title, year, authors, sources and key of 'bibentry'."""
        authors = bibentry.get_authors()
        fields = bibentry.get_fields()
        if 'title' in fields:
            self._set_title(fields['title'])
        if 'year' in fields:
            self._set_year(fields['year'])
        if authors:
            # authors should be a list, so we make a single text string
            # FIXME: better way to do this?
            self._set_authors(' '.join(authors))

        # add any sources in the bibtex
        for source in Sources().scan_bibentry(bibentry):
            self.add_sid(source.sid)

        # FIXME: index 'keywords' field as regular terms

        self._set_bibkey(bibentry.key)

    def add_bibentry(self, bibentry):
        """Add bibentry object."""
        self.bibentry = bibentry
        self._index_bibentry(self.bibentry)

    def add_bibtex(self, bibtex):
        """Add bibtex to document, as string or file path."""
        self.add_bibentry(Bibtex(bibtex)[0])

    def _load_bib(self):
        """Load bibentry from the docdir bibtex file, unless already set."""
        if self.bibentry:
            return
        bibpath = self.get_bibpath()
        if os.path.exists(bibpath):
            self.bibentry = Bibtex(bibpath)[0]

    def get_bibtex(self):
        """Get the bib for document as a bibtex string.

        Returns None if the document has no bibtex file.
        """
        bibpath = self.get_bibpath()
        if not os.path.exists(bibpath):
            return None
        with open(bibpath, 'r') as f:
            bibtex = f.read().decode('utf-8')
        return bibtex.strip()

    def get_bibdata(self):
        """Return the bibentry fields plus an 'authors' item, or None."""
        self._load_bib()
        if not self.bibentry:
            return None
        data = self.bibentry.get_fields()
        data['authors'] = self.bibentry.get_authors()
        return data

    def update_from_bibtex(self):
        """Update document metadata from document bibtex.

        Does nothing if the document has no bibtex.
        """
        self._load_bib()
        if self.bibentry:
            self._index_bibentry(self.bibentry)

    ########################################

    def get_key(self):
        """Get the bibtex key from document bibtex, or None."""
        self._load_bib()
        if not self.bibentry:
            return None
        return self.bibentry.key

    def get_title(self):
        """Get the title from document bibtex, or None."""
        self._load_bib()
        if not self.bibentry:
            return None
        return self.bibentry.get_fields().get('title')

    def get_year(self):
        """Get the year from document bibtex, or None."""
        self._load_bib()
        if not self.bibentry:
            return None
        return self.bibentry.get_fields().get('year')

    def get_urls(self):
        """Get all URLs associated with document."""
        sources = Sources()
        urls = []
        # get urls associated with known sources
        for sid in self.get_sids():
            urls.append(sources[sid].url)
        # get urls from bibtex
        self._load_bib()
        if self.bibentry:
            fields = self.bibentry.get_fields()
            if 'url' in fields:
                urls.append(fields['url'])
            if 'adsurl' in fields:
                urls.append(fields['adsurl'])
        return urls
def _load_bib(self):
    """Populate self.bibentry from the document's bibtex file.

    Nothing is done when a bibentry is already set or when the bibtex
    file does not exist; otherwise the first entry of the file is used.
    """
    if not self.bibentry:
        path = self.get_bibpath()
        if os.path.exists(path):
            entries = Bibtex(path)
            self.bibentry = entries[0]
def add_bibtex(self, bibtex):
    """Parse 'bibtex' (a string or a file path) and add its first entry."""
    first_entry = Bibtex(bibtex)[0]
    self.add_bibentry(first_entry)
class Document(object):
    """Represents a single Xapers document.

    A Document pairs a Xapian document with a directory (docdir) under
    the database root that holds the document's files, its bibtex file
    and its tag file.  Changes made through this object stay in memory
    until sync() is called.
    """

    def __init__(self, db, doc=None, docid=None):
        """Wrap an existing Xapian document or create a new empty one.

        db    -- Xapers database object.
        doc   -- existing Xapian document to wrap; its docid is used.
        docid -- docid to use for a new document; if omitted one is
                 generated by the database.

        Raises DocumentError if 'docid' is given and the database
        already holds a document for it.
        """
        # Xapers db
        self.db = db
        self.root = self.db.root

        # if Xapian doc provided, initiate for that document
        if doc:
            self.doc = doc
            self.docid = str(doc.get_docid())

        # else, create a new empty document
        # document won't be added to database until sync is called
        else:
            self.doc = xapian.Document()
            # use specified docid if provided
            if docid:
                if self.db[docid]:
                    raise DocumentError(
                        'Document already exists for id %s.' % docid)
                self.docid = docid
            else:
                self.docid = str(self.db._generate_docid())
            self._add_term(self.db._find_prefix('id'), self.docid)

        # specify a directory in the Xapers root for document data
        self.docdir = os.path.join(self.root, '%010d' % int(self.docid))

        # bibentry is loaded lazily by _load_bib(); it must always exist
        # because _load_bib() and _write_bibfile() read it.
        self.bibentry = None

        # cache of {source path: destination path} copied by sync()
        self._infiles = {}

    def get_docid(self):
        """Return document id of document."""
        return self.docid

    def _make_docdir(self):
        """Create docdir if missing.

        Raises DocumentError if a non-directory exists at that path.
        """
        if os.path.exists(self.docdir):
            if not os.path.isdir(self.docdir):
                raise DocumentError(
                    'File exists at intended docdir location: %s'
                    % self.docdir)
        else:
            os.makedirs(self.docdir)

    def _write_files(self):
        """Copy all cached input files to their destination paths."""
        # getattr keeps this safe for instances built without __init__.
        for infile, outfile in getattr(self, '_infiles', {}).items():
            try:
                shutil.copyfile(infile, outfile)
            except shutil.Error:
                # copyfile raises this when source and destination are
                # the same file; nothing to copy in that case.
                pass

    def _write_bibfile(self):
        """Write the bibentry, if there is one, to the docdir bibtex file."""
        bibpath = self.get_bibpath()
        # reload bibtex only if we have new files
        paths = self.get_fullpaths()
        if paths:
            self._load_bib()
        if self.bibentry:
            # we put only the first file in the bibtex
            # FIXME: does jabref/mendeley spec allow for multiple files?
            if paths and not self.bibentry.get_file():
                self.bibentry.set_file(paths[0])
            self.bibentry.to_file(bibpath)

    def _write_tagfile(self):
        """Write the document tags, one per line, to docdir/tags."""
        with open(os.path.join(self.docdir, 'tags'), 'w') as f:
            for tag in self.get_tags():
                f.write(tag)
                f.write('\n')

    def _rm_docdir(self):
        """Remove docdir and everything in it, if it is a directory."""
        if os.path.exists(self.docdir) and os.path.isdir(self.docdir):
            shutil.rmtree(self.docdir)

    def sync(self):
        """Sync document to database.

        Copies cached files and writes the bibtex file and the tag file
        into docdir, then replaces the document in the database.  Any
        exception is re-raised.  A docdir that this call created is
        removed again on failure; a docdir that already existed is left
        in place so that a failed update cannot destroy the stored data
        of an existing document.
        """
        # FIXME: add value for modification time
        # FIXME: catch db not writable errors
        docdir_existed = os.path.isdir(self.docdir)
        try:
            self._make_docdir()
            self._write_files()
            self._write_bibfile()
            self._write_tagfile()
            self.db.replace_document(self.docid, self.doc)
        except:
            # Deliberately catches everything: clean up, then re-raise.
            if not docdir_existed:
                self._rm_docdir()
            raise

    def purge(self):
        """Purge document from database and root."""
        # FIXME: catch db not writable errors
        try:
            self.db.delete_document(self.docid)
        except xapian.DocNotFoundError:
            # never synced to the database; nothing to delete there
            pass
        self._rm_docdir()
        self.docid = None

    ########################################
    # internal stuff

    def _add_term(self, prefix, value):
        """Add an individual prefix'd term to the document."""
        term = '%s%s' % (prefix, value)
        self.doc.add_term(term)

    def _remove_term(self, prefix, value):
        """Remove an individual prefix'd term from the document.

        An InvalidArgumentError from Xapian is ignored.
        """
        term = '%s%s' % (prefix, value)
        try:
            self.doc.remove_term(term)
        except xapian.InvalidArgumentError:
            pass

    # Parse 'text' and add a term to the document for each parsed
    # word.  Each term will be added both prefixed (if prefix is not
    # None) and also non-prefixed.
    # http://xapian.org/docs/bindings/python/
    # http://xapian.org/docs/quickstart.html
    # http://www.flax.co.uk/blog/2009/04/02/xapian-search-architecture/
    def _gen_terms(self, prefix, text):
        """Index 'text' into the document using the db term generator."""
        term_gen = self.db.term_gen
        term_gen.set_document(self.doc)
        if prefix:
            term_gen.index_text(text, 1, prefix)
        term_gen.index_text(text)

    # FIXME: is this the fastest way to do this?
    def _get_terms(self, prefix):
        """Return a list of the document's terms that start with 'prefix'.

        The prefix is stripped from the returned terms.
        """
        encoded = prefix.encode("utf-8")
        # Strip by the encoded length, since that is what was matched.
        plen = len(encoded)
        return [term.term[plen:] for term in self.doc
                if term.term.startswith(encoded)]

    def _set_data(self, text):
        """Set the data object for the document."""
        self.doc.set_data(text)

    def get_data(self):
        """Get data object for document."""
        return self.doc.get_data()

    ########################################
    # files

    def _index_file(self, path):
        """Index the text of the file at 'path'; return a text sample.

        The sample is the start of the parsed text with newlines
        removed, followed by '...'.
        """
        text = parse_file(path)
        self._gen_terms(None, text)
        # replace() rather than translate(None, ...): the latter only
        # works on byte strings.
        summary = text[0:997].replace('\n', '') + '...'
        return summary

    def _add_path(self, path):
        """Add a file term for 'path' as normalized by the database."""
        # NOTE(review): _basename_for_path is assumed to return a
        # (base, full) pair; only the first element is used here.
        base, _full = self.db._basename_for_path(path)
        prefix = self.db._find_prefix('file')
        self._add_term(prefix, base)

    def _get_paths(self):
        """Return the document's file terms."""
        return self._get_terms(self.db._find_prefix('file'))

    def get_fullpaths(self):
        """Return fullpaths associated with document."""
        fullpaths = []
        for path in self._get_paths():
            # FIXME: this is a hack for old bad path specifications and
            # should be removed
            if path.startswith(self.root):
                path = path[len(self.root) + 1:]
            path = path.lstrip('/')
            # FIXME
            _base, full = self.db._basename_for_path(path)
            fullpaths.append(full)
        return fullpaths

    def add_file(self, infile):
        """Add a file to document.

        The file is indexed immediately and a sample of its text becomes
        the document data.  File will not be copied in to docdir until
        sync().
        """
        # FIXME: should load entire file into {name: file} to be
        # written as file>docdir/name

        # FIXME: set mime type term

        summary = self._index_file(infile)

        # set data to be text sample
        # FIXME: is this the right thing to put in the data?
        self._set_data(summary)

        # FIXME: should files be renamed to something generic (0.pdf)?
        outfile = os.path.join(self.docdir, os.path.basename(infile))

        base, _full = self.db._basename_for_path(outfile)
        self._add_path(base)

        # add it to the cache to be written at sync()
        if getattr(self, '_infiles', None) is None:
            self._infiles = {}
        self._infiles[infile] = outfile

    ########################################

    # SOURCES
    def _purge_sources_prefix(self, source):
        """Remove every term stored for 'source', and the source term."""
        # purge all terms for a given source prefix
        prefix = self.db._make_source_prefix(source)
        for oid in self._get_terms(prefix):
            self._remove_term(prefix, oid)
        self._remove_term(self.db._find_prefix('source'), source)

    def add_sid(self, sid):
        """Add source sid to document.

        'sid' has the form 'source:oid'.  Any terms already stored for
        the same source are replaced.
        """
        source, oid = sid.split(':', 1)
        source = source.lower()
        # remove any existing terms for this source
        self._purge_sources_prefix(source)
        # add a term for the source
        self._add_term(self.db._find_prefix('source'), source)
        # add a term for the sid, with source as prefix
        self._add_term(self.db._make_source_prefix(source), oid)

    def get_sids(self):
        """Return a list of sids for document."""
        sids = []
        for source in self._get_terms(self.db._find_prefix('source')):
            for oid in self._get_terms(self.db._make_source_prefix(source)):
                sids.append('%s:%s' % (source, oid))
        return sids

    # BIBTEX KEYS
    def get_keys(self):
        """Return a list of bibtex citation keys associated with document."""
        prefix = self.db._find_prefix('key')
        return self._get_terms(prefix)

    # TAGS
    def add_tags(self, tags):
        """Add tags from list to document."""
        prefix = self.db._find_prefix('tag')
        for tag in tags:
            self._add_term(prefix, tag)

    def get_tags(self):
        """Return a list of tags associated with document."""
        prefix = self.db._find_prefix('tag')
        return self._get_terms(prefix)

    def remove_tags(self, tags):
        """Remove tags from a document."""
        prefix = self.db._find_prefix('tag')
        for tag in tags:
            self._remove_term(prefix, tag)

    # TITLE
    def _set_title(self, title):
        """Replace the indexed title terms with those of 'title'."""
        pt = self.db._find_prefix('title')
        for term in self._get_terms(pt):
            self._remove_term(pt, term)
        # FIXME: what's the clean way to get these prefixes?
        for term in self._get_terms('ZS'):
            self._remove_term('ZS', term)
        self._gen_terms(pt, title)

    # AUTHOR
    def _set_authors(self, authors):
        """Replace the indexed author terms with those of 'authors'."""
        pa = self.db._find_prefix('author')
        for term in self._get_terms(pa):
            self._remove_term(pa, term)
        # FIXME: what's the clean way to get these prefixes?
        for term in self._get_terms('ZA'):
            self._remove_term('ZA', term)
        self._gen_terms(pa, authors)

    # YEAR
    def _set_year(self, year):
        """Placeholder: the year is currently not indexed."""
        # FIXME: this should be a value
        pass

    ########################################
    # bibtex

    def get_bibpath(self):
        """Return path to document bibtex file."""
        return os.path.join(self.docdir, 'bibtex')

    def _set_bibkey(self, key):
        """Replace the document's bibtex key term with 'key'."""
        prefix = self.db._find_prefix('key')
        for term in self._get_terms(prefix):
            self._remove_term(prefix, term)
        self._add_term(prefix, key)

    def _index_bibentry(self, bibentry):
        """Index title, year, authors, sources and key of 'bibentry'."""
        authors = bibentry.get_authors()
        fields = bibentry.get_fields()
        if 'title' in fields:
            self._set_title(fields['title'])
        if 'year' in fields:
            self._set_year(fields['year'])
        if authors:
            # authors should be a list, so we make a single text string
            # FIXME: better way to do this?
            self._set_authors(' '.join(authors))

        # add any sources in the bibtex
        for sid in scan_bibentry_for_sources(bibentry):
            self.add_sid(sid)

        # FIXME: index 'keywords' field as regular terms

        self._set_bibkey(bibentry.key)

    def add_bibentry(self, bibentry):
        """Add bibentry object."""
        self.bibentry = bibentry
        self._index_bibentry(self.bibentry)

    def add_bibtex(self, bibtex):
        """Add bibtex to document, as string or file path."""
        self.add_bibentry(Bibtex(bibtex)[0])

    def _load_bib(self):
        """Load bibentry from the docdir bibtex file, unless already set."""
        if self.bibentry:
            return
        bibpath = self.get_bibpath()
        if os.path.exists(bibpath):
            self.bibentry = Bibtex(bibpath)[0]

    def get_bibtex(self):
        """Get the bib for document as a bibtex string, or None."""
        self._load_bib()
        if self.bibentry:
            return self.bibentry.as_string()
        return None

    def get_bibdata(self):
        """Return the bibentry fields plus an 'authors' item, or None."""
        self._load_bib()
        if not self.bibentry:
            return None
        data = self.bibentry.get_fields()
        data['authors'] = self.bibentry.get_authors()
        return data

    def update_from_bibtex(self):
        """Update document metadata from document bibtex.

        Does nothing if the document has no bibtex.
        """
        self._load_bib()
        if self.bibentry:
            self._index_bibentry(self.bibentry)

    ########################################

    def get_title(self):
        """Get the title from document bibtex, or None."""
        self._load_bib()
        if not self.bibentry:
            return None
        return self.bibentry.get_fields().get('title')

    def get_urls(self):
        """Get all URLs associated with document."""
        urls = []
        # get urls associated with known sources
        for sid in self.get_sids():
            smod = get_source(sid)
            urls.append(smod.gen_url())
        # get urls from bibtex
        self._load_bib()
        if self.bibentry:
            fields = self.bibentry.get_fields()
            if 'url' in fields:
                urls.append(fields['url'])
            if 'adsurl' in fields:
                urls.append(fields['adsurl'])
        return urls