Ejemplo n.º 1
0
def importbib(db, bibfile, tags=None, overwrite=False):
    """Import every entry of a bibtex file into the database, then exit.

    Entries are processed in order of their bibtex key.  For each entry an
    existing document is looked up, first by bibtex key and then by every
    source id found in the entry; if none is found a new Document is
    created.  The entry, its attached file (if any) and 'tags' are added
    to the document, which is then synced.

    Progress and errors are reported on stderr.  This function does not
    return: it calls sys.exit(1) if any entry raised BibtexError, and
    sys.exit(0) otherwise.

    db        -- database providing doc_for_bib() and doc_for_source()
    bibfile   -- bibtex input handed to Bibtex()
    tags      -- iterable of tags added to every imported document
                 (default: no tags)
    overwrite -- accepted for interface compatibility; not used here
    """
    # a fresh list per call instead of a mutable default shared by all calls
    if tags is None:
        tags = []

    errors = []

    sources = Sources()

    for entry in sorted(Bibtex(bibfile), key=lambda entry: entry.key):
        print >> sys.stderr, entry.key

        try:
            docs = []

            # check for doc with this bibkey
            bdoc = db.doc_for_bib(entry.key)
            if bdoc:
                docs.append(bdoc)

            # check for known sids
            for source in sources.scan_bibentry(entry):
                sdoc = db.doc_for_source(source.sid)
                # FIXME: why can't we match docs in list?
                # compare by docid so the same document is not added twice
                if sdoc and sdoc.docid not in [d.docid for d in docs]:
                    docs.append(sdoc)

            if not docs:
                # nothing known for this entry: start a new document
                doc = Document(db)
            else:
                if len(docs) > 1:
                    print >> sys.stderr, "  Multiple distinct docs found for entry.  Using first found."
                doc = docs[0]
                print >> sys.stderr, "  Updating id:%d..." % (doc.docid)

            doc.add_bibentry(entry)

            filepath = entry.get_file()
            if filepath:
                print >> sys.stderr, "  Adding file: %s" % filepath
                doc.add_file(filepath)

            doc.add_tags(tags)

            doc.sync()

        except BibtexError as e:
            # record the failure and keep going with the remaining entries
            print >> sys.stderr, "  Error processing entry %s: %s" % (
                entry.key, e)
            print >> sys.stderr
            errors.append(entry.key)

    if errors:
        # trailing commas keep the pieces of the summary on a single line
        print >> sys.stderr
        print >> sys.stderr, "Failed to import %d" % (len(errors)),
        if len(errors) == 1:
            print >> sys.stderr, "entry",
        else:
            print >> sys.stderr, "entries",
        print >> sys.stderr, "from bibtex:"
        for error in errors:
            print >> sys.stderr, "  %s" % (error)
        sys.exit(1)
    else:
        sys.exit(0)
Ejemplo n.º 2
0
class Document():
    """Represents a single Xapers document.

    A document wraps a Xapian document (self.xapian_doc) holding its index
    terms, plus a per-document directory (self.docdir) under the database
    root that holds the document's files, its 'bibtex' file and its 'tags'
    file.  Changes are made in memory and only written out by sync().
    """

    def __init__(self, db, xapian_doc=None, docid=None):
        """Wrap an existing Xapian document or create a new empty one.

        db         -- Xapers database the document belongs to
        xapian_doc -- existing Xapian document to wrap; when given, the
                      document id is read from it and 'docid' is ignored
        docid      -- id for a newly created document; when omitted, one
                      is requested from the database

        Raises DocumentError if 'docid' is given and is already in the
        database.
        """
        # Xapers db
        self.db = db

        # if Xapian doc provided, initiate for that document
        if xapian_doc:
            self.xapian_doc = xapian_doc
            self.docid = xapian_doc.get_docid()

        # else, create a new empty document
        # document won't be added to database until sync is called
        else:
            self.xapian_doc = xapian.Document()
            # use specified docid if provided
            if docid:
                if docid in self.db:
                    raise DocumentError('Document already exists for id %d.' %
                                        docid)
                self.docid = docid
            else:
                self.docid = self.db._generate_docid()
            self._add_term(self.db._find_prefix('id'), self.docid)

        # specify a directory in the Xapers root for document data
        self.docdir = os.path.join(self.db.root, '%010d' % self.docid)

        # bibtex entry for the document; loaded lazily by _load_bib()
        self.bibentry = None

        # {file name: file data} cache of files to be written at sync()
        self._infiles = {}

    def get_docid(self):
        """Return document id of document."""
        return self.docid

    def _make_docdir(self):
        """Create the document directory if it does not exist yet.

        Raises DocumentError if a non-directory already occupies the path.
        """
        if os.path.exists(self.docdir):
            if not os.path.isdir(self.docdir):
                raise DocumentError(
                    'File exists at intended docdir location: %s' %
                    self.docdir)
        else:
            os.makedirs(self.docdir)

    def _write_files(self):
        """Write all cached file data into the document directory."""
        for name, data in self._infiles.iteritems():
            path = os.path.join(self.docdir, name)
            # binary mode: the data is written back exactly as it was read
            with open(path, 'wb') as f:
                f.write(data)

    def _write_bibfile(self):
        """Write the document's bibtex entry, if any, to its bibtex file."""
        bibpath = self.get_bibpath()
        # reload bibtex only if we have new files
        paths = self.get_fullpaths()
        if paths:
            self._load_bib()
        if self.bibentry:
            # we put only the first file in the bibtex
            # FIXME: does jabref/mendeley spec allow for multiple files?
            if paths and not self.bibentry.get_file():
                self.bibentry.set_file(paths[0])
            self.bibentry.to_file(bibpath)

    def _write_tagfile(self):
        """Write the document's tags, one per line, to the 'tags' file."""
        with open(os.path.join(self.docdir, 'tags'), 'w') as f:
            for tag in self.get_tags():
                f.write(tag)
                f.write('\n')

    def _rm_docdir(self):
        """Remove the document directory and everything in it."""
        if os.path.exists(self.docdir) and os.path.isdir(self.docdir):
            shutil.rmtree(self.docdir)

    def sync(self):
        """Sync document to database.

        Creates the document directory, writes cached files, the bibtex
        file and the tags file, then stores the Xapian document in the
        database.  If any step fails the document directory is removed
        and the exception is re-raised.
        """
        # FIXME: add value for modification time
        # FIXME: catch db not writable errors
        # NOTE(review): on failure the whole docdir is removed, including
        # anything written by an earlier successful sync -- confirm this is
        # intended.
        try:
            self._make_docdir()
            self._write_files()
            self._write_bibfile()
            self._write_tagfile()
            self.db.replace_document(self.docid, self.xapian_doc)
        except:
            # clean up on any failure whatsoever; the error is re-raised
            self._rm_docdir()
            raise

    def purge(self):
        """Purge document from database and root.

        A document missing from the database is not an error.  The
        document id is reset to None afterwards.
        """
        # FIXME: catch db not writable errors
        try:
            self.db.delete_document(self.docid)
        except xapian.DocNotFoundError:
            pass
        self._rm_docdir()
        self.docid = None

    ########################################
    # internal stuff

    # add an individual prefix'd term for the document
    def _add_term(self, prefix, value):
        term = '%s%s' % (prefix, value)
        self.xapian_doc.add_term(term)

    # remove an individual prefix'd term for the document
    # (removing a term that is not present is silently ignored)
    def _remove_term(self, prefix, value):
        term = '%s%s' % (prefix, value)
        try:
            self.xapian_doc.remove_term(term)
        except xapian.InvalidArgumentError:
            pass

    # Parse 'text' and add a term to the document for each parsed
    # word. Each term will be added both prefixed (if prefix is not
    # None) and non-prefixed.
    # http://xapian.org/docs/bindings/python/
    # http://xapian.org/docs/quickstart.html
    # http://www.flax.co.uk/blog/2009/04/02/xapian-search-architecture/
    def _gen_terms(self, prefix, text):
        term_gen = self.db.term_gen
        term_gen.set_document(self.xapian_doc)
        if prefix:
            term_gen.index_text(text, 1, prefix)
        term_gen.index_text(text)

    # generate the terms of the document; with a prefix, only the terms
    # starting with it are generated, with the prefix stripped
    def _term_iter(self, prefix=None):
        term_iter = iter(self.xapian_doc)
        if not prefix:
            for term in term_iter:
                yield term.term
            return
        plen = len(prefix)
        try:
            term = term_iter.skip_to(prefix)
        except StopIteration:
            # no term at or after the prefix; finish cleanly instead of
            # letting StopIteration escape from the generator body
            return
        if not term.term.startswith(prefix):
            return
        yield term.term[plen:]
        for term in term_iter:
            # stop at the first term that no longer carries the prefix
            if not term.term.startswith(prefix):
                break
            yield term.term[plen:]

    def term_iter(self, name=None):
        """Iterator over all terms in the document.

        If a name is provided it is looked up as a prefix name in the
        database (falling back to using it as the literal prefix); only
        terms with that prefix are returned, with the prefix removed.

        """
        prefix = None
        if name:
            prefix = self.db._find_prefix(name)
            if not prefix:
                prefix = name
        return self._term_iter(prefix)

    # set the data object for the document
    def _set_data(self, text):
        self.xapian_doc.set_data(text)

    def get_data(self):
        """Get data object for document."""
        return self.xapian_doc.get_data()

    ########################################
    # files

    def add_file_data(self, name, data):
        """Add file data to document.

        'name' is the name of the file, 'data' is the file data.

        The data is parsed to text and indexed, and the start of the text
        is stored as the document data.

        File will not be copied into docdir until sync().
        """
        # FIXME: set mime type term

        # parse the file data into text
        text = parse_data(data)

        # generate terms from the text
        self._gen_terms(None, text)

        # set data to be text sample
        # FIXME: is this the right thing to put in the data?
        summary = text[0:997] + '...'
        self._set_data(summary)

        # FIXME: should files be renamed to something generic (0.pdf)?
        prefix = self.db._find_prefix('file')
        self._add_term(prefix, name)

        # add it to the cache to be written at sync()
        self._infiles[name] = data

    def add_file(self, infile):
        """Add a file to document.

        Added file will have the same name.

        File will not be copied into docdir until sync().
        """
        # binary mode: the contents are stored and later written verbatim
        with open(infile, 'rb') as f:
            data = f.read()
        name = os.path.basename(infile)
        self.add_file_data(name, data)

    def get_files(self):
        """Return files associated with document."""
        return list(self.term_iter('file'))

    def get_fullpaths(self):
        """Return fullpaths of files associated with document."""
        fullpaths = []
        for path in self.get_files():
            # FIXME: this is a hack for old path specifications that
            # included the docdir
            path = os.path.basename(path)
            fullpaths.append(os.path.join(self.docdir, path))
        return fullpaths

    ########################################

    # SOURCES
    def _purge_sources_prefix(self, source):
        # purge all terms for a given source prefix
        prefix = self.db._make_source_prefix(source)
        # snapshot the terms first: the document is modified in the loop
        for i in list(self._term_iter(prefix)):
            self._remove_term(prefix, i)
        self._remove_term(self.db._find_prefix('source'), source)

    def add_sid(self, sid):
        """Add source sid to document.

        'sid' has the form '<source>:<id>'; the source part is lowercased.
        Any existing terms for the same source are replaced.
        """
        source, oid = sid.split(':', 1)
        source = source.lower()
        # remove any existing terms for this source
        self._purge_sources_prefix(source)
        # add a term for the source
        self._add_term(self.db._find_prefix('source'), source)
        # add a term for the sid, with source as prefix
        self._add_term(self.db._make_source_prefix(source), oid)

    def get_sids(self):
        """Return a list of sids for document."""
        sids = []
        for source in self.term_iter('source'):
            for oid in self._term_iter(self.db._make_source_prefix(source)):
                sids.append('%s:%s' % (source, oid))
        return sids

    # TAGS
    def add_tags(self, tags):
        """Add tags from list to document."""
        prefix = self.db._find_prefix('tag')
        for tag in tags:
            self._add_term(prefix, tag)

    def get_tags(self):
        """Return a list of tags associated with document."""
        return list(self.term_iter('tag'))

    def remove_tags(self, tags):
        """Remove tags from a document."""
        prefix = self.db._find_prefix('tag')
        for tag in tags:
            self._remove_term(prefix, tag)

    # TITLE
    def _set_title(self, title):
        pt = self.db._find_prefix('title')
        # snapshot the terms first: the document is modified in the loop
        for term in list(self._term_iter(pt)):
            self._remove_term(pt, term)
        # FIXME: what's the clean way to get these prefixes?
        for term in list(self._term_iter('ZS')):
            self._remove_term('ZS', term)
        self._gen_terms(pt, title)

    # AUTHOR
    def _set_authors(self, authors):
        pa = self.db._find_prefix('author')
        # snapshot the terms first: the document is modified in the loop
        for term in list(self._term_iter(pa)):
            self._remove_term(pa, term)
        # FIXME: what's the clean way to get these prefixes?
        for term in list(self._term_iter('ZA')):
            self._remove_term('ZA', term)
        self._gen_terms(pa, authors)

    # YEAR
    def _set_year(self, year):
        # A year that does not parse as an integer is still indexed as a
        # term, but gets no numeric sort value.
        try:
            year = int(year)
            is_numeric = True
        except ValueError:
            is_numeric = False
        prefix = self.db._find_prefix('year')
        # drop every existing year term (snapshot first: the document is
        # modified in the loop)
        for term in list(self._term_iter(prefix)):
            self._remove_term(prefix, term)
        self._add_term(prefix, year)
        if is_numeric:
            facet = self.db._find_facet('year')
            self.xapian_doc.add_value(facet, xapian.sortable_serialise(year))

    ########################################
    # bibtex

    def get_bibpath(self):
        """Return path to document bibtex file."""
        return os.path.join(self.docdir, 'bibtex')

    def _set_bibkey(self, key):
        prefix = self.db._find_prefix('key')
        # snapshot the terms first: the document is modified in the loop
        for term in list(self._term_iter(prefix)):
            self._remove_term(prefix, term)
        self._add_term(prefix, key)

    def _index_bibentry(self, bibentry):
        """Index title, year, authors, sources and key of a bibtex entry."""
        authors = bibentry.get_authors()
        fields = bibentry.get_fields()
        if 'title' in fields:
            self._set_title(fields['title'])
        if 'year' in fields:
            self._set_year(fields['year'])
        if authors:
            # authors should be a list, so we make a single text string
            # FIXME: better way to do this?
            self._set_authors(' '.join(authors))

        # add any sources in the bibtex
        for source in Sources().scan_bibentry(bibentry):
            self.add_sid(source.sid)

        # FIXME: index 'keywords' field as regular terms

        self._set_bibkey(bibentry.key)

    def add_bibentry(self, bibentry):
        """Add bibentry object."""
        self.bibentry = bibentry
        self._index_bibentry(self.bibentry)

    def add_bibtex(self, bibtex):
        """Add bibtex to document, as string or file path.

        Only the first entry in the bibtex is used.
        """
        self.add_bibentry(Bibtex(bibtex)[0])

    def _load_bib(self):
        """Load the bibtex entry from the bibtex file, if not loaded yet."""
        if self.bibentry:
            return
        bibpath = self.get_bibpath()
        if os.path.exists(bibpath):
            self.bibentry = Bibtex(bibpath)[0]

    def get_bibtex(self):
        """Get the bib for document as a bibtex string.

        Returns None if the document has no bibtex file.
        """
        bibpath = self.get_bibpath()
        if os.path.exists(bibpath):
            with open(bibpath, 'r') as f:
                bibtex = f.read().decode('utf-8')
            return bibtex.strip()

    def get_bibdata(self):
        """Return the bibtex fields plus an 'authors' entry.

        Returns None if the document has no bibtex entry.
        """
        self._load_bib()
        if self.bibentry:
            data = self.bibentry.get_fields()
            data['authors'] = self.bibentry.get_authors()
            return data

    def update_from_bibtex(self):
        """Update document metadata from document bibtex."""
        self._load_bib()
        self._index_bibentry(self.bibentry)

    ########################################

    def get_key(self):
        """Get the bibtex key, or None if there is no bibtex entry."""
        self._load_bib()
        if not self.bibentry:
            return
        return self.bibentry.key

    def get_title(self):
        """Get the title from document bibtex."""
        self._load_bib()
        if not self.bibentry:
            return
        fields = self.bibentry.get_fields()
        if 'title' in fields:
            return fields['title']

    def get_year(self):
        """Get the year from document bibtex."""
        self._load_bib()
        if not self.bibentry:
            return
        fields = self.bibentry.get_fields()
        if 'year' in fields:
            return fields['year']

    def get_urls(self):
        """Get all URLs associated with document."""
        sources = Sources()
        urls = []
        # get urls associated with known sources
        for sid in self.get_sids():
            urls.append(sources[sid].url)
        # get urls from bibtex
        self._load_bib()
        if self.bibentry:
            fields = self.bibentry.get_fields()
            if 'url' in fields:
                urls.append(fields['url'])
            if 'adsurl' in fields:
                urls.append(fields['adsurl'])
        return urls
Ejemplo n.º 3
0
 def _load_bib(self):
     """Load the document's bibtex entry from disk, unless already loaded.

     Does nothing when self.bibentry is already set or when the bibtex
     file does not exist.
     """
     if not self.bibentry:
         path = self.get_bibpath()
         if os.path.exists(path):
             # only the first entry in the file is used
             self.bibentry = Bibtex(path)[0]
Ejemplo n.º 4
0
 def add_bibtex(self, bibtex):
     """Add bibtex, given as a string or a file path, to the document.

     Only the first entry of the parsed bibtex is added.
     """
     first_entry = Bibtex(bibtex)[0]
     self.add_bibentry(first_entry)
Ejemplo n.º 5
0
class Document():
    """A single Xapers document.

    Wraps a Xapian document (self.doc) that carries the index terms, and
    owns a directory (self.docdir) below the database root where the
    document's files, 'bibtex' file and 'tags' file are written by sync().
    """

    def __init__(self, db, doc=None, docid=None):
        """Wrap an existing Xapian document or start a new empty one.

        db    -- Xapers database the document belongs to
        doc   -- existing Xapian document to wrap; when given, the id is
                 read from it and 'docid' is ignored
        docid -- id for a newly created document; when omitted, one is
                 requested from the database

        Raises DocumentError if 'docid' is given and the database already
        holds a document for it.
        """
        # Xapers db and its root directory
        self.db = db
        self.root = self.db.root

        if not doc:
            # no Xapian doc supplied: start a new, empty document.  It is
            # not added to the database until sync() is called.
            self.doc = xapian.Document()
            if not docid:
                self.docid = str(self.db._generate_docid())
            else:
                # caller chose the id; refuse to reuse an existing one
                if self.db[docid]:
                    raise DocumentError('Document already exists for id %s.' % docid)
                self.docid = docid
            self._add_term(self.db._find_prefix('id'), self.docid)
        else:
            # initiate for the supplied Xapian doc
            self.doc = doc
            self.docid = str(doc.get_docid())

        # directory in the Xapers root holding this document's data
        self.docdir = os.path.join(self.root, '%010d' % int(self.docid))

        # bibtex entry; loaded lazily by _load_bib()
        self.bibentry = None

    def get_docid(self):
        """Return the document id."""
        return self.docid

    def _make_docdir(self):
        """Create the document directory unless it already exists.

        Raises DocumentError if a non-directory occupies the path.
        """
        if not os.path.exists(self.docdir):
            os.makedirs(self.docdir)
            return
        if not os.path.isdir(self.docdir):
            raise DocumentError('File exists at intended docdir location: %s' % self.docdir)

    def _write_files(self):
        """Copy every file queued by add_file() to its destination."""
        # nothing was ever queued on this instance
        if not hasattr(self, '_infiles'):
            return
        for src, dst in self._infiles.iteritems():
            try:
                shutil.copyfile(src, dst)
            except shutil.Error:
                # best effort: a copy rejected by shutil is skipped
                pass

    def _write_bibfile(self):
        """Write the bibtex entry, if there is one, to the bibtex file."""
        target = self.get_bibpath()
        # reload bibtex only if we have new files
        fullpaths = self.get_fullpaths()
        if fullpaths:
            self._load_bib()
        if not self.bibentry:
            return
        # we put only the first file in the bibtex
        # FIXME: does jabref/mendeley spec allow for multiple files?
        if fullpaths and not self.bibentry.get_file():
            self.bibentry.set_file(fullpaths[0])
        self.bibentry.to_file(target)

    def _write_tagfile(self):
        """Write the document's tags, one per line, to the 'tags' file."""
        tagpath = os.path.join(self.docdir, 'tags')
        with open(tagpath, 'w') as tagfile:
            for tag in self.get_tags():
                tagfile.write(tag)
                tagfile.write('\n')

    def _rm_docdir(self):
        """Remove the document directory and its contents, if present."""
        if os.path.isdir(self.docdir):
            shutil.rmtree(self.docdir)

    def sync(self):
        """Sync document to database.

        Creates the document directory, copies queued files, writes the
        bibtex and tags files, then stores the Xapian document in the
        database.  On any failure the document directory is removed and
        the exception is re-raised.
        """
        # FIXME: add value for modification time
        # FIXME: catch db not writable errors
        try:
            self._make_docdir()
            self._write_files()
            self._write_bibfile()
            self._write_tagfile()
            self.db.replace_document(self.docid, self.doc)
        except:
            # clean up whatever went wrong; the error still propagates
            self._rm_docdir()
            raise

    def purge(self):
        """Purge document from database and root.

        A document missing from the database is not an error.  The
        document id is reset to None afterwards.
        """
        # FIXME: catch db not writable errors
        try:
            self.db.delete_document(self.docid)
        except xapian.DocNotFoundError:
            pass
        self._rm_docdir()
        self.docid = None

    ########################################
    # internal stuff

    # add an individual prefix'd term for the document
    def _add_term(self, prefix, value):
        self.doc.add_term('%s%s' % (prefix, value))

    # remove an individual prefix'd term for the document
    # (a term that is not present is silently ignored)
    def _remove_term(self, prefix, value):
        full_term = '%s%s' % (prefix, value)
        try:
            self.doc.remove_term(full_term)
        except xapian.InvalidArgumentError:
            pass

    # Parse 'text' and add a term to the document for each parsed
    # word. Each term is added both prefixed (if prefix is given) and
    # non-prefixed.
    # http://xapian.org/docs/bindings/python/
    # http://xapian.org/docs/quickstart.html
    # http://www.flax.co.uk/blog/2009/04/02/xapian-search-architecture/
    def _gen_terms(self, prefix, text):
        generator = self.db.term_gen
        generator.set_document(self.doc)
        if prefix:
            generator.index_text(text, 1, prefix)
        generator.index_text(text)

    # return a list of terms for prefix, with the prefix stripped
    # FIXME: is this the fastest way to do this?
    def _get_terms(self, prefix):
        return [entry.term[len(prefix):]
                for entry in self.doc
                if entry.term.startswith(prefix.encode("utf-8"))]

    # set the data object for the document
    def _set_data(self, text):
        self.doc.set_data(text)

    def get_data(self):
        """Return the data object of the document."""
        return self.doc.get_data()

    ########################################
    # files

    # index file for the document; returns a short text sample of it
    def _index_file(self, path):
        text = parse_file(path)
        self._gen_terms(None, text)
        # first 997 characters with newlines dropped, plus an ellipsis
        return text[:997].translate(None, '\n') + '...'

    def _add_path(self, path):
        base, _full = self.db._basename_for_path(path)
        self._add_term(self.db._find_prefix('file'), base)

    def _get_paths(self):
        file_prefix = self.db._find_prefix('file')
        return self._get_terms(file_prefix)

    def get_fullpaths(self):
        """Return the full paths of the files associated with document."""
        fullpaths = []
        for path in self._get_paths():
            # FIXME: this is a hack for old bad path specifications and should be removed
            if path.startswith(self.root):
                path = path[len(self.root) + 1:]
            path = path.lstrip('/')
            # FIXME
            _base, full = self.db._basename_for_path(path)
            fullpaths.append(full)
        return fullpaths

    def add_file(self, infile):
        """Add a file to document.

        The file is indexed right away and a text sample of it becomes the
        document data.  The file itself will not be copied into docdir
        until sync().
        """

        # FIXME: should load entire file into {name: file} to be
        # written as file>docdir/name

        # FIXME: set mime type term

        # set data to be text sample
        # FIXME: is this the right thing to put in the data?
        self._set_data(self._index_file(infile))

        # FIXME: should files be renamed to something generic (0.pdf)?
        outfile = os.path.join(self.docdir, os.path.basename(infile))

        base, _full = self.db._basename_for_path(outfile)
        self._add_path(base)

        # add it to the cache to be written at sync()
        if not hasattr(self, '_infiles'):
            self._infiles = {}
        self._infiles[infile] = outfile

    ########################################

    # SOURCES
    def _purge_sources_prefix(self, source):
        # purge all terms for a given source prefix
        source_prefix = self.db._make_source_prefix(source)
        for oid in self._get_terms(source_prefix):
            self._remove_term(source_prefix, oid)
        self._remove_term(self.db._find_prefix('source'), source)

    def add_sid(self, sid):
        """Add source sid to document.

        'sid' has the form '<source>:<id>'; the source part is lowercased.
        Any existing terms for the same source are replaced.
        """
        source, oid = sid.split(':', 1)
        source = source.lower()
        # remove any existing terms for this source
        self._purge_sources_prefix(source)
        # add a term for the source
        self._add_term(self.db._find_prefix('source'), source)
        # add a term for the sid, with source as prefix
        self._add_term(self.db._make_source_prefix(source), oid)

    def get_sids(self):
        """Return a list of sids for document."""
        sids = []
        for source in self._get_terms(self.db._find_prefix('source')):
            oid_prefix = self.db._make_source_prefix(source)
            sids.extend(['%s:%s' % (source, oid)
                         for oid in self._get_terms(oid_prefix)])
        return sids

    # BIBTEX KEYS
    def get_keys(self):
        """Return a list of bibtex citation keys associated with document."""
        return self._get_terms(self.db._find_prefix('key'))

    # TAGS
    def add_tags(self, tags):
        """Add tags from list to document."""
        tag_prefix = self.db._find_prefix('tag')
        for tag in tags:
            self._add_term(tag_prefix, tag)

    def get_tags(self):
        """Return a list of tags associated with document."""
        return self._get_terms(self.db._find_prefix('tag'))

    def remove_tags(self, tags):
        """Remove tags from a document."""
        tag_prefix = self.db._find_prefix('tag')
        for tag in tags:
            self._remove_term(tag_prefix, tag)

    # TITLE
    def _set_title(self, title):
        title_prefix = self.db._find_prefix('title')
        for old in self._get_terms(title_prefix):
            self._remove_term(title_prefix, old)
        # FIXME: what's the clean way to get these prefixes?
        for old in self._get_terms('ZS'):
            self._remove_term('ZS', old)
        self._gen_terms(title_prefix, title)

    # AUTHOR
    def _set_authors(self, authors):
        author_prefix = self.db._find_prefix('author')
        for old in self._get_terms(author_prefix):
            self._remove_term(author_prefix, old)
        # FIXME: what's the clean way to get these prefixes?
        for old in self._get_terms('ZA'):
            self._remove_term('ZA', old)
        self._gen_terms(author_prefix, authors)

    # YEAR
    def _set_year(self, year):
        # FIXME: this should be a value
        # (currently a no-op: the year is not indexed)
        pass

    ########################################
    # bibtex

    def get_bibpath(self):
        """Return path to document bibtex file."""
        return os.path.join(self.docdir, 'bibtex')

    def _set_bibkey(self, key):
        key_prefix = self.db._find_prefix('key')
        for old in self._get_terms(key_prefix):
            self._remove_term(key_prefix, old)
        self._add_term(key_prefix, key)

    def _index_bibentry(self, bibentry):
        """Index title, year, authors, sources and key of a bibtex entry."""
        authors = bibentry.get_authors()
        fields = bibentry.get_fields()
        if 'title' in fields:
            self._set_title(fields['title'])
        if 'year' in fields:
            self._set_year(fields['year'])
        if authors:
            # authors should be a list, so we make a single text string
            # FIXME: better way to do this?
            self._set_authors(' '.join(authors))

        # add any sources in the bibtex
        for sid in scan_bibentry_for_sources(bibentry):
            self.add_sid(sid)

        # FIXME: index 'keywords' field as regular terms

        self._set_bibkey(bibentry.key)

    def add_bibentry(self, bibentry):
        """Attach a bibentry object to the document and index it."""
        self.bibentry = bibentry
        self._index_bibentry(self.bibentry)

    def add_bibtex(self, bibtex):
        """Add bibtex to document, as string or file path.

        Only the first entry of the parsed bibtex is used.
        """
        first_entry = Bibtex(bibtex)[0]
        self.add_bibentry(first_entry)

    def _load_bib(self):
        """Load the bibtex entry from the bibtex file, if not loaded yet."""
        if not self.bibentry:
            bibpath = self.get_bibpath()
            if os.path.exists(bibpath):
                self.bibentry = Bibtex(bibpath)[0]

    def get_bibtex(self):
        """Get the bib for document as a bibtex string, or None."""
        self._load_bib()
        if not self.bibentry:
            return None
        return self.bibentry.as_string()

    def get_bibdata(self):
        """Return the bibtex fields plus an 'authors' entry, or None."""
        self._load_bib()
        if not self.bibentry:
            return None
        data = self.bibentry.get_fields()
        data['authors'] = self.bibentry.get_authors()
        return data

    def update_from_bibtex(self):
        """Update document metadata from document bibtex."""
        self._load_bib()
        self._index_bibentry(self.bibentry)

    ########################################

    def get_title(self):
        """Get the title from document bibtex, or None if unavailable."""
        self._load_bib()
        if self.bibentry:
            fields = self.bibentry.get_fields()
            if 'title' in fields:
                return fields['title']
        return None

    def get_urls(self):
        """Get all URLs associated with document."""
        # urls associated with known sources
        urls = [get_source(sid).gen_url() for sid in self.get_sids()]
        # urls from bibtex
        self._load_bib()
        if self.bibentry:
            fields = self.bibentry.get_fields()
            for field_name in ('url', 'adsurl'):
                if field_name in fields:
                    urls.append(fields[field_name])
        return urls
Ejemplo n.º 6
0
 def _load_bib(self):
     """Populate self.bibentry from the document's bibtex file.

     A no-op when an entry is already loaded or the file is missing.
     """
     if not self.bibentry:
         bibfile = self.get_bibpath()
         if os.path.exists(bibfile):
             # the first entry of the parsed file becomes the entry
             entries = Bibtex(bibfile)
             self.bibentry = entries[0]