Esempio n. 1
0
def parse_collection(collection):
    """Build a Collection from a JSON-encoded list of work references.

    ``collection`` is a JSON string that decodes to a list of strings, each
    of the form ``"<corpus>,<docixs>"``. ``<docixs>`` is either a single
    document index (``"3"``) or a bracketed list separated by ``", "``
    (``"[3, 4]"``).

    Returns a ``Collection`` whose ``works`` are obtained, in order, from
    ``Corpus(<corpus>).work_by_docix(<docix>)``.

    Raises ``ValueError`` if an entry has no comma or an index is not an
    integer; JSON decoding errors propagate unchanged.
    """
    works = []
    for doc in json.loads(collection):
        # Split on the first comma only: a bracketed index list contains
        # commas itself, and an unbounded split would break the unpacking.
        corpus, nn = doc.split(",", 1)
        c = Corpus(corpus)
        works.extend(
            c.work_by_docix(int(n)) for n in nn.strip("[]").split(", ")
        )
    return Collection(works=works)
Esempio n. 2
0
def optimize(corpus):
    """Optimize the indexes of a corpus. """
    # `corpus` is the name handed to Corpus(). The outcome is reported with
    # click.echo; nothing is returned. A corpus that is not searchable is
    # reported as a failure without attempting the optimization.
    with click_spinner.spinner():
        target = Corpus(corpus)
        if not target.searchable:
            click.echo("[-] failed")
        else:
            target.optimize()
            click.echo(f"[+] optimized '{corpus}'")
Esempio n. 3
0
def delete(corpus, docix):
    """Delete a document in a corpus by index number. """
    # Normalize the index once. The deletion call and the verification below
    # must use the same type: checking the raw (possibly string) value against
    # the corpus' document indexes (presumably ints, as delete_by_ix takes an
    # int) would never match and would report success unconditionally.
    docix = int(docix)

    with click_spinner.spinner():
        c = Corpus(corpus)
        c.delete_by_ix(docix)

    # The deletion failed if the index is still listed afterwards.
    if docix in c.all_doc_ixs():
        click.echo("[-] failed")
    else:
        click.echo(f"[+] deleted document {docix} of '{corpus}'")
Esempio n. 4
0
def clear(corpus):
    """Clear the indexes of a corpus without deleting them. """
    # The spinner covers only the clearing itself; the check and the report
    # happen after it has stopped.
    with click_spinner.spinner():
        target = Corpus(corpus)
        target.clear()

    # Success is judged by the corpus reporting zero documents afterwards.
    if target.doc_count_all != 0:
        click.echo("[-] failed")
    else:
        click.echo(f"[+] cleared '{corpus}'")
Esempio n. 5
0
def destroy(corpus):
    """Destroy the indexes and manifest of a corpus. """
    # Nothing happens (and nothing is printed) unless the user confirms; the
    # prompt defaults to "no".
    if not click.confirm("Are you sure?", default=False):
        return

    with click_spinner.spinner():
        target = Corpus(corpus)
        target.destroy()

    # A corpus that is still searchable after destroy() was not destroyed.
    if not target.searchable:
        click.echo(f"[+] destroyed '{corpus}'")
    else:
        click.echo("[-] failed")
Esempio n. 6
0
def update(corpus, docix, path):
    """Reindex a document by index number. """
    # `docix` is converted to int before any work starts, so a non-numeric
    # value raises ValueError before the corpus is touched.
    ix = int(docix)

    with click_spinner.spinner():
        target = Corpus(corpus)
        target.update(docix=ix, path=Path(path))

    # The update counts as successful when the work at that index is
    # searchable afterwards.
    reindexed = target.work_by_docix(ix)
    if not reindexed.searchable:
        click.echo("[-] failed")
    else:
        click.echo(f"[+] updated document {ix} in '{corpus}'")
Esempio n. 7
0
def update_by(corpus, **kwargs):
    """Reindex a document by author name and/or title. """
    # `kwargs` must contain a "path" entry; it is converted to a Path and the
    # whole mapping is forwarded to Corpus.update_by, whose return value is
    # treated as the index of the updated document.
    with click_spinner.spinner():
        kwargs["path"] = Path(kwargs["path"])
        target = Corpus(corpus)
        docix = target.update_by(**kwargs)

    # Success means the returned index shows up among the corpus' documents.
    indexed = [doc["docix"] for doc in target.iter_docs()]
    if docix not in indexed:
        click.echo("[-] failed")
    else:
        click.echo(f"[+] updated document {docix} in '{corpus}'")
Esempio n. 8
0
def delete_by(corpus, **kwargs):
    """Delete documents in a corpus by author name and/or title. """
    # The number of deleted documents is derived from the corpus' document
    # count before and after the call; a difference of zero is reported as a
    # failure.
    with click_spinner.spinner():
        c = Corpus(corpus)
        pre_ndocs = c.doc_count_all
        # Forward the selection criteria. Without them the call never
        # receives the author/title the caller asked to delete by.
        c.delete_by(**kwargs)
        ndocs = pre_ndocs - c.doc_count_all

    if ndocs == 0:
        click.echo("[-] failed")
    else:
        click.echo(f"[+] deleted {ndocs} documents from '{corpus}'")
Esempio n. 9
0
def download(corpus, branch):
    """Download a remote corpus repository. """
    # Without a corpus name, list every known remote corpus with its origin
    # instead of downloading anything.
    if not corpus:
        for name, meta in REMOTE_CORPORA.items():
            click.echo(f"[-] '{name}' [{meta.repo['origin']}]")
        return

    if corpus not in REMOTE_CORPORA:
        click.echo(f"[-] no remote location for '{corpus}'")
        return

    c = Corpus(corpus)
    try:
        c.download(branch)
    except Exception as e:
        # Best-effort CLI boundary: report the error instead of a traceback.
        # The exception is formatted into the message because click.echo's
        # second positional parameter is the output *file*, not extra text.
        click.echo(f"[-] failed: {e}")
Esempio n. 10
0
def index(corpus):
    """List indexed works in a corpus. """
    # iter_docs() is consumed as (sort key, document mapping) pairs; only the
    # mapping is used for display, the first element only for ordering.
    entries = sorted(Corpus(corpus).iter_docs(), key=lambda pair: pair[0])

    if not entries:
        click.echo(f"[-] nothing indexed for '{corpus}'")
        return

    for _, doc in entries:
        # Fall back to the bare filename when author/title are not both set.
        if "author" in doc and "title" in doc:
            line = f"[{doc['docix']}] {doc['author']}, {doc['title']} [{doc['filename']}]"
        else:
            line = f"[{doc['docix']}] {doc['filename']}"
        click.echo(line)
Esempio n. 11
0
def search(q, collection: Collection):
    """Run query ``q`` over the works referenced by ``collection``.

    Each item of ``collection`` is a JSON string decoding to a mapping with a
    ``"corpus"`` name and a ``"docix"`` sequence, of which only the first
    element is used. Returns the search result serialized via ``to_json()``.
    """
    # Decode every reference first, then resolve them to works.
    references = [json.loads(item) for item in collection]

    selected = []
    for ref in references:
        corpus = Corpus(ref["corpus"])
        selected.append(corpus.work_by_docix(ref["docix"][0]))

    outcome = Searcher(Collection(works=selected)).search(q)
    return outcome.to_json()
Esempio n. 12
0
def download_by(corpus, author, title):
    """Download documents by author and title from a remote corpus. """
    # With neither author nor title given, the remote manifest is listed
    # instead of downloading anything. Otherwise the number of downloaded
    # documents is derived from the growth of the local manifest.
    if corpus not in REMOTE_CORPORA:
        click.echo(f"[-] no remote location for '{corpus}'")
        return

    try:
        c = Corpus(corpus)
        if not (author or title):
            for docix, meta in c.remote_manifest().items():
                click.echo(
                    f"[{docix}] {meta['author']}, {meta['title']} [{meta['filename']}]"
                )
        else:
            n = len(c.manifest)
            with click_spinner.spinner():
                c.download_by(author, title)
            click.echo(f"[+] downloaded {len(c.manifest) - n} documents")
    except Exception as e:
        # Best-effort CLI boundary: report the error instead of a traceback.
        # The exception is formatted into the message because click.echo's
        # second positional parameter is the output *file*, not extra text.
        click.echo(f"[-] failed: {e}")
Esempio n. 13
0
def download_corpora():
    """Render the corpus download page, downloading a corpus if one is named.

    Reads an optional ``name`` field from the submitted form. When present,
    that corpus is downloaded and the call's result is passed to the template
    as ``success``; falsy names and results are rendered as empty strings.
    The template also receives a ``Corpus`` object for every manifest entry
    whose repo location is ``"remote"``.
    """
    name = request.form.get("name", None)
    success = Corpus(name).download() if name else None

    remote_corpora = [
        Corpus(corpus)
        for corpus, meta in manifest.items()
        if meta.repo["location"] == "remote"
    ]

    return render_template(
        "download_corpora.html",
        corpora=remote_corpora,
        name=name or "",
        success=success or "",
    )
Esempio n. 14
0
def download_by_docix(corpus, docix):
    """Download a document by number from a remote corpus. """
    # Without a document number, the remote manifest is listed instead of
    # downloading anything. After a download, the document's entry in the
    # local manifest is echoed back.
    if corpus not in REMOTE_CORPORA:
        click.echo(f"[-] no remote location for '{corpus}'")
        return

    c = Corpus(corpus)
    try:
        if not docix:
            # Separate loop variable so the `docix` parameter is not shadowed.
            for ix, meta in c.remote_manifest().items():
                click.echo(
                    f"[{ix}] {meta['author']}, {meta['title']} [{meta['filename']}]"
                )
        else:
            with click_spinner.spinner():
                c.download_by_docix(int(docix))
            # The manifest is looked up with the value as given (not the int
            # conversion), exactly as before.
            meta = c.manifest[docix]
            click.echo(
                f"[{docix}] {meta['author']}, {meta['title']} [{meta['filename']}]"
            )
    except Exception as e:
        # Best-effort CLI boundary: report the error instead of a traceback.
        # The exception is formatted into the message because click.echo's
        # second positional parameter is the output *file*, not extra text.
        click.echo(f"[-] failed: {e}")
Esempio n. 15
0
def history():
    """Render the index page for a previously stored search.

    The search is selected by the ``id`` request argument. Its stored
    collection (a JSON list of ``"<corpus>,<docix>"`` strings) is resolved
    back into works, and its saved result HTML is passed to the template
    together with the full list of stored searches.
    """
    # All stored searches, for the history listing.
    db.connect()
    past_searches = [entry for entry in Search.select()]
    db.close()

    # The stored search being revisited.
    search_id = request.args.get("id")
    db.connect()
    s = Search.get_by_id(search_id)
    db.close()

    works = []
    for work_id in json.loads(s.collection):
        corpus, docix = work_id.split(",")
        works.append(Corpus(corpus).work_by_docix(int(docix)))

    stored_results = SearchResult.select().where(SearchResult.search == s)
    results = [row.html for row in stored_results]

    context = {
        "version": __version__,
        "collection": s.collection,
        "works": works,
        "corpora": _corpora,
        "query": s.query,
        "history": past_searches,
        "results": results,
        "count": len(s.results),
        "id": search_id,
    }
    return render_template("index.html", **context)
Esempio n. 16
0
def index(corpus, filename):
    """Index ``filename`` into ``corpus`` and describe the indexed work.

    The file is indexed with ``destructive=True`` and ``optimize=True``.
    Returns a dict with the corpus name, the value returned by the indexer
    (``docix``), the filename, and the work's author and title.
    """
    work = Work(corpus=Corpus(corpus))
    docix = work.indexer.from_file(
        Path(filename), destructive=True, optimize=True
    )
    return dict(
        corpus=corpus,
        docix=docix,
        filename=filename,
        author=work.author,
        title=work.title,
    )
Esempio n. 17
0
def lexicon(corpus, fieldname):
    """List the contents of an index by field name. """
    # Terms are collected across every reader of the corpus and deduplicated.
    target = Corpus(corpus)

    terms = set()
    for reader in target.readers:
        with CylleneusSearcher(reader) as searcher:
            terms.update(searcher.lexicon(fieldname))

    # An empty lexicon is reported as a failure.
    if not terms:
        click.echo("[-] failed")
        return

    click.echo(
        f"[+] lexicon '{fieldname}' of '{corpus}': {len(terms)} items"
    )
    # Terms are decoded from UTF-8 for display, one per line, sorted.
    listing = "\n".join(term.decode("utf8") for term in sorted(terms))
    click.echo_via_pager(listing)
Esempio n. 18
0
def add(corpus, path, author, title):
    """Index a specific file. """
    # The number of added documents is derived from the corpus' document
    # count before and after indexing; no growth is reported as a failure.
    with click_spinner.spinner():
        target = Corpus(corpus)
        before = target.doc_count_all
        Work(target, author, title).indexer.from_file(Path(path))

    after = target.doc_count_all
    if after <= before:
        click.echo("[-] failed")
        return

    added = after - before
    plural = "s" if added > 1 else ""
    click.echo(f"[+] added {added} document{plural} to '{corpus}'")
Esempio n. 19
0
    def test_query_parsing(self):
        """Check that every sample query parses to a truthy query object.

        Each case pairs a corpus name with a query string; the query is
        parsed against that corpus' schema with "form" as the parser's first
        argument.
        """

        from cylleneus.corpus import Corpus
        from cylleneus.engine.qparser.default import CylleneusQueryParser

        # (corpus name, query string)
        cases = (
            ("perseus", "{=Anatomy}"),
            ("dcs", "[=the Sustainer]"),
            ("perseus", "(<gelidus> OR <gelida>) AND <pruina>"),
            ("latin_library", "'sed'"),
            ("proiel", ":ACC.PL."),
            ("proiel", "<habeo>"),
            ("agldt", "<animus>|ABL.SG."),
            ("camena", "[en?war]"),
            ("camena", "[it?guerra]"),
            ("digiliblt", "{611}"),
            ("perseus", '"cum clamore"'),
            ("perseus", '"cum <clamor>"'),
            ("perseus", '"cum <clamor>|ABL."'),
            ("perseus", '"cum magno" <calor>'),
            ("lasla", '":VB. milites"'),
            ("lasla", '":VB. <miles>"'),
            ("proiel", "</::bellum>"),
            ("latin_library", "[!::en?cowardice]"),
            ("perseus_xml", "[en?courage]|ABL.PL."),
            ("perseus", "[@::n#04478900]"),
            ("latin_library", "opt*"),
            ("atlas", "<τεύχω>"),
            ("diorisis", "<Σωκράτης>"),
            ("lasla", "/ablative absolute/"),
            ("lasla", "/interrogative/"),
            ("lasla", "/QVOMINVS/"),
            ("proiel", "/predicate/"),
            ("lasla", "/subordinating conjunction/"),
            ("proiel", "/adverbial/"),
            ("proiel", "/adnominal argument/"),
        )

        for corpus_name, query_text in cases:
            schema = Corpus(corpus_name).schema
            parser = CylleneusQueryParser("form", schema)
            parsed = parser.parse(query_text)
            assert parsed
Esempio n. 20
0
def create(corpus, destructive, optimize):
    """Create all corpus indexes from source files. """
    # Every file in the corpus' text directory that matches the corpus' glob
    # pattern is indexed as its own Work; `destructive` and `optimize` are
    # forwarded unchanged to the indexer.
    with click_spinner.spinner():
        target = Corpus(corpus)
        for source in target.text_dir.glob(target.glob):
            Work(corpus=target).indexer.from_file(
                source, destructive=destructive, optimize=optimize
            )

    # Success is judged by the corpus holding at least one document.
    total = target.doc_count_all
    if total <= 0:
        click.echo("[-] failed")
        return

    plural = "s" if total > 1 else ""
    click.echo(f"[+] created '{corpus}' with {total} document{plural}")
Esempio n. 21
0
def import_text(corpus, author, title, filename, content):
    """Index ``content`` as a new work in ``corpus``; report success.

    Returns ``True`` when the corpus' document count grew as a result of the
    import, ``False`` when it did not or when any exception was raised along
    the way (errors are deliberately swallowed and reported as failure).
    """
    metadata = dict(
        corpus=corpus, author=author, title=title, filename=filename
    )

    try:
        target = Corpus(corpus)
        work = Work(target, author=author, title=title)
        before = target.doc_count_all
        work.indexer.from_string(content=content, **metadata)
        return target.doc_count_all > before
    except Exception:
        return False
Esempio n. 22
0
def verify(corpus, verbose, dry_run):
    """Verify the integrity of corpus indexes and manifest. """
    # Flow:
    #   1. ask for confirmation (skipped entirely if the index directory does
    #      not exist);
    #   2. check every manifest entry with Corpus.verify_by_docix, forwarding
    #      `dry_run`;
    #   3. with `verbose`, page a per-document report (lines are prefixed
    #      with "*" on a dry run);
    #   4. print a one-line summary;
    #   5. offer to re-index documents whose index files are missing (no
    #      re-indexing is performed on a dry run).

    c = Corpus(corpus)
    manifest = c.manifest
    if not (
        c.index_dir.exists()
        and click.confirm(
            f"{len(manifest)} documents manifested for corpus '{c.name}'. "
            + ("This might take a while! " if len(manifest) > 30 else "")
            + "Proceed?",
            default=True,
        )
    ):
        return

    verified = []
    passes = fixes = adds = orphans = 0
    missing = {}

    with click.progressbar(
        manifest,
        length=len(manifest),
        show_percent=True,
        label=f"Verifying '{c.name}'",
    ) as bar:
        for item in bar:
            (
                status,
                (docix, author, title, filename, info),
            ) = c.verify_by_docix(item, dry_run=dry_run)
            msg = f"[{docix}] {author}, {title} ({filename})"
            # Status codes as handled here: 0 passed, 1 manifest fixed,
            # 2 added to manifest, 3 orphaned index files deleted,
            # 4 index files missing.
            if status == 0:
                msg += ", passed!"
                passes += 1
            elif status == 1:
                msg += ", fixed in manifest"
                fixes += 1
            elif status == 2:
                msg += ", added to manifest"
                adds += 1
            elif status == 3:
                msg += ", deleted orphaned index files"
                orphans += 1
            elif status == 4:
                # For missing index files the description is taken from the
                # manifest entry rather than from the verification result.
                entry = manifest[item]
                msg = (
                    f"[{docix}] {entry['author']}, {entry['title']} ("
                    f"{entry['filename']})"
                )
                msg += ", missing index files!"
                missing[item] = entry
            if info is not None and cylleneus.settings.DEBUG_LEVEL:
                msg += f" (= {info})"
            verified.append((docix, msg))

    if verbose and verified:
        prefix = "*" if dry_run else ""
        click.echo_via_pager(
            "\n".join(
                prefix + entry[1]
                for entry in sorted(verified, key=lambda entry: entry[0])
            )
        )

    # Every optional part carries its own leading ", " separator; none may
    # add a trailing one, otherwise the summary shows a doubled ", , ".
    click.echo(
        f"[-] '{corpus}': {len(manifest)} checked, {passes} passed"
        + (f", {fixes} fixed in manifest" if fixes else "")
        + (f", {adds} added to manifest" if adds else "")
        + (f", {orphans} orphaned files deleted" if orphans else "")
        + (
            " -- changes have NOT been committed!"
            if dry_run and passes < len(manifest)
            else ""
        )
    )

    if missing and click.confirm(
        f"Try to re-index {len(missing)} missing documents?", default=True,
    ):
        for docix, meta in missing.items():
            # Entries without a filename cannot be re-indexed; skip silently.
            if not meta["filename"]:
                continue
            path = c.text_dir / Path(meta["filename"])
            with click_spinner.spinner():
                updated_docix = (
                    c.update(docix, path) if not dry_run else None
                )
            if updated_docix is not None:
                click.echo(
                    f"[{updated_docix}] {meta['author']}, {meta['title']} ({meta['filename']}), "
                    f"index created!"
                )
            elif dry_run:
                click.echo(
                    f"*[-] {meta['author']}, {meta['title']} "
                    f"({meta['filename']}) -- document NOT re-indexed!"
                )
            else:
                click.echo(
                    f"[-] {meta['author']}, {meta['title']} ({meta['filename']}), failed"
                )
Esempio n. 23
0
def collection_select():
    """Render the collection picker with a ``Corpus`` per known corpus name."""
    available = [Corpus(name) for name in _corpora]
    return render_template("collection_select.html", corpora=available)
Esempio n. 24
0
def search():
    """Run a search over the submitted collection and render the index page.

    Reads ``collection`` (a JSON list of ``"<corpus>,<docix>"`` strings) and
    ``query`` from the form data on POST, or from the query string otherwise.
    When the search yields results, the search is looked up in (or added to)
    the ``Search`` table and each highlighted result is stored as a
    ``SearchResult`` linked to it. The page is then rendered with the results
    and the full search history.
    """
    form = request.form if request.method == "POST" else request.args

    collection = form.get("collection")
    query = form.get("query")

    works = []
    for work_id in json.loads(collection):
        corpus, n = work_id.split(",")
        works.append(Corpus(corpus).work_by_docix(int(n)))

    results = search_request(works, query)

    if results:
        count_matches, count_documents, _ = results.count
        db.connect()
        try:
            try:
                # Must be the model instance itself: wrapping it in a tuple
                # (stray trailing comma) hands a tuple to the foreign key
                # below.
                s = Search.get(query=query, collection=collection)
            except Search.DoesNotExist:
                names = [f"{work.author}, {work.title}" for work in works]
                s = Search.create(
                    query=query,
                    collection=collection,
                    prettified=f"{'; '.join(names)}",
                    count_matches=count_matches,
                    count_documents=count_documents,
                    minscore=results.minscore,
                    top=results.top,
                    start_dt=results.start_dt,
                    end_dt=results.end_dt,
                    maxchars=results.maxchars,
                    surround=results.surround,
                )
                s.save()

            # Runs only once `s` is bound. Placed in a `finally`, this loop
            # would also run after an unexpected error and mask it with a
            # NameError.
            for href in results.highlights:
                result, created = SearchResult.get_or_create(
                    search=s, html=next(as_html([href]))
                )
                if created:
                    result.save()
        finally:
            # Always release the connection, even when storing fails.
            db.close()

    db.connect()
    try:
        past_searches = list(Search.select())
    finally:
        db.close()

    response = {
        "version": __version__,
        "collection": collection,
        "works": works,
        "corpora": _corpora,
        "query": query,
        "history": past_searches,
        "results": as_html(results.highlights) if results else [],
        "count": results.count if results else (0, 0),
        # No stored searches yet (e.g. a first search without results):
        # there is no id to report.
        "id": past_searches[-1].id if past_searches else None,
    }
    return render_template("index.html", **response)
Esempio n. 25
0
def manifest(corpus):
    """Return the ``manifest`` attribute of the corpus named ``corpus``."""
    return Corpus(corpus).manifest