Example #1
0
def _get_text_version_file(format, title, lang, versionTitle):
    """
    Build the export content of one text in the requested format.

    :param format: one of "json", "csv", "txt" or "plain.txt"
    :param title: title passed to `library.get_index` and used in the
        version query
    :param lang: "en" or "he"
    :param versionTitle: title of the version to export, or the literal
        "merged" to export the merged text for `lang`
    :return: whatever the matching `sefaria.export` helper returns
        (`export_*_csv`, `make_json` or `make_text`)
    :raises AssertionError: if `lang` or `format` is not supported, if the
        requested version can not be found, or if it is copyrighted
    """
    from sefaria.export import (
        text_is_copyright,
        make_json,
        make_text,
        prepare_merged_text_for_export,
        prepare_text_for_export,
        export_merged_csv,
        export_version_csv,
    )

    # These checks are raised explicitly instead of using `assert`:
    # assert statements are removed when Python runs with -O, which would
    # silently disable the copyright guard.  AssertionError is kept so that
    # existing callers see the same exception type.
    if lang not in ("en", "he"):
        raise AssertionError("Unsupported language: {}".format(lang))
    if format not in ("json", "csv", "txt", "plain.txt"):
        raise AssertionError("Unsupported format: {}".format(format))

    index = library.get_index(title)

    if versionTitle == "merged":
        if format == "csv":
            return export_merged_csv(index, lang)

        merged_text = prepare_merged_text_for_export(title, lang=lang)
        if format == "json":
            return make_json(merged_text)
        if format == "txt":
            return make_text(merged_text)
        # Only "plain.txt" is left.
        return make_text(merged_text, strip_html=True)

    version_query = {
        "title": title,
        "language": lang,
        "versionTitle": versionTitle,
    }

    if format == "csv":
        # The CSV exporter works on a loaded Version object.
        version = Version().load(version_query)
        if not version:
            raise AssertionError(
                "Can not find version of {} in {}: {}".format(
                    title, lang, versionTitle))
        if version.is_copyrighted():
            raise AssertionError(
                "Cowardly refusing to export copyrighted text.")
        return export_version_csv(index, [version])

    # The remaining formats work on the raw document from the texts
    # collection.
    version_object = db.texts.find_one(version_query)
    if not version_object:
        raise AssertionError(
            "Can not find version of {} in {}: {}".format(
                title, lang, versionTitle))
    if text_is_copyright(version_object):
        raise AssertionError("Cowardly refusing to export copyrighted text.")

    export_data = prepare_text_for_export(version_object)
    if format == "json":
        return make_json(export_data)
    if format == "txt":
        return make_text(export_data)
    # Only "plain.txt" is left.
    return make_text(export_data, strip_html=True)
Example #2
0
def text_download_api(request, format, title, lang, versionTitle):
    """
    Serve one text as a downloadable attachment.

    :param request: the incoming request (not read by this function)
    :param format: one of "json", "csv" or "txt"
    :param title: title passed to `library.get_index` and used in the
        version query
    :param lang: "en" or "he"
    :param versionTitle: title of the version to export, or the literal
        "merged" to export the merged text for `lang`
    :return: an `HttpResponse` carrying the exported content, with the
        content type matching `format` and `Content-Disposition: attachment`
    :raises AssertionError: if `lang` or `format` is not supported, if the
        requested version can not be found, or if it is copyrighted
    """
    from sefaria.export import (
        text_is_copyright,
        make_json,
        make_text,
        prepare_merged_text_for_export,
        prepare_text_for_export,
        export_merged_csv,
        export_version_csv,
    )

    content_types = {
        "json": "application/json; charset=utf-8",
        "csv": "text/csv; charset=utf-8",
        "txt": "text/plain; charset=utf-8",
    }

    # These checks are raised explicitly instead of using `assert`:
    # assert statements are removed when Python runs with -O, which would
    # silently disable the copyright guard.  AssertionError is kept so that
    # existing callers see the same exception type.
    if lang not in ("en", "he"):
        raise AssertionError("Unsupported language: {}".format(lang))
    if format not in ("json", "csv", "txt"):
        raise AssertionError("Unsupported format: {}".format(format))

    merged = versionTitle == "merged"
    index = library.get_index(title)
    version_query = {
        "title": title,
        "language": lang,
        "versionTitle": versionTitle,
    }

    if merged:
        if format == "csv":
            content = export_merged_csv(index, lang)
        else:
            merged_text = prepare_merged_text_for_export(title, lang=lang)
            if format == "json":
                content = make_json(merged_text)
            else:
                content = make_text(merged_text)

    elif format == "csv":
        # The CSV exporter works on a loaded Version object.
        version = Version().load(version_query)
        if not version:
            raise AssertionError(
                "Can not find version of {} in {}: {}".format(
                    title, lang, versionTitle))
        if version.is_copyrighted():
            raise AssertionError(
                "Cowardly refusing to export copyrighted text.")
        content = export_version_csv(index, [version])

    else:
        # json and txt share the same lookup and copyright check on the raw
        # document from the texts collection.
        version_object = db.texts.find_one(version_query)
        if not version_object:
            raise AssertionError(
                "Can not find version of {} in {}: {}".format(
                    title, lang, versionTitle))
        if text_is_copyright(version_object):
            raise AssertionError(
                "Cowardly refusing to export copyrighted text.")

        export_data = prepare_text_for_export(version_object)
        if format == "json":
            content = make_json(export_data)
        else:
            content = make_text(export_data)

    response = HttpResponse(content, content_type=content_types[format])
    response["Content-Disposition"] = "attachment"
    return response
Example #3
0
def word_frequency_for_text(title, lang="en"):
    """
    Returns an ordered list of word/count tuples for occurrences of words inside the
    text `title`.

    The merged text for `lang` is rendered with `make_text`, stripped of tags
    and lowercased.  Every character other than `a`-`z` is treated as a word
    separator, so digits, punctuation and non-ASCII letters never appear in
    the result.

    NOTE: because only `a`-`z` survives, a text without ASCII letters (e.g.
    Hebrew) yields an empty list.

    :param title: title of the text to analyze
    :param lang: language passed to `prepare_merged_text_for_export`
    :return: list of `(word, count)` tuples, highest count first; words with
        equal counts keep the order of their first occurrence
    """
    from collections import Counter
    from sefaria.export import make_text, prepare_merged_text_for_export
    from sefaria.utils.util import strip_tags

    text = make_text(prepare_merged_text_for_export(title, lang=lang))
    text = strip_tags(text).lower()

    # Replace everything that is not a lowercase ASCII letter with a space.
    # This already removes all punctuation, so no separate pass is needed.
    text = re.sub(r'[^a-z ]', " ", text)

    # split() without arguments collapses runs of spaces and ignores leading
    # and trailing ones, so the empty string is never counted as a word.
    return Counter(text.split()).most_common()
Example #4
0
def text_download_api(request, format, title, lang, versionTitle):
    """
    Serve one text as a downloadable attachment.

    :param request: the incoming request (not read by this function)
    :param format: one of "json", "csv" or "txt"
    :param title: title passed to `library.get_index` and used in the version query
    :param lang: "en" or "he"
    :param versionTitle: title of the version to export, or the literal "merged"
        to export the merged text for `lang`
    :return: an `HttpResponse` carrying the exported content, with the content type
        matching `format` and `Content-Disposition: attachment`
    :raises AssertionError: if `lang` or `format` is not supported, if the requested
        version can not be found, or if it is copyrighted
    """
    from sefaria.export import text_is_copyright, make_json, make_text, prepare_merged_text_for_export, prepare_text_for_export, export_merged_csv, export_version_csv

    content_types = {
        "json": "application/json; charset=utf-8",
        "csv": "text/csv; charset=utf-8",
        "txt": "text/plain; charset=utf-8"
    }

    # Raised explicitly instead of using `assert`: assert statements are removed when
    # Python runs with -O, which would silently disable the copyright guard.
    # AssertionError is kept so that existing callers see the same exception type.
    if lang not in ("en", "he"):
        raise AssertionError("Unsupported language: {}".format(lang))
    if format not in ("json", "csv", "txt"):
        raise AssertionError("Unsupported format: {}".format(format))

    merged = versionTitle == "merged"
    index = library.get_index(title)
    version_query = {"title": title, "language": lang, "versionTitle": versionTitle}

    if merged:
        if format == "csv":
            content = export_merged_csv(index, lang)
        else:
            merged_text = prepare_merged_text_for_export(title, lang=lang)
            content = make_json(merged_text) if format == "json" else make_text(merged_text)

    elif format == "csv":
        # The CSV exporter works on a loaded Version object.
        version = Version().load(version_query)
        if not version:
            raise AssertionError("Can not find version of {} in {}: {}".format(title, lang, versionTitle))
        if version.is_copyrighted():
            raise AssertionError("Cowardly refusing to export copyrighted text.")
        content = export_version_csv(index, [version])

    else:
        # json and txt share the same lookup and copyright check on the raw document.
        version_object = db.texts.find_one(version_query)
        if not version_object:
            raise AssertionError("Can not find version of {} in {}: {}".format(title, lang, versionTitle))
        if text_is_copyright(version_object):
            raise AssertionError("Cowardly refusing to export copyrighted text.")

        export_data = prepare_text_for_export(version_object)
        content = make_json(export_data) if format == "json" else make_text(export_data)

    response = HttpResponse(content, content_type=content_types[format])
    response["Content-Disposition"] = "attachment"
    return response
Example #5
0
def _get_text_version_file(format, title, lang, versionTitle):
    """
    Build the export content of one text in the requested format.

    :param format: one of "json", "csv", "txt" or "plain.txt"
    :param title: title passed to `library.get_index` and used in the version query
    :param lang: "en" or "he"
    :param versionTitle: title of the version to export, or the literal "merged"
        to export the merged text for `lang`
    :return: whatever the matching `sefaria.export` helper returns
        (`export_*_csv`, `make_json` or `make_text`)
    :raises AssertionError: if `lang` or `format` is not supported, if the requested
        version can not be found, or if it is copyrighted
    """
    from sefaria.export import text_is_copyright, make_json, make_text, prepare_merged_text_for_export, prepare_text_for_export, export_merged_csv, export_version_csv

    # Raised explicitly instead of using `assert`: assert statements are removed when
    # Python runs with -O, which would silently disable the copyright guard.
    # AssertionError is kept so that existing callers see the same exception type.
    if lang not in ("en", "he"):
        raise AssertionError("Unsupported language: {}".format(lang))
    if format not in ("json", "csv", "txt", "plain.txt"):
        raise AssertionError("Unsupported format: {}".format(format))

    index = library.get_index(title)

    if versionTitle == "merged":
        if format == "csv":
            return export_merged_csv(index, lang)

        merged_text = prepare_merged_text_for_export(title, lang=lang)
        if format == "json":
            return make_json(merged_text)
        if format == "txt":
            return make_text(merged_text)
        # Only "plain.txt" is left.
        return make_text(merged_text, strip_html=True)

    version_query = {"title": title, "language": lang, "versionTitle": versionTitle}

    if format == "csv":
        # The CSV exporter works on a loaded Version object.
        version = Version().load(version_query)
        if not version:
            raise AssertionError("Can not find version of {} in {}: {}".format(title, lang, versionTitle))
        if version.is_copyrighted():
            raise AssertionError("Cowardly refusing to export copyrighted text.")
        return export_version_csv(index, [version])

    # The remaining formats work on the raw document from the texts collection.
    version_object = db.texts.find_one(version_query)
    if not version_object:
        raise AssertionError("Can not find version of {} in {}: {}".format(title, lang, versionTitle))
    if text_is_copyright(version_object):
        raise AssertionError("Cowardly refusing to export copyrighted text.")

    export_data = prepare_text_for_export(version_object)
    if format == "json":
        return make_json(export_data)
    if format == "txt":
        return make_text(export_data)
    # Only "plain.txt" is left.
    return make_text(export_data, strip_html=True)