def syllabus_refinement(in_file, out_file, r, threshold):
    """
    Select the documents around a given threshold in the syllabus /
    not-syllabus classifier predictions and write them as a labeling CSV.

    Args:
        in_file: Readable file containing (path, score) CSV rows.
        out_file: Writable file for the (id, title, text) CSV output.
        r (int): Radius — number of documents taken on each side of the
            threshold document.
        threshold (float): Score around which to center the selection.
    """
    cols = ['path', 'score']
    reader = csv.DictReader(in_file, cols)

    # Gather ordered (path, score) tuples. (Named `row`, not `r`, to avoid
    # visual collision with the radius parameter.)
    scores = [(row['path'], float(row['score'])) for row in reader]
    scores = sorted(scores, key=lambda x: x[1], reverse=True)

    # Index of the document whose score is closest to the threshold.
    center = min(
        range(len(scores)),
        key=lambda i: abs(scores[i][1] - threshold),
    )

    # CSV writer.
    cols = ['id', 'title', 'text']
    writer = csv.DictWriter(out_file, cols)
    writer.writeheader()

    # Clamp the lower bound at 0 — a negative slice start would wrap around
    # and silently pull documents from the bottom of the ranking instead of
    # the neighborhood of the threshold.
    start = max(center - r, 0)

    for path, score in scores[start:center + r]:

        row = (
            Document_Text
            .select(Document_Text.text, Document.path)
            .join(Document)
            .where(Document.path == path)
            .naive()
            .first()
        )

        writer.writerow({'id': row.path, 'title': row.path, 'text': row.text})
def test_text_extraction_fails(mock_osp):
    """
    If no text can be extracted, don't write the row.
    """
    # Register an empty file as a document.
    document = Document.create(path=mock_osp.add_file(content=''))

    ext_text(document.id)

    # No text row should have been written.
    assert Document_Text.select().count() == 0
def es_stream_docs(cls):
    """
    Index document texts.

    Yields:
        dict: The next document.
    """
    for text in query_bar(Document_Text.select()):
        yield {
            '_id': text.document_id,
            'body': text.text,
        }
def es_stream_docs(cls):
    """
    Index document texts.

    Yields:
        dict: The next document.
    """
    # NOTE(review): this appears to be an exact duplicate of an identical
    # es_stream_docs defined earlier in this file — consider removing one.
    for row in query_bar(Document_Text.select()):
        # PEP 8: no spaces around '=' in keyword arguments.
        yield dict(
            _id=row.document_id,
            body=row.text,
        )
def random(out_file, n):
    """
    Write a CSV with plaintext for N random docs.
    """
    # CSV writer with the labeling-tool column layout.
    writer = csv.DictWriter(out_file, ['id', 'title', 'text'])
    writer.writeheader()

    # Sample N documents in random order at the database level.
    query = (
        Document_Text
        .select(Document_Text.text, Document.path)
        .join(Document)
        .order_by(fn.random())
        .limit(n)
    )

    for doc in query_bar(query):
        writer.writerow({'id': doc.path, 'title': doc.path, 'text': doc.text})
def truncated(out_file, frag_len):
    """
    Write a CSV with truncated document texts.
    """
    # CSV writer with the labeling-tool column layout.
    writer = csv.DictWriter(out_file, ['id', 'title', 'text'])
    writer.writeheader()

    query = (
        Document_Text
        .select(Document_Text.text, Document.path)
        .join(Document)
    )

    for doc in query_bar(query):
        # Keep only the leading frag_len characters of the text.
        writer.writerow({
            'id': doc.path,
            'title': doc.path,
            'text': doc.text[:frag_len],
        })