Python Document.get Exemples, osp.corpus.models.document.Document.get Python Exemples

Exemple #1

0

Afficher le fichier

Fichier : ext_archive_url.py Projet : overview/osp

def ext_archive_url(doc_id):

    """
    Try to extract an Internet Archive timestamp from the URL.

    Searches the document's syllabus URL for a
    `web.archive.org/web/<digits>` segment, parses the digits with
    `date_format` (defined outside this function), and stores the result
    only when the parsed date is earlier than `datetime.now()`.

    Args:
        doc_id (int): The document id.

    Returns:
        The row created by `Document_Date_Archive_Url.create`, or None when
        the URL contains no archive timestamp or the parsed date is not in
        the past.
    """

    doc = Document.get(Document.id==doc_id)

    # Raw string: `\.`, `\/` and `\d` are regex escapes. In a plain string
    # literal they are invalid escape sequences, which Python warns about.
    match = re.search(
        r'web\.archive\.org\/web\/(?P<timestamp>\d+)',
        doc.syllabus.url
    )

    if match:

        # NOTE(review): strptime raises ValueError when the digits do not
        # fit `date_format`; that error propagates to the caller -- confirm
        # this is intended.
        date = datetime.strptime(
            match.group('timestamp'),
            date_format
        )

        # Skip timestamps that are not strictly in the past.
        if date < datetime.now():

            return Document_Date_Archive_Url.create(
                document=doc,
                date=date
            )

Exemple #2

0

Afficher le fichier

Fichier : match_doc.py Projet : overview/osp

def match_doc(id):

    """
    Link a document to an institution whose web address contains the
    document's registered domain.

    When several institutions match, the one with the lowest id is used.
    Nothing is written when the document has no registered domain or when
    no institution matches.

    Args:
        id (int): A document id.
    """

    doc = Document.get(Document.id==id)

    # Without a registered domain there is nothing to match on.
    if not doc.syllabus.registered_domain:
        return

    # Surround the domain with wildcards so it matches as a substring.
    pattern = '%'+doc.syllabus.registered_domain+'%'

    # `**` is peewee's case-insensitive LIKE operator.
    candidates = Institution.select().where(
        Institution.metadata['Institution_Web_Address'] ** pattern
    )

    institution = candidates.order_by(Institution.id).first()

    if not institution:
        return

    Document_Institution.create(
        document=doc.id,
        institution=institution
    )

Exemple #3

0

Afficher le fichier

Fichier : ext_format.py Projet : overview/osp

def ext_format(doc_id):

    """
    Record the libmagic file type of a document's syllabus.

    Args:
        doc_id (int): The document id.

    Returns:
        The row created by `Document_Format.create`.
    """

    doc = Document.get(Document.id==doc_id)

    # Read the file type first, then persist it alongside the document.
    file_type = doc.syllabus.libmagic_file_type

    return Document_Format.create(format=file_type, document=doc)

Exemple #4

0

Afficher le fichier

Fichier : ext_text.py Projet : overview/osp

def ext_text(doc_id):

    """
    Store the plain-text form of a document's syllabus.

    Args:
        doc_id (int): The document id.

    Returns:
        The row created by `Document_Text.create`, or None when the
        syllabus text is empty or missing.
    """

    doc = Document.get(Document.id==doc_id)

    # Nothing to store when the syllabus has no text.
    if not doc.syllabus.text:
        return None

    return Document_Text.create(
        text=doc.syllabus.text,
        document=doc
    )

Exemple #5

0

Afficher le fichier

Fichier : ext_file_metadata.py Projet : overview/osp

def ext_file_metadata(doc_id):

    """
    Store the created date exposed by the document's syllabus
    (`syllabus.created_date`), if there is one.

    Args:
        doc_id (int): The document id.

    Returns:
        The row created by `Document_Date_File_Metadata.create`, or None
        when the syllabus has no created date.
    """

    doc = Document.get(Document.id==doc_id)
    created = doc.syllabus.created_date

    # No created date available: write nothing.
    if not created:
        return None

    return Document_Date_File_Metadata.create(document=doc, date=created)