Ejemplo n.º 1
0
def ext_archive_url(doc_id):

    """
    Try to extract an Internet Archive timestamp from the URL.

    The syllabus URL is searched for a `web.archive.org/web/<digits>`
    segment. The digits are parsed with the module-level `date_format`, and
    a row is written only when the parsed date lies before the current time.

    Args:
        doc_id (int): The document id.

    Returns:
        The value returned by `Document_Date_Archive_Url.create`, or None
        when the URL has no archive timestamp, the timestamp cannot be
        parsed with `date_format`, or the parsed date is not in the past.
    """

    doc = Document.get(Document.id == doc_id)

    # Raw string: `\.`, `\/` and `\d` are regex escapes, not string escapes,
    # so the pattern must not go through string-escape processing.
    match = re.search(
        r'web\.archive\.org\/web\/(?P<timestamp>\d+)',
        doc.syllabus.url
    )

    if not match:
        return None

    try:
        date = datetime.strptime(match.group('timestamp'), date_format)
    except ValueError:
        # The digit run does not fit `date_format` (for example a truncated
        # timestamp); treat it like any other URL without a usable date.
        return None

    # Ignore timestamps that claim to come from the future.
    if date < datetime.now():
        return Document_Date_Archive_Url.create(
            document=doc,
            date=date
        )

    return None
Ejemplo n.º 2
0
def match_doc(id):

    """
    Link a document to an institution whose web address contains the
    document's registered domain.

    The first institution (ordered by id) whose
    `metadata['Institution_Web_Address']` matches the pattern
    `%<registered_domain>%` is used. Nothing is written when the document
    has no registered domain or when no institution matches.

    Args:
        id (int): A document id.
    """

    document = Document.get(Document.id == id)

    # Without a registered domain there is nothing to match on.
    if not document.syllabus.registered_domain:
        return

    # Wrap the domain in wildcards so it can appear anywhere in the address.
    pattern = '%' + document.syllabus.registered_domain + '%'

    candidates = Institution.select().where(
        Institution.metadata['Institution_Web_Address'] ** (pattern)
    )

    # Order by id so the choice among several matches is deterministic.
    institution = candidates.order_by(Institution.id).first()

    if not institution:
        return

    Document_Institution.create(
        institution=institution,
        document=document.id
    )
Ejemplo n.º 3
0
def ext_format(doc_id):

    """
    Store the libmagic file type of a document's syllabus.

    Args:
        doc_id (int): The document id.

    Returns:
        The value returned by `Document_Format.create`.
    """

    document = Document.get(Document.id == doc_id)

    # The file type is read from the syllabus attached to the document.
    file_type = document.syllabus.libmagic_file_type

    return Document_Format.create(document=document, format=file_type)
Ejemplo n.º 4
0
def ext_text(doc_id):

    """
    Store the plain text of a document's syllabus.

    Args:
        doc_id (int): The document id.

    Returns:
        The value returned by `Document_Text.create`, or None when the
        syllabus text is empty or missing.
    """

    document = Document.get(Document.id == doc_id)

    # Nothing to store when the syllabus has no text.
    if not document.syllabus.text:
        return None

    return Document_Text.create(
        document=document,
        text=document.syllabus.text
    )
Ejemplo n.º 5
0
def ext_file_metadata(doc_id):

    """
    Store the created date exposed by a document's syllabus, if any.

    Args:
        doc_id (int): The document id.

    Returns:
        The value returned by `Document_Date_File_Metadata.create`, or None
        when the syllabus has no created date.
    """

    document = Document.get(Document.id == doc_id)
    created = document.syllabus.created_date

    # Skip documents whose file metadata carries no creation date.
    if not created:
        return None

    return Document_Date_File_Metadata.create(date=created, document=document)