def ext_archive_url(doc_id):
    """
    Try to extract an Internet Archive timestamp from the document's URL.

    Searches the syllabus URL for a ``web.archive.org/web/<digits>``
    segment, parses the digits with the module-level ``date_format``, and
    stores the resulting date when it lies before the current time.

    Args:
        doc_id (int): The document id.

    Returns:
        The newly created ``Document_Date_Archive_Url`` row, or ``None``
        when the URL carries no archive timestamp, the timestamp does not
        parse with ``date_format``, or the parsed date is not in the past.
    """
    doc = Document.get(Document.id == doc_id)

    # Raw string: `\.`, `\/` and `\d` are regex escapes, not string escapes
    # (as a plain literal they trigger invalid-escape warnings).
    match = re.search(
        r'web\.archive\.org\/web\/(?P<timestamp>\d+)',
        doc.syllabus.url,
    )
    if not match:
        return None

    # `\d+` accepts a digit run of any length, so the capture is not
    # guaranteed to fit `date_format`; treat an unparseable timestamp the
    # same as "no timestamp found" instead of raising.
    try:
        date = datetime.strptime(match.group('timestamp'), date_format)
    except ValueError:
        return None

    # Only timestamps strictly before the current time are stored.
    if date < datetime.now():
        return Document_Date_Archive_Url.create(document=doc, date=date)

    return None
def match_doc(id):
    """
    Link a document to an institution that shares its registered domain.

    Looks up the first institution (ordered by id) whose
    ``Institution_Web_Address`` metadata value contains the document's
    registered domain, and records the pairing.

    Args:
        id (int): A document id. (The name shadows the builtin but is
            kept so existing callers keep working.)

    Returns:
        None. When a match is found, a ``Document_Institution`` row is
        created as a side effect.
    """
    document = Document.get(Document.id == id)
    domain = document.syllabus.registered_domain

    # Nothing to match against when the syllabus has no registered domain.
    if not domain:
        return

    # Surround the domain with `%` so the comparison is a substring match.
    pattern = '%' + domain + '%'
    web_address = Institution.metadata['Institution_Web_Address']

    # `**` is used as the ORM's pattern-match operator (case-insensitive
    # LIKE in peewee); ordering by id makes the chosen match deterministic.
    candidates = (
        Institution
        .select()
        .where(web_address ** pattern)
        .order_by(Institution.id)
    )
    institution = candidates.first()

    if institution:
        Document_Institution.create(
            document=document.id,
            institution=institution,
        )
def ext_format(doc_id):
    """
    Record the libmagic file type of a document.

    Args:
        doc_id (int): The document id.

    Returns:
        The newly created ``Document_Format`` row.
    """
    document = Document.get(Document.id == doc_id)
    file_type = document.syllabus.libmagic_file_type
    return Document_Format.create(format=file_type, document=document)
def ext_text(doc_id):
    """
    Store a document's plain-text content.

    Args:
        doc_id (int): The document id.

    Returns:
        The newly created ``Document_Text`` row, or ``None`` when the
        syllabus text is empty or missing.
    """
    document = Document.get(Document.id == doc_id)
    text = document.syllabus.text

    # Skip documents that have no text to store.
    if not text:
        return None

    return Document_Text.create(text=text, document=document)
def ext_file_metadata(doc_id):
    """
    Try to record a document's created date from its file metadata.

    The date is read from ``syllabus.created_date`` (described upstream as
    coming from PDF and DOCX file metadata).

    Args:
        doc_id (int): The document id.

    Returns:
        The newly created ``Document_Date_File_Metadata`` row, or ``None``
        when the syllabus reports no created date.
    """
    document = Document.get(Document.id == doc_id)
    created = document.syllabus.created_date

    # No created date available: nothing to store.
    if not created:
        return None

    return Document_Date_File_Metadata.create(
        document=document,
        date=created,
    )