def download_file(d): document_link = DocumentLink.objects.get(pk=d.pk) doc, created = Document.objects.get_or_create(document_link=document_link) extensions = ( 'doc', 'pdf', 'docx', 'xls', ) document_content = '' if d.url: '''Define the working Directory and saving Path''' wk_dir = os.path.dirname(os.path.realpath('__file__')) save_path = wk_dir + "/docstore/" '''Unshort URLs and get file name''' r = requests.head(d.url, allow_redirects=True) if d.url != r.url: long_url = r.url else: long_url = d.url doc.long_url = long_url local_filename = long_url.split('/')[-1] doc.document_name = local_filename '''Verify if the the URL is containing a file and authorize download''' file_extension = local_filename.split('.')[-1].lower() save_name = str(d.pk) + '.' + file_extension document_path = save_path + save_name is_downloaded = False if file_extension in extensions: if created or (not created and not doc.is_downloaded): doc.url_is_valid = True downloader = DownloadFile(long_url, document_path) try: is_downloaded = downloader.download() doc.is_downloaded = is_downloaded except Exception as e: # print str(e) pass '''Get Text from file and save document''' if is_downloaded: doc.long_url_hash = hashlib.md5(long_url).hexdigest() doc.file_hash = hash_file(document_path) document_content = fulltext.get( save_path + save_name, '< no content >') doc.document_content = document_content if (not created and doc.is_downloaded): '''prepare the updated file storage with the new name \ <update.timestamp.id.extention''' ts = time.time() document_path_update = save_path + "update." + str(ts) + "." + save_name # NOQA: E501 downloader = DownloadFile(long_url, document_path_update) try: is_downloaded = downloader.download() except Exception as e: # print str(e) pass '''hash the downloaded file and it long url''' if is_downloaded: long_url_hash = hashlib.md5(long_url).hexdigest() file_hash = hash_file(document_path_update) '''if file hash or url hash id different, parse the content ' of the file''' if is_downloaded and long_url_hash != '' and ( doc.long_url_hash != long_url_hash or doc.file_hash != file_hash): doc.document_or_long_url_changed = True doc.long_url_hash = long_url_hash doc.file_hash = file_hash document_content = fulltext.get( document_path_update, '< no content >') doc.document_content = document_content else: '''delete the updated file. This file is empty''' os.remove(document_path_update) try: doc.save() except Exception as e: # print str(e) doc.document_content = document_content.decode("latin-1") doc.save()
def download_file(d): document_link = DocumentLink.objects.get(pk=d.pk) doc, created = Document.objects.get_or_create(document_link=document_link) extensions = ( 'doc', 'pdf', 'docx', 'xls', ) document_content = '' if d.url: '''Define the working Directory and saving Path''' wk_dir = os.path.dirname(os.path.realpath('__file__')) save_path = wk_dir + "/docstore/" '''Unshort URLs and get file name''' r = requests.head(d.url, allow_redirects=True) if d.url != r.url: long_url = r.url else: long_url = d.url doc.long_url = long_url local_filename = long_url.split('/')[-1] doc.document_name = local_filename '''Verify if the the URL is containing a file and authorize download''' file_extension = local_filename.split('.')[-1].lower() save_name = str(d.pk) + '.' + file_extension document_path = save_path + save_name is_downloaded = False if file_extension in extensions: if created or (not created and not doc.is_downloaded): doc.url_is_valid = True downloader = DownloadFile(long_url, document_path) try: is_downloaded = downloader.download() doc.is_downloaded = is_downloaded except Exception as e: # print str(e) pass '''Get Text from file and save document''' if is_downloaded: doc.long_url_hash = hashlib.md5(long_url).hexdigest() doc.file_hash = hash_file(document_path) document_content = fulltext.get( save_path + save_name, '< no content >') doc.document_content = document_content if (not created and doc.is_downloaded): '''prepare the updated file storage with the new name \ <update.timestamp.id.extention''' ts = time.time() document_path_update = save_path + \ "update." + str(ts) + "." + save_name downloader = DownloadFile(long_url, document_path_update) try: is_downloaded = downloader.download() except Exception as e: # print str(e) pass '''hash the downloaded file and it long url''' if is_downloaded: long_url_hash = hashlib.md5(long_url).hexdigest() file_hash = hash_file(document_path_update) '''if file hash or url hash id different, parse the content ' of the file''' if is_downloaded and long_url_hash != '' and ( doc.long_url_hash != long_url_hash or doc.file_hash != file_hash): doc.document_or_long_url_changed = True doc.long_url_hash = long_url_hash doc.file_hash = file_hash document_content = fulltext.get( document_path_update, '< no content >') doc.document_content = document_content else: '''delete the updated file. This file is empty''' os.remove(document_path_update) try: doc.save() except Exception as e: # print str(e) doc.document_content = document_content.decode("latin-1") doc.save()