예제 #1
0
파일: tasks.py 프로젝트: IATI/iati.cloud
def download_file(d):
    document_link = DocumentLink.objects.get(pk=d.pk)
    doc, created = Document.objects.get_or_create(document_link=document_link)
    extensions = (
        'doc',
        'pdf',
        'docx',
        'xls',
    )
    document_content = ''

    if d.url:
        '''Define the working Directory and saving Path'''
        wk_dir = os.path.dirname(os.path.realpath('__file__'))
        save_path = wk_dir + "/docstore/"

        '''Unshort URLs and get file name'''
        r = requests.head(d.url, allow_redirects=True)
        if d.url != r.url:
            long_url = r.url
        else:
            long_url = d.url
        doc.long_url = long_url
        local_filename = long_url.split('/')[-1]
        doc.document_name = local_filename

        '''Verify if the the URL is containing a file and authorize download'''
        file_extension = local_filename.split('.')[-1].lower()
        save_name = str(d.pk) + '.' + file_extension
        document_path = save_path + save_name
        is_downloaded = False

        if file_extension in extensions:
            if created or (not created and not doc.is_downloaded):
                doc.url_is_valid = True
                downloader = DownloadFile(long_url, document_path)
                try:
                    is_downloaded = downloader.download()
                    doc.is_downloaded = is_downloaded
                except Exception as e:
                    # print str(e)
                    pass

                '''Get Text from file and save document'''
                if is_downloaded:
                    doc.long_url_hash = hashlib.md5(long_url).hexdigest()
                    doc.file_hash = hash_file(document_path)
                    document_content = fulltext.get(
                        save_path + save_name, '< no content >')
                    doc.document_content = document_content

            if (not created and doc.is_downloaded):
                '''prepare the updated file storage with the new name \
                        <update.timestamp.id.extention'''
                ts = time.time()
                document_path_update = save_path + "update." + str(ts) + "." + save_name  # NOQA: E501
                downloader = DownloadFile(long_url, document_path_update)
                try:
                    is_downloaded = downloader.download()
                except Exception as e:
                    # print str(e)
                    pass
                '''hash the downloaded file and it long url'''
                if is_downloaded:
                    long_url_hash = hashlib.md5(long_url).hexdigest()
                    file_hash = hash_file(document_path_update)
                '''if file hash or url hash id different, parse the content '
                of the file'''
                if is_downloaded and long_url_hash != '' and (
                        doc.long_url_hash != long_url_hash
                        or doc.file_hash != file_hash):
                    doc.document_or_long_url_changed = True
                    doc.long_url_hash = long_url_hash
                    doc.file_hash = file_hash
                    document_content = fulltext.get(
                        document_path_update, '< no content >')
                    doc.document_content = document_content
                else:
                    '''delete the updated file. This file is empty'''
                    os.remove(document_path_update)
    try:
        doc.save()
    except Exception as e:
        # print str(e)
        doc.document_content = document_content.decode("latin-1")
        doc.save()
예제 #2
0
def download_file(d):
    document_link = DocumentLink.objects.get(pk=d.pk)
    doc, created = Document.objects.get_or_create(document_link=document_link)
    extensions = (
        'doc',
        'pdf',
        'docx',
        'xls',
    )
    document_content = ''

    if d.url:
        '''Define the working Directory and saving Path'''
        wk_dir = os.path.dirname(os.path.realpath('__file__'))
        save_path = wk_dir + "/docstore/"

        '''Unshort URLs and get file name'''
        r = requests.head(d.url, allow_redirects=True)
        if d.url != r.url:
            long_url = r.url
        else:
            long_url = d.url
        doc.long_url = long_url
        local_filename = long_url.split('/')[-1]
        doc.document_name = local_filename

        '''Verify if the the URL is containing a file and authorize download'''
        file_extension = local_filename.split('.')[-1].lower()
        save_name = str(d.pk) + '.' + file_extension
        document_path = save_path + save_name
        is_downloaded = False

        if file_extension in extensions:
            if created or (not created and not doc.is_downloaded):
                doc.url_is_valid = True
                downloader = DownloadFile(long_url, document_path)
                try:
                    is_downloaded = downloader.download()
                    doc.is_downloaded = is_downloaded
                except Exception as e:
                    # print str(e)
                    pass

                '''Get Text from file and save document'''
                if is_downloaded:
                    doc.long_url_hash = hashlib.md5(long_url).hexdigest()
                    doc.file_hash = hash_file(document_path)
                    document_content = fulltext.get(
                        save_path + save_name, '< no content >')
                    doc.document_content = document_content

            if (not created and doc.is_downloaded):
                '''prepare the updated file storage with the new name \
                        <update.timestamp.id.extention'''
                ts = time.time()
                document_path_update = save_path + \
                    "update." + str(ts) + "." + save_name
                downloader = DownloadFile(long_url, document_path_update)
                try:
                    is_downloaded = downloader.download()
                except Exception as e:
                    # print str(e)
                    pass
                '''hash the downloaded file and it long url'''
                if is_downloaded:
                    long_url_hash = hashlib.md5(long_url).hexdigest()
                    file_hash = hash_file(document_path_update)
                '''if file hash or url hash id different, parse the content '
                of the file'''
                if is_downloaded and long_url_hash != '' and (
                        doc.long_url_hash != long_url_hash
                        or doc.file_hash != file_hash):
                    doc.document_or_long_url_changed = True
                    doc.long_url_hash = long_url_hash
                    doc.file_hash = file_hash
                    document_content = fulltext.get(
                        document_path_update, '< no content >')
                    doc.document_content = document_content
                else:
                    '''delete the updated file. This file is empty'''
                    os.remove(document_path_update)
    try:
        doc.save()
    except Exception as e:
        # print str(e)
        doc.document_content = document_content.decode("latin-1")
        doc.save()