def _upload_file(self, file_path):
    """
    uploads a file to the ckan filestore and returns file metadata related
    to its existence in ckan

    param file_path: name of the file with its current location (path)
    """
    source = 'delineate.delineatewatershed._upload_file():'
    # this code has been implemented based on the code for the upload_handle() method
    # in storage.py
    bucket_id = base.config.get('ckan.storage.bucket', 'default')
    ts = datetime.now().isoformat().split(".")[0]  # '2010-07-08T19:56:47'
    file_name = os.path.basename(file_path).replace(' ', '-')  # ueb request.txt -> ueb-request.txt
    file_key = os.path.join(ts, file_name)
    label = file_key
    params = {}
    params['filename_original'] = os.path.basename(file_path)
    params['key'] = file_key
    try:
        with open(file_path, 'r') as file_obj:
            ofs = storage.get_ofs()
            resource_metadata = ofs.put_stream(bucket_id, label, file_obj, params)
            log.info(source + 'File upload was successful for file: %s' % file_path)
    except Exception as e:
        log.error(source + 'Failed to upload file: %s \nException %s' % (file_path, e))
        tk.abort(400, _('Failed to upload file: %s') % file_path)

    return resource_metadata
def get_url_for_file(label):
    """
    Returns the URL for a file given its label.
    """
    bucket = config.get('ckan.storage.bucket', 'default')
    ofs = storage.get_ofs()
    return ofs.get_url(bucket, label).replace("file://", "")
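A minimal usage sketch for the pair above. In the source `_upload_file` is a controller method, so a hypothetical method on the same class is assumed here; the file path is a placeholder:

    # Hypothetical call site for the two helpers above; the path is illustrative.
    def _example_usage(self):
        metadata = self._upload_file('/tmp/ueb request.txt')
        # metadata['key'] is the '<timestamp>/<file-name>' label in the filestore
        local_path = get_url_for_file(metadata['key'])  # 'file://' prefix stripped
        log.info('Uploaded file is stored at %s' % local_path)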
def setUp(self):
    # Ensure the attribute exists even when package_show raises.
    self._test_package = None
    try:
        self._test_package = toolkit.get_action("package_show")(
            {"user": self.admin_user().name},
            {"id": self._test_package_name})
    except:
        pass
    if not self._test_package:
        # Add a package
        self._test_package = self.add_package(self._test_package_name)
        # "Upload" shapefile to the package
        ofs = storage.get_ofs()
        label = "%s/test_shapefile_wgs84.zip" % datetime.now().isoformat()
        anything = ofs.put_stream(
            config.get('ckan.storage.bucket', 'default'),  # bucket
            label,                                         # label
            open(test_shapefile_path, "r"),                # file stream
            {"key": label}                                 # params
        )
        # Add a resource
        self._test_package = self.add_resource(
            self._test_package["id"],
            {"package_id": self._test_package["id"],
             "url": "http://localhost:5000/storage/f/%s" % label})
    self._test_resource = self._test_package.get("resources", [None])[0]
def get_url_for_file(label):
    # storage_controller = StorageController()
    resourcename_fullpath = None
    try:
        ofs = storage.get_ofs()
        BUCKET = config.get('ckan.storage.bucket', 'default')
        resourcename_fullpath = ofs.get_url(BUCKET, label)
    except:
        pass
    return resourcename_fullpath
def retrieve_file_object_from_file_store(file_filestore_path):
    """
    returns a file object (in read mode) for the given file in the ckan
    file store, which the caller can then use to read the file's contents
    (file_obj.read())

    param file_filestore_path: the file's creation datetime followed by
        '/' and the file name
    """
    bucket_id = base.config.get('ckan.storage.bucket', 'default')
    ofs = storage.get_ofs()
    file_obj = ofs.get_stream(bucket_id, file_filestore_path)
    return file_obj
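A short usage sketch; the label below is a placeholder in the '<creation datetime>/<file name>' form the docstring describes:

    # Placeholder label; real labels come from the upload metadata ('key').
    file_obj = retrieve_file_object_from_file_store('2010-07-08T19:56:47/ueb-request.txt')
    contents = file_obj.read()
    file_obj.close()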
def _save_ddi_variables_to_csv(self, name, pkg, harvest_object):
    # JuhoL: Handle codeBook.dataDscr parts, extract data (e.g. questionnaire)
    # variables etc.
    # Saves <var>...</var> elements to a csv file accessible at:
    # <ckan_url>/storage/f/2013-11-05T18%3A10%3A19.686858/1049_var.csv
    # And separately saves <catgry> elements inside <var> to a csv as a
    # resource for the package.
    # Assumes that dataDscr has not changed. Valid?
    data_dscr = "ddi_xml.codeBook.dataDscr"
    try:
        ofs = storage.get_ofs()
    except IOError, ioe:
        log.debug('Unable to save xml variables: {io}'.format(io=ioe))
        self.errors.append('Unable to save xml variables: {io}'.format(io=ioe))
        return u''
def add_shapefile_resource(self, package_name, filepath=test_shapefile_path):
    # Add a package
    p = self.add_package(package_name)
    # "Upload" shapefile to the package
    ofs = storage.get_ofs()
    label = "%s/%s" % (datetime.now().isoformat(), shapefile_name)
    anything = ofs.put_stream(
        config.get('ckan.storage.bucket', 'default'),  # bucket
        label,                                         # label
        open(filepath, "r"),                           # file stream
        {"key": label}                                 # params
    )
    # Add a resource
    package = self.add_resource(
        p["id"],
        {"package_id": p["id"],
         "url": "http://localhost:5000/storage/f/%s" % label})
    return package.get("resources", [None])[0]
def handle_upload(request, field, user=None):
    from ckan.controllers import storage
    if not isinstance(field.data, cgi.FieldStorage):
        return None
    filename, ext = splitext(field.data.filename)
    filename = strings.slugify(filename)
    filename = ''.join([filename, ext])
    filename = '{ts:%Y-%m-%dT%H-%M-%S}/{name}'.format(name=filename, ts=datetime.now())
    ofs = storage.get_ofs()
    ofs.put_stream(STORAGE_BUCKET, filename, field.data.file, {
        'filename-original': field.data.filename,
        'uploaded-by': user.name if user else '',
    })
    root = conf['home_url']
    if root.startswith('//'):
        root = root.replace('//', 'https://' if conf['https'] else 'http://', 1)
    path = urls.get_url(None, 'storage/f', filename)
    return ''.join([root, path])
def _oai_dc2ckan(data, namespaces, group, harvest_object):
    model.repo.new_revision()
    identifier = data['identifier']
    metadata_oai_dc = data['metadata']['oai_dc']
    titles = _handle_title(metadata_oai_dc.get('titleNode', []), namespaces)
    # Store title in pkg.title and keep all in extras as well. That way
    # the UI will work some way in any case.
    title = titles.get('title_0', identifier)
    #title = metadata['title'][0] if len(metadata['title']) else identifier
    name = data['package_name']
    esc_identifier = identifier.replace('/', '-')
    pkg = Package.get(esc_identifier)
    if not pkg:
        pkg = Package(name=name, title=title, id=esc_identifier)
        pkg.save()
        setup_default_user_roles(pkg)
    else:
        log.debug('Updating: %s' % name)
        # Old resources are replaced by new ones if they are relevant
        # anymore, so "delete" all existing resources now.
        for r in pkg.resources:
            r.state = 'deleted'
    extras = titles
    idx = 0
    for s in ('subject', 'type'):
        for tag in metadata_oai_dc.get(s, []):
            # Turn each subject or type field into its own tag.
            tagi = tag.strip()
            if tagi.startswith('http://www.yso.fi'):
                tags = label_list_yso(tagi)
                extras['tag_source_%i' % idx] = tagi
                idx += 1
            elif tagi.startswith('http://') or tagi.startswith('https://'):
                extras['tag_source_%i' % idx] = tagi
                idx += 1
                tags = []  # URL tags break links in UI.
            else:
                tags = [tagi]
            for tagi in tags:
                tagi = tagi[:100]  # 100 char limit in DB.
                #tagi = munge_tag(tagi[:100])  # 100 char limit in DB.
                tag_obj = model.Tag.by_name(tagi)
                if not tag_obj:
                    tag_obj = model.Tag(name=tagi)
                    tag_obj.save()
                pkgtag = model.Session.query(model.PackageTag).filter(
                    model.PackageTag.package_id == pkg.id).filter(
                    model.PackageTag.tag_id == tag_obj.id).limit(1).first()
                if pkgtag is None:
                    pkgtag = model.PackageTag(tag=tag_obj, package=pkg)
                    pkgtag.save()  # Avoids duplicates if tags have duplicates.
    lastidx = 0
    for auth in metadata_oai_dc.get('creator', []):
        extras['organization_%d' % lastidx] = ''
        extras['author_%d' % lastidx] = auth
        lastidx += 1
    extras.update(_handle_contributor(metadata_oai_dc.get('contributorNode', []), namespaces))
    extras.update(_handle_publisher(metadata_oai_dc.get('publisherNode', []), namespaces))
    # This value belongs elsewhere.
    if 'package.maintainer_email' in extras:
        pkg.maintainer_email = extras['package.maintainer_email']
        del extras['package.maintainer_email']
    extras.update(_handle_rights(metadata_oai_dc.get('rightsNode', []), namespaces))
    if 'package.license' in extras:
        pkg.license = extras['package.license']
        del extras['package.license']
    # Causes failure in commit for some reason.
    #for f in _handle_format(metadata.get('formatNode', []), namespaces):
    #    pprint.pprint(f)
    #    pkg.add_resource(**f)
    # There may be multiple identifiers (URL, ISBN, ...) in the metadata.
    id_idx = 0
    for ident in metadata_oai_dc.get('identifier', []):
        extras['identifier_%i' % id_idx] = ident
        id_idx += 1
    # Check that we have a language.
    lang = metadata_oai_dc.get('language', [])
    if lang and len(lang) and len(lang[0]) > 1:
        pkg.language = lang[0]
    if 'date' in extras:
        pkg.version = extras['date']
        del extras['date']
    pkg.extras = extras
    pkg.url = data['package_url']
    # Metadata may have different identifiers; pick a link if one exists.
    for ids in metadata_oai_dc['identifier']:
        if ids.startswith('http://') or ids.startswith('https://'):
            pkg.add_resource(ids, name=pkg.title, format='html')
    # All belong to the main group even if they do not belong to any set.
    if group:
        group.add_package_by_name(pkg.name)
    # The rest.
    # description below goes to pkg.notes. I think it should not be added here.
    for mdp, metadata in data['metadata'].items():
        for key, value in metadata.items():
            if value is None or len(value) == 0 or key in (
                    'titleNode', 'subject', 'type', 'rightsNode',
                    'publisherNode', 'creator', 'contributorNode',
                    'description', 'identifier', 'language', 'formatNode'):
                continue
            extras[key] = ' '.join(value)
    #description = metadata['description'][0] if len(metadata['description']) else ''
    notes = ' '.join(metadata.get('description', []))
    pkg.notes = notes.replace('\n', ' ').replace('  ', ' ')
    for mdp, resource in data['package_resource'].items():
        ofs = get_ofs()
        ofs.put_stream(BUCKET, data['package_xml_save'][mdp]['label'],
                       data['package_xml_save'][mdp]['xml'], {})
        pkg.add_resource(**resource)
    if harvest_object:
        harvest_object.package_id = pkg.id
        harvest_object.content = None
        harvest_object.current = True
        harvest_object.save()
    model.repo.commit()
    return pkg.id
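A hedged sketch of the `data` dict this harvester variant expects, inferred purely from the key accesses above; all sample values are made up:

    # Illustrative input for _oai_dc2ckan above; shape inferred from the
    # function's key accesses, values are placeholders.
    data = {
        'identifier': 'oai:example.org:1234',
        'package_name': 'example-dataset',
        'package_url': 'http://example.org/record/1234',
        'metadata': {
            'oai_dc': {
                'titleNode': [],  # parsed title elements for _handle_title()
                'creator': ['Doe, Jane'],
                'identifier': ['http://example.org/record/1234'],
                'language': ['en'],
            },
        },
        'package_resource': {'oai_dc': {'url': 'http://example.org/record/1234.xml'}},
        'package_xml_save': {'oai_dc': {'label': '2013-11-05T18:10:19/1234.xml',
                                        'xml': '<oai_dc:dc></oai_dc:dc>'}},
    }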
def _oai_dc2ckan(data, namespaces, group, harvest_object):
    model.repo.new_revision()
    identifier = data['identifier']
    metadata = data['metadata']
    # Store title in pkg.title and keep all in extras as well. That way
    # the UI will work some way in any case.
    title = metadata.get('title', identifier)[0]
    #title = metadata['title'][0] if len(metadata['title']) else identifier
    name = data['package_name']
    pkg = Package.get(name)
    if not pkg:
        pkg = Package(name=name, title=title, id=identifier)
        pkg.save()
        setup_default_user_roles(pkg)
    else:
        log.debug('Updating: %s' % name)
        # Old resources are replaced by new ones if they are relevant
        # anymore, so "delete" all existing resources now.
        for r in pkg.resources:
            r.state = 'deleted'
    extras = {}
    idx = 0
    for s in ('subject', 'type',):
        for tag in metadata.get(s, []):
            # Turn each subject or type field into its own tag.
            tagi = tag.strip()
            if tagi.startswith('http://') or tagi.startswith('https://'):
                extras['tag_source_%i' % idx] = tagi
                idx += 1
                tags = []  # URL tags break links in UI.
            else:
                tags = [tagi]
            for tagi in tags:
                tagi = tagi[:100]  # 100 char limit in DB.
                tag_obj = model.Tag.by_name(tagi)
                if not tag_obj:
                    tag_obj = model.Tag(name=tagi)
                    tag_obj.save()
                pkgtag = model.Session.query(model.PackageTag).filter(
                    model.PackageTag.package_id == pkg.id).filter(
                    model.PackageTag.tag_id == tag_obj.id).limit(1).first()
                if pkgtag is None:
                    pkgtag = model.PackageTag(tag=tag_obj, package=pkg)
                    pkgtag.save()  # Avoids duplicates if tags have duplicates.
    extras.update(
        _handle_contributor(metadata.get('contributorNode', []), namespaces))
    extras.update(
        _handle_publisher(metadata.get('publisherNode', []), namespaces))
    # This value belongs elsewhere.
    if 'package.maintainer_email' in extras:
        pkg.maintainer_email = extras['package.maintainer_email']
        del extras['package.maintainer_email']
    extras.update(_handle_rights(metadata.get('rightsNode', []), namespaces))
    if 'package.license' in extras:
        pkg.license = extras['package.license']
        del extras['package.license']
    # Check that we have a language.
    lang = metadata.get('language', [])
    if lang is not None and len(lang) and len(lang[0]) > 1:
        pkg.language = lang[0]
    # The rest.
    # description below goes to pkg.notes. I think it should not be added here.
    for key, value in metadata.items():
        if value is None or len(value) == 0 or key in (
                'title', 'description', 'publisherNode', 'contributorNode',
                'formatNode', 'identifier', 'source', 'rightsNode'):
            continue
        extras[key] = value[0]
    #description = metadata['description'][0] if len(metadata['description']) else ''
    notes = ' '.join(metadata.get('description', []))
    pkg.notes = notes.replace('\n', ' ').replace('  ', ' ')
    if 'date' in extras:
        pkg.version = extras['date']
        extras['modified'] = extras['date']
        del extras['date']
    pkg.extras = extras
    pkg.url = data['package_url']
    if 'package_resource' in data:
        try:
            ofs = get_ofs()
            ofs.put_stream(BUCKET, data['package_xml_save']['label'],
                           data['package_xml_save']['xml'], {})
            pkg.add_resource(**(data['package_resource']))
        except KeyError:
            pass
    if harvest_object is not None:
        harvest_object.package_id = pkg.id
        harvest_object.content = None
        harvest_object.current = True
        harvest_object.save()
    # Metadata may have different identifiers; pick a link if one exists.
    # See: https://github.com/okfn/ckan/blob/master/ckan/public/base/images/sprite-resource-icons.png
    # The "data" format is used by CKAN to identify unknown resources.
    # You can use it if you want (the default format is "html").
    # For example:
    # - http://my.data.com/my-generated-resource?data
    # - http://my.data.com/my-resource.data
    available_formats = ['data', 'rdf', 'pdf', 'api', 'zip', 'xls', 'csv',
                         'txt', 'xml', 'json', 'html']
    default_format = 'html'
    for ids in metadata['identifier']:
        if ids.startswith('http://') or ids.startswith('https://'):
            # The end of the URL must be the format; otherwise "html" is
            # used by default.
            infer_format = default_format
            for ext in available_formats:
                if ids.endswith(ext):
                    infer_format = ext
            pkg.add_resource(ids, name=pkg.title, format=infer_format)
    # All belong to the main group even if they do not belong to any set.
    if group is not None:
        group.add_package_by_name(pkg.name)
    model.repo.commit()
    return pkg.id
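The format-inference loop above can be exercised in isolation. A minimal sketch; the helper name `infer_format_from_url` is illustrative and not part of the harvester:

    def infer_format_from_url(url, available_formats, default_format='html'):
        # Return the trailing format token if the URL ends with a known one.
        inferred = default_format
        for ext in available_formats:
            if url.endswith(ext):
                inferred = ext
        return inferred

    # e.g. 'http://my.data.com/my-resource.csv'  -> 'csv'
    #      'http://my.data.com/my-page'          -> 'html'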
def read_data(self, id, resource_id):
    res = Resource.get(resource_id)
    pkg = Package.get(id)
    c.pkg_dict = pkg.as_dict()
    c.package = pkg
    c.resource = get_action('resource_show')({'model': model},
                                             {'id': resource_id})
    label = res.url.split(config.get('ckan.site_url') + '/storage/f/')[-1]
    label = urllib2.unquote(label)
    ofs = get_ofs()
    try:
        furl = ofs.get_url(BUCKET, label).split('file://')[-1]
    except FileNotFoundException:
        h.flash_error(_('Cannot do data mining on remote resource!'))
        url = h.url_for(controller='package', action='resource_read',
                        id=id, resource_id=resource_id)
        return redirect(url)
    wordstats = {}
    ret = {}
    if res.format in ('TXT', 'txt'):
        wdsf, wdspath = tempfile.mkstemp()
        os.write(wdsf, "%s\nmetadata description title information" % furl)
        with os.fdopen(wdsf, 'r') as wordfile:
            preproc = orngText.Preprocess()
            table = orngText.loadFromListWithCategories(wdspath)
            data = orngText.bagOfWords(table, preprocessor=preproc)
            words = orngText.extractWordNGram(data, threshold=10.0, measure='MI')
        for i in range(len(words)):
            d = words[i]
            wordstats = d.get_metas(str)
            for k, v in wordstats.items():
                if v.value > 10.0:
                    ret[unicode(k, 'utf8')] = v.value
        from operator import itemgetter
        c.data_tags = sorted(ret.iteritems(), key=itemgetter(1), reverse=True)[:30]
        os.remove(wdspath)
        for i in range(len(data)):
            d = words[i]
            wordstats = d.get_metas(str)
        words = []
        for k, v in wordstats.items():
            words.append(k)
        model.repo.new_revision()
        if 'autoextracted_description' not in pkg.extras:
            pkg.extras['autoextracted_description'] = ' '.join(words)
        pkg.save()
        return render('datamining/read.html')
    elif res.format in ('odt', 'doc', 'xls', 'ods', 'odp', 'ppt', 'html'):
        textfd, textpath = convert_to_text(res, furl)
        if not textpath:
            h.flash_error(_('This file could not be mined for any data!'))
            os.close(textfd)
            return render('datamining/read.html')
        else:
            wdsf, wdspath = tempfile.mkstemp()
            os.write(wdsf, "%s\nmetadata description title information" % textpath)
            preproc = orngText.Preprocess()
            table = orngText.loadFromListWithCategories(wdspath)
            data = orngText.bagOfWords(table, preprocessor=preproc)
            words = orngText.extractWordNGram(data, threshold=10.0, measure='MI')
            for i in range(len(words)):
                d = words[i]
                wordstats = d.get_metas(str)
                for k, v in wordstats.items():
                    if v.value > 10.0:
                        ret[unicode(k, 'utf8')] = v.value
            from operator import itemgetter
            c.data_tags = sorted(ret.iteritems(), key=itemgetter(1), reverse=True)[:30]
            os.close(textfd)
            os.close(wdsf)
            os.remove(wdspath)
            os.remove(textpath)
            for i in range(len(data)):
                d = words[i]
                wordstats = d.get_metas(str)
            words = []
            for k, v in wordstats.items():
                log.debug(k)
                words.append(substitute_ascii_equivalents(k))
            model.repo.new_revision()
            if 'autoextracted_description' not in pkg.extras:
                pkg.extras['autoextracted_description'] = ' '.join(words)
            pkg.save()
            return render('datamining/read.html')
    else:
        h.flash_error(_('This metadata document is not in proper format for data mining!'))
        url = h.url_for(controller='package', action='resource_read',
                        id=id, resource_id=resource_id)
        return redirect(url)
def import_stage(self, harvest_object):
    """Import the metadata received in the fetch stage to a dataset and
    create groups if ones are defined. Fill in metadata from study and
    document description.
    """
    try:
        xml_dict = {}
        xml_dict["source"] = harvest_object.content
        udict = json.loads(harvest_object.content)
        if "url" in udict:
            f = urllib2.urlopen(udict["url"]).read()
            ddi_xml = BeautifulSoup(f, "xml")
        else:
            self._save_object_error("No url in content!", harvest_object)
            return False
    except urllib2.URLError:
        self._save_object_error("Could not fetch from url %s!" % udict["url"],
                                harvest_object)
        return False
    except etree.XMLSyntaxError:
        self._save_object_error("Unable to parse XML!", harvest_object)
        return False
    model.repo.new_revision()
    study_descr = ddi_xml.codeBook.stdyDscr
    document_info = ddi_xml.codeBook.docDscr.citation
    title = study_descr.citation.titlStmt.titl.string
    if not title:
        title = document_info.titlStmt.titl.string
    name = study_descr.citation.titlStmt.IDNo.string
    update = True
    pkg = Package.get(name)
    if not pkg:
        pkg = Package(name=name)
        update = False
    producer = study_descr.citation.prodStmt.producer
    if not producer:
        producer = study_descr.citation.rspStmt.AuthEnty
    if not producer:
        producer = study_descr.citation.rspStmt.othId
    pkg.author = producer.string
    pkg.maintainer = producer.string
    if study_descr.citation.distStmt.contact:
        pkg.maintainer = study_descr.citation.distStmt.contact.string
    if document_info.titlStmt.IDNo:
        pkg.id = document_info.titlStmt.IDNo.string
    keywords = study_descr.stdyInfo.subject(re.compile("keyword|topcClas"))
    keywords = list(set(keywords))
    for kw in keywords:
        if kw:
            vocab = None
            kw_str = ""
            if kw.string:
                kw_str = kw.string
            if "vocab" in kw.attrs:
                vocab = kw.attrs.get("vocab", None)
            if vocab and kw.string:
                kw_str = vocab + " " + kw.string
            pkg.add_tag_by_name(munge_tag(kw_str))
    if study_descr.stdyInfo.abstract:
        description_array = study_descr.stdyInfo.abstract("p")
    else:
        description_array = study_descr.citation.serStmt.serInfo("p")
    pkg.notes = "<br />".join([description.string
                               for description in description_array])
    pkg.title = title[:100]
    pkg.url = udict["url"]
    if not update:
        ofs = get_ofs()
        nowstr = datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S.%f")
        idno = study_descr.citation.titlStmt.IDNo
        agencyxml = (idno["agency"] if "agency" in idno.attrs else "") + idno.string
        label = "%s/%s.xml" % (nowstr, agencyxml)
        ofs.put_stream(BUCKET, label, f, {})
        fileurl = config.get("ckan.site_url") + h.url_for("storage_file", label=label)
        pkg.add_resource(url=fileurl, description="Original metadata record",
                         format="xml", size=len(f))
        pkg.add_resource(url=document_info.holdings["URI"]
                             if "URI" in document_info.holdings else "",
                         description=title)
    metas = {}
    descendants = ([desc for desc in document_info.descendants] +
                   [sdesc for sdesc in study_descr.descendants])
    for docextra in descendants:
        if isinstance(docextra, Tag):
            if docextra:
                if docextra.name == "p":
                    docextra.name = docextra.parent.name
                if docextra.name not in metas and docextra.string:
                    metas[docextra.name] = (docextra.string if docextra.string
                                            else self._collect_attribs(docextra))
                else:
                    if docextra.string:
                        metas[docextra.name] += (" " + docextra.string
                                                 if docextra.string
                                                 else self._collect_attribs(docextra))
    if ddi_xml.codeBook.dataDscr and not update:
        vars = ddi_xml.codeBook.dataDscr("var")
        heads = self._get_headers()
        c_heads = ["ID", "catValu", "labl", "catStat"]
        f_var = StringIO.StringIO()
        c_var = StringIO.StringIO()
        varwriter = csv.DictWriter(f_var, heads)
        codewriter = csv.DictWriter(c_var, c_heads)
        heading_row = {}
        for head in heads:
            heading_row[head] = head
        c_heading_row = {}
        for head in c_heads:
            c_heading_row[head] = head
        varwriter.writerow(heading_row)
        codewriter.writerow(c_heading_row)
        for var in vars:
            try:
                varwriter.writerow(self._construct_csv(var, heads))
                codewriter.writerows(self._create_code_rows(var))
            except ValueError, e:
                raise IOError("Failed to import DDI to CSV! %s" % e)
        f_var.flush()
        label = "%s/%s_var.csv" % (nowstr, name)
        ofs.put_stream(BUCKET, label, f_var, {})
        fileurl = config.get("ckan.site_url") + h.url_for("storage_file", label=label)
        pkg.add_resource(url=fileurl, description="Variable metadata",
                         format="csv", size=f_var.len)
        label = "%s/%s_code.csv" % (nowstr, name)
        ofs.put_stream(BUCKET, label, c_var, {})
        fileurl = config.get("ckan.site_url") + h.url_for("storage_file", label=label)
        pkg.add_resource(url=fileurl, description="Variable code values",
                         format="csv", size=c_var.len)
        f_var.seek(0)
        reader = csv.DictReader(f_var)
        for var in reader:
            metas[var["ID"]] = var["labl"] if "labl" in var else var["qstnLit"]
def extract_text(resource_url, format):
    """
    Attempts to extract plain text contents from the CKAN resource with the
    given URL. Only local resources are supported at the moment. Non-plain
    text files are first converted to a plain text representation if possible.

    :param resource_url: URL to the resource
    :type resource_url: str
    :param format: the file format of the resource (practically the file
        name extension)
    :type format: str
    :rtype: unicode
    :raises IOError: if the resource is remote or cannot be read
    """
    ofs = storage.get_ofs()
    label = resource_url.split(STORAGE_BASE_URL)[-1]
    label = urllib2.unquote(label)
    format = format.lower()
    log.debug("Resource label: %s" % label)
    original_path = None
    converted_path = None
    try:
        # Get file location
        original_path = ofs.get_url(BUCKET, label).split('file://')[-1]
    except storage_exceptions.FileNotFoundException:
        raise IOError("Unable to extract text from {u} -- is the resource remote?"
                      .format(u=resource_url))
    mime_type = magic.Magic(mime=True).from_file(original_path)
    if mime_type == 'text/plain':
        tmp_file = False
        converted_path = original_path
    else:
        log.debug("Attempting to extract plain text from {p}".format(p=original_path))
        converted_fd, converted_path = convert_file_to_text(original_path, format)
        if converted_path is not None:
            tmp_file = True
        else:
            log.info("Extracting plain text from {p} failed; unsupported format?"
                     .format(p=original_path))
            tmp_file = False
    if converted_path is not None:
        log.debug("Reading from %s", converted_path)
        try:
            with codecs.open(converted_path, mode='r', encoding='utf-8') as text_file:
                text = text_file.read()
        except UnicodeDecodeError:
            log.debug("Failed to open file using UTF-8 encoding. "
                      "Trying to guess encoding.")
            try:
                encoding = magic.Magic(mime_encoding=True).from_file(converted_path)
                with codecs.open(converted_path, mode='r', encoding=encoding) as text_file:
                    text = text_file.read()
            except:
                text = u""
                #raise ValidationError({'resources': [[_("Failed to detect file encoding")]]})
        log.debug("Resource plain text contents:")
        log.debug(text)
    else:
        text = u""
    if tmp_file:
        os.remove(converted_path)
    return text
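A hedged usage sketch; the resource URL is a placeholder, and STORAGE_BASE_URL and BUCKET are module-level constants the function assumes:

    # Illustrative call; the URL must point at a locally stored resource.
    try:
        text = extract_text('http://localhost:5000/storage/f/2013-11-05T18:10:19/report.pdf',
                            'pdf')
    except IOError:
        text = u''  # remote or unreadable resource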