def _conforms_to_schema(xml_path):
    """Return True if `xmllint` exits with status 0 for the file at `xml_path`."""
    # The arguments are passed as a list (no shell, no re-tokenising), so
    # quotes, backslashes or whitespace in a path cannot change the command.
    return subprocess.call(
        ["xmllint", "--noout", "--schema", SCHEMA_PATH, xml_path]
    ) == 0


def validate_against_schema(uploaded_file):
    """
    Check if XML files that are part of a single upload validate
    against the TrendMiner XML schema.

    Each file is checked by running the `xmllint` tool with the schema
    located at `SCHEMA_PATH`; an exit status of 0 is treated as "valid".

    - For a `.zip` upload with a non-empty `folder` attribute, every
      `*.xml` entry of that folder except `om.xml` is checked.
    - For a `.xml` upload, the uploaded file itself is checked.
    - Any other upload is accepted without running `xmllint`.

    Raises:
        ValidationError: with `UploadFormErrors.FILES_SCHEMA_CONFORMITY`
            if a file from a .zip upload does not validate, or with
            `UploadFormErrors.XML_SCHEMA_CONFORMITY` if a single .xml
            upload does not validate.
    """
    file_type = get_file_ext(uploaded_file.name)
    if file_type == ".zip" and uploaded_file.folder:
        for file_name in listdir(get_tmp_path(uploaded_file.folder)):
            if not file_name.endswith(".xml") or file_name == "om.xml":
                continue
            xml_path = get_tmp_path(uploaded_file.folder, file_name)
            if not _conforms_to_schema(xml_path):
                raise ValidationError(UploadFormErrors.FILES_SCHEMA_CONFORMITY)
    elif file_type == ".xml":
        if not _conforms_to_schema(get_tmp_path(uploaded_file.name)):
            raise ValidationError(UploadFormErrors.XML_SCHEMA_CONFORMITY)
def _is_well_formed(xml_path):
    """Return True if `xmlwf` writes nothing to standard output for `xml_path`."""
    # Argument list instead of a shell-style string: special characters in
    # the path cannot alter the command that is executed.
    subproc = subprocess.Popen(["xmlwf", xml_path], stdout=subprocess.PIPE)
    # communicate() drains and closes the pipe and waits for the child, so
    # neither a file descriptor nor a zombie process is left behind.
    error_msg, _ = subproc.communicate()
    return not error_msg


def validate_xml_well_formedness(uploaded_file):
    """
    Check if XML files that are part of a single upload are well-formed.

    This function uses the `xmlwf` tool. The exit status of the tool is
    not inspected; instead, a file counts as well-formed when `xmlwf`
    writes nothing to standard output, and as malformed when it writes
    anything at all.

    - For a `.zip` upload with a non-empty `folder` attribute, every
      `*.xml` entry of that folder except `om.xml` is checked.
    - For a `.xml` upload, the uploaded file itself is checked.
    - Any other upload is accepted without running `xmlwf`.

    Raises:
        ValidationError: with `UploadFormErrors.FILES_WELLFORMEDNESS` if
            a file from a .zip upload is malformed, or with
            `UploadFormErrors.XML_WELLFORMEDNESS` if a single .xml upload
            is malformed.
    """
    file_type = get_file_ext(uploaded_file.name)
    if file_type == ".zip" and uploaded_file.folder:
        for file_name in listdir(get_tmp_path(uploaded_file.folder)):
            if not file_name.endswith(".xml") or file_name == "om.xml":
                continue
            if not _is_well_formed(get_tmp_path(uploaded_file.folder, file_name)):
                raise ValidationError(UploadFormErrors.FILES_WELLFORMEDNESS)
    elif file_type == ".xml":
        if not _is_well_formed(get_tmp_path(uploaded_file.name)):
            raise ValidationError(UploadFormErrors.XML_WELLFORMEDNESS)
def validate_zip_integrity(uploaded_file):
    """
    If uploaded file is a .zip archive, check its integrity and the
    integrity of the files it contains.

    Files whose name does not end in "zip" are accepted unchecked.
    Otherwise the archive is opened with `ZipFile` and its members are
    checked with `ZipFile.testzip()`, which returns the name of the
    first bad member (or None).

    An `IOError` raised while opening or testing the archive is reported
    as a corrupted archive. A `BadZipFile` error is deliberately
    ignored here: according to the original author's notes, a file that
    merely has a .zip extension but is not an archive is already
    rejected by the MIME type validator.

    Raises:
        ValidationError: with `UploadFormErrors.ZIP_INTEGRITY` on
            `IOError`, or with `UploadFormErrors.FILES_INTEGRITY` if
            `testzip()` reports a corrupted member.
    """
    if not uploaded_file.name.endswith("zip"):
        return

    corrupted_file = None
    try:
        # The context manager closes the archive's file handle even when
        # testzip() raises.
        with ZipFile(get_tmp_path(uploaded_file.name)) as archive:
            corrupted_file = archive.testzip()
    except IOError:
        raise ValidationError(UploadFormErrors.ZIP_INTEGRITY)
    except BadZipFile:
        pass

    if corrupted_file:
        raise ValidationError(UploadFormErrors.FILES_INTEGRITY)
def test_get_tmp_path(self):
    """
    Verify the path `get_tmp_path` builds for `self.file_name`.

    The resulting path must be absolute and must start with
    `self.upload_dir` (per the original notes, the directory configured
    through the `TMP_PATH` setting in `settings.py`).
    """
    resolved = get_tmp_path(self.file_name)
    # Single combined check; the prefix test is only evaluated when the
    # path is absolute.
    path_is_valid = (
        os.path.isabs(resolved) and resolved.startswith(self.upload_dir)
    )
    self.assertTrue(path_is_valid)
def _analyse(data):
    """
    Run TrendMiner on user uploaded data.

    For a `.zip` upload the Perl script `om-xml.pl` (located in
    `PERL_PATH`, which is also added to Perl's include path via `-I`) is
    executed with the upload's temporary folder as working directory,
    and the results are then read back with `parse_results`.

    For a `.xml` upload, and for any other file type, no analysis is
    run and an empty list is returned.

    Returns:
        The value returned by `parse_results` for .zip uploads,
        otherwise an empty list.
    """
    # Default result; previously an unexpected extension left `entities`
    # unbound and the final `return` raised UnboundLocalError.
    entities = []
    file_type = get_file_ext(data.name)
    if file_type == '.zip':
        # Argument list without a shell: PERL_PATH may contain spaces or
        # shell metacharacters without changing the command.
        command = ['perl', '-I', PERL_PATH, path.join(PERL_PATH, 'om-xml.pl')]
        subprocess.call(command, cwd=get_tmp_path(data.folder))
        entities = parse_results(data.folder)
    return entities
def parse_results(request_id):
    """
    Read TrendMiner results for a specific request from the
    corresponding XML output file (`om.xml`).

    Every child element of the document root becomes one dictionary
    with the keys:

    - 'attributes': texts of the `name`, `source_title` and
      `ticker_string` sub-elements, in that order,
    - 'polarity': text of the `polarity` sub-element,
    - 'polarity_range': `range(int(polarity))`.

    Returns:
        A list of these dictionaries, sorted by 'attributes' and then
        by 'polarity'.
    """
    # Read raw bytes so the XML parser can honour the encoding declared
    # in the document; the `with` block closes the file handle.
    with open(get_tmp_path(request_id, 'om.xml'), 'rb') as result_file:
        result_tree = ElementTree.fromstring(result_file.read())

    entities = []
    for entity in result_tree:
        polarity = entity.find('polarity').text
        entities.append({
            'attributes': [entity.find('name').text,
                           entity.find('source_title').text,
                           entity.find('ticker_string').text],
            'polarity': polarity,
            'polarity_range': range(int(polarity)),
        })

    def sort_key(item):
        # Dictionaries are not orderable on Python 3, so the ordering is
        # spelled out: attributes first, then polarity (the order Python 2
        # derived from comparing the dicts key by key). Missing text (None)
        # is treated as '' to keep the values mutually comparable.
        return ([text or '' for text in item['attributes']], item['polarity'])

    entities.sort(key=sort_key)
    return entities
def validate_zip_contents(uploaded_file):
    """
    If uploaded file is a .zip archive, check if all of the files it
    contains are XML files.

    Only the name of each archive entry is examined: every entry must
    end in "xml". MIME types of the entries are not checked. Files whose
    name does not end in "zip" are accepted unchecked.

    An archive that cannot be opened (`IOError` or `BadZipFile`) is
    treated as having no entries and therefore passes this check.
    NOTE(review): presumably such archives are rejected by the
    integrity / MIME type validators instead - confirm.

    Raises:
        ValidationError: with `UploadFormErrors.ZIP_CONTENTS` if any
            entry name does not end in "xml".
    """
    contents = []
    if uploaded_file.name.endswith("zip"):
        try:
            # The context manager closes the archive's file handle once
            # the entry names have been read.
            with ZipFile(get_tmp_path(uploaded_file.name)) as archive:
                contents = archive.namelist()
        except (IOError, BadZipFile):
            pass
    if any(not item.endswith("xml") for item in contents):
        raise ValidationError(UploadFormErrors.ZIP_CONTENTS)
def validate_mime_type(uploaded_file):
    """
    Check MIME type of uploaded file and make sure it corresponds to
    the file's extension.

    This function runs the UNIX `file` command with the `--mime-type`
    option on the uploaded file and takes the text after the last ": "
    of its output as the MIME type. A `.zip` upload must have a type
    listed in `ZIP_MIME_TYPES`, a `.xml` upload a type listed in
    `XML_MIME_TYPES`. Other extensions are not checked.

    Raises:
        ValidationError: with `UploadFormErrors.MIME_TYPE` formatted
            with the extension and the detected MIME type if the two do
            not correspond.
    """
    # The file name comes from the upload, so it is passed as a separate
    # argument without a shell; it can no longer be interpreted as shell
    # syntax.
    subproc = subprocess.Popen(
        ["file", "--mime-type", get_tmp_path(uploaded_file.name)],
        stdout=subprocess.PIPE
    )
    # communicate() drains and closes the pipe and reaps the child.
    output, _ = subproc.communicate()
    if isinstance(output, bytes):
        # Pipes deliver bytes; decode so the str-based parsing below works
        # on Python 3 as well. Undecodable bytes can only occur in the
        # echoed file name, which is discarded by the split.
        output = output.decode("utf-8", "replace")
    mime_type = output.strip().split(": ")[-1]

    file_extension = get_file_ext(uploaded_file.name)
    if file_extension == ".zip" and mime_type not in ZIP_MIME_TYPES:
        raise ValidationError(UploadFormErrors.MIME_TYPE.format(".zip", mime_type))
    elif file_extension == ".xml" and mime_type not in XML_MIME_TYPES:
        raise ValidationError(UploadFormErrors.MIME_TYPE.format(".xml", mime_type))
#!/usr/bin/env python2 import argparse import utils import XMLSerializer as XS from Class import FileDetails TEMP_DIR = utils.get_tmp_path() def main(args): zipfilepath = args.zip if zipfilepath is None: print "pass arguements correctly!" exit(-1) xmlfilepath = args.xmlfile zip_path = zipfilepath if utils.valid_file(zip_path) is not True: print "bad zip" exit(-1) data_for_all_files = [] path_to_extract = utils.random_temp_path(TEMP_DIR) utils.extractor(zip_path, path_to_extract) list_of_all_files = utils.getListOfFiles(path_to_extract) for path_to_file in list_of_all_files: uid = utils.get_uuid() filename = utils.stripfilepath(path_to_file) rel_path = utils.get_relative_path(path_to_file, path_to_extract) md5hash = utils.md5sum(path_to_file)