Example #1
0
def validate_against_schema(uploaded_file):
    """
    Check if XML files that are part of a single upload validate
    against the TrendMiner XML schema.

    This function uses the `xmllint` tool to check if a given XML file
    conforms to the TrendMiner XML schema. The schema is located at
    `SCHEMA_PATH`. For any file that validates against the schema, the
    xmllint tool returns 0.

    For a `.zip` upload with a `folder` set, every `.xml` file in that
    folder except `om.xml` is checked. For an `.xml` upload, the
    uploaded file itself is checked. Other uploads are not checked.

    Raises `ValidationError` as soon as one file fails validation.
    """
    # Hand the arguments to the subprocess as a list. Formatting the
    # paths into a quoted string and re-parsing it with `shlex.split()`
    # mis-parses any path containing a double quote or a backslash, and
    # file names inside an uploaded archive are user-controlled.
    xmllint = ["xmllint", "--noout", "--schema", SCHEMA_PATH]
    file_type = get_file_ext(uploaded_file.name)
    if file_type == ".zip" and uploaded_file.folder:
        for file_name in listdir(get_tmp_path(uploaded_file.folder)):
            if file_name.endswith(".xml") and file_name != "om.xml":
                xml_path = get_tmp_path(uploaded_file.folder, file_name)
                if subprocess.call(xmllint + [xml_path]) != 0:
                    raise ValidationError(UploadFormErrors.FILES_SCHEMA_CONFORMITY)
    elif file_type == ".xml":
        xml_path = get_tmp_path(uploaded_file.name)
        if subprocess.call(xmllint + [xml_path]) != 0:
            raise ValidationError(UploadFormErrors.XML_SCHEMA_CONFORMITY)
Example #2
0
def _xmlwf_output(xml_path):
    """
    Run `xmlwf` on `xml_path` and return whatever it wrote to standard
    output.
    """
    # The argument list avoids re-parsing a quoted string, which breaks
    # on paths containing a double quote or a backslash.
    subproc = subprocess.Popen(["xmlwf", xml_path], stdout=subprocess.PIPE)
    # communicate() reads the output, closes the pipe and waits for the
    # child, so no zombie process is left behind.
    return subproc.communicate()[0]


def validate_xml_well_formedness(uploaded_file):
    """
    Check if XML files that are part of a single upload are
    well-formed.

    This function uses the `xmlwf` tool to determine if a given XML
    file is well-formed. The tool does not use standard return codes
    for representing the outcome of the check. Instead, if a file is
    well-formed, it simply outputs nothing. If it's not, xmlwf writes
    a description of the problem to standard output.

    For a `.zip` upload with a `folder` set, every `.xml` file in that
    folder except `om.xml` is checked. For an `.xml` upload, the
    uploaded file itself is checked. Other uploads are not checked.

    Raises `ValidationError` as soon as `xmlwf` reports a problem.
    """
    file_type = get_file_ext(uploaded_file.name)
    if file_type == ".zip" and uploaded_file.folder:
        for file_name in listdir(get_tmp_path(uploaded_file.folder)):
            if file_name.endswith(".xml") and file_name != "om.xml":
                xml_path = get_tmp_path(uploaded_file.folder, file_name)
                if _xmlwf_output(xml_path):
                    raise ValidationError(UploadFormErrors.FILES_WELLFORMEDNESS)
    elif file_type == ".xml":
        if _xmlwf_output(get_tmp_path(uploaded_file.name)):
            raise ValidationError(UploadFormErrors.XML_WELLFORMEDNESS)
Example #3
0
def validate_zip_integrity(uploaded_file):
    """
    If uploaded file is a .zip archive, check its integrity and the
    integrity of the files it contains.

    In case of a corrupted archive the `ZipFile` constructor raises
    IOError. To check the integrity of the files contained in the
    archive, the `ZipFile.testzip()` function is used.

    If the uploaded file appears to be a .zip archive (because its
    name ends in `zip`), but actually isn't, the `ZipFile`
    constructor raises `BadZipFile`. Because this case is covered by
    the MIME type validator, the function does not raise a
    ValidationError in this case.

    Raises `ValidationError` if the archive cannot be read or if
    `testzip()` reports a corrupted member.
    """
    if not uploaded_file.name.endswith("zip"):
        return
    corrupted_file = None
    try:
        # The `with` block closes the archive's file handle even when
        # `testzip()` raises.
        with ZipFile(get_tmp_path(uploaded_file.name)) as archive:
            corrupted_file = archive.testzip()
    except IOError:
        raise ValidationError(UploadFormErrors.ZIP_INTEGRITY)
    except BadZipFile:
        # Deliberately ignored; see the docstring.
        pass
    if corrupted_file:
        raise ValidationError(UploadFormErrors.FILES_INTEGRITY)
Example #4
0
    def test_get_tmp_path(self):
        """
        Verify the path that `get_tmp_path` builds for a file name.

        The returned path must be absolute and must begin with the
        upload directory held in `self.upload_dir` (the directory
        specified by the `TMP_PATH` setting in `settings.py`).
        """
        resolved = get_tmp_path(self.file_name)
        is_absolute = os.path.isabs(resolved)
        # The prefix check only runs for absolute paths.
        inside_upload_dir = is_absolute and resolved.startswith(self.upload_dir)
        self.assertTrue(inside_upload_dir)
Example #5
0
def _analyse(data):
    """
    Run TrendMiner on user uploaded data.

    For a `.zip` upload the `om-xml.pl` script from `PERL_PATH` is run
    with the upload's temporary folder as working directory, and the
    parsed results for that folder are returned. For any other upload
    an empty list is returned.
    """
    # Default result. Without it an extension other than `.zip` or
    # `.xml` left `entities` unbound and raised UnboundLocalError.
    entities = []
    file_type = get_file_ext(data.name)
    if file_type == '.zip':
        # Argument list instead of a shell string: no shell is spawned
        # and a PERL_PATH containing spaces is passed through intact.
        command = ['perl', '-I', PERL_PATH, path.join(PERL_PATH, 'om-xml.pl')]
        subprocess.call(command, cwd=get_tmp_path(data.folder))
        entities = parse_results(data.folder)
    return entities
Example #6
0
def parse_results(request_id):
    """
    Read TrendMiner results for a specific request from the
    corresponding XML output file (`om.xml`).

    Returns a sorted list with one dictionary per child element of the
    document root. Each dictionary has the keys:

    - `attributes`: text of the `name`, `source_title` and
      `ticker_string` sub-elements, in that order
    - `polarity`: text of the `polarity` sub-element
    - `polarity_range`: `range(int(polarity))`

    Raises AttributeError if an entity lacks one of these sub-elements.
    """
    # The `with` block closes the file handle once the XML is read.
    with open(get_tmp_path(request_id, 'om.xml')) as result_file:
        result_tree = ElementTree.fromstring(result_file.read())
    entities = []
    for entity in result_tree:
        attributes = [entity.find('name').text,
                      entity.find('source_title').text,
                      entity.find('ticker_string').text]
        polarity = entity.find('polarity').text
        entities.append({'attributes': attributes,
                         'polarity': polarity,
                         'polarity_range': range(int(polarity))})
    # Dictionaries cannot be ordered on Python 3, so sort on an explicit
    # key. Ordering by attributes, then polarity, matches how Python 2
    # compared these dictionaries.
    entities.sort(key=lambda item: (item['attributes'], item['polarity']))
    return entities
Example #7
0
def validate_zip_contents(uploaded_file):
    """
    If uploaded file is a .zip archive, check if all of the files it
    contains are XML files.

    This function examines the name of each entry in the .zip archive
    and requires it to end in `xml`. As of right now, it does not
    check MIME types.

    An archive that cannot be opened (`IOError` or `BadZipFile`) is
    treated as having no entries and does not fail this check.

    Raises `ValidationError` if any entry name does not end in `xml`.
    """
    if not uploaded_file.name.endswith("zip"):
        return
    contents = []
    try:
        # The `with` block closes the archive's file handle after the
        # entry names are read.
        with ZipFile(get_tmp_path(uploaded_file.name)) as archive:
            contents = archive.namelist()
    except (IOError, BadZipFile):
        # Deliberately ignored: an unreadable archive is not a
        # contents error.
        pass
    if any(not item.endswith("xml") for item in contents):
        raise ValidationError(UploadFormErrors.ZIP_CONTENTS)
Example #8
0
def validate_mime_type(uploaded_file):
    """
    Check MIME type of uploaded file and make sure it corresponds to
    the file's extension.

    This function uses the UNIX `file` command with the `--mime-type`
    option to obtain the MIME type of the uploaded file. It then
    checks to see if the MIME type corresponds to one of the types
    appropriate for the file's extension (`ZIP_MIME_TYPES` for `.zip`,
    `XML_MIME_TYPES` for `.xml`). Other extensions are not checked.

    Raises `ValidationError` if the detected MIME type is not allowed
    for the file's extension.
    """
    # The path is derived from a user-supplied file name, so it must
    # never be interpolated into a shell command line. Passing an
    # argument list without `shell=True` prevents shell injection.
    subproc = subprocess.Popen(
        ["file", "--mime-type", get_tmp_path(uploaded_file.name)],
        stdout=subprocess.PIPE,
        # Text output, so the string split below also works on Python 3.
        universal_newlines=True,
    )
    # communicate() reads the output, closes the pipe and waits for the
    # child process.
    output = subproc.communicate()[0]
    # `file` prints "<path>: <mime type>"; keep the part after the last
    # ": ".
    mime_type = output.strip().split(": ")[-1]
    file_extension = get_file_ext(uploaded_file.name)
    if file_extension == ".zip" and mime_type not in ZIP_MIME_TYPES:
        raise ValidationError(UploadFormErrors.MIME_TYPE.format(".zip", mime_type))
    elif file_extension == ".xml" and mime_type not in XML_MIME_TYPES:
        raise ValidationError(UploadFormErrors.MIME_TYPE.format(".xml", mime_type))
Example #9
0
#!/usr/bin/env python2

import argparse
import utils
import XMLSerializer as XS

from Class import FileDetails


TEMP_DIR = utils.get_tmp_path()

def main(args):
    zipfilepath = args.zip
    if zipfilepath is None:
        print "pass arguements correctly!"
        exit(-1)
    xmlfilepath = args.xmlfile
    zip_path = zipfilepath
    if utils.valid_file(zip_path) is not True:
        print "bad zip"
        exit(-1)
    data_for_all_files = []
    path_to_extract = utils.random_temp_path(TEMP_DIR)
    utils.extractor(zip_path, path_to_extract)
    list_of_all_files = utils.getListOfFiles(path_to_extract)
    
    for path_to_file in list_of_all_files:
        uid = utils.get_uuid()
        filename = utils.stripfilepath(path_to_file)
        rel_path = utils.get_relative_path(path_to_file, path_to_extract)
        md5hash = utils.md5sum(path_to_file)