Example #1
    def clean_file(self, *args, **kwargs):
        '''Converts the uploaded file to the arff file format, if possible.
        csv and txt files are parsed as comma-delimited csv data and tab
        files as tab-delimited. arff files are parsed and a valid header is
        recreated for them. zip files are checked to be valid and to contain
        a single file; that file is extracted and handled like the other
        uncompressed types.
        '''
        input_file = self.cleaned_data.get('file')
        if not input_file:
            if self.instance and self.instance.file:
                return input_file
            else:
                raise forms.ValidationError(_("This field is required"))
        else:
            if self.instance:
                self.instance.created = datetime.now()
        title = self.cleaned_data.get('title', "-")

        # determine file name and extension
        name_parts = input_file.name.split(".")
        extension = name_parts[-1]
        col_names = []

        if extension == "zip":
            uncompressed_file = self.extract_file(input_file)
            input_file = uncompressed_file
            name_parts = input_file.name.split(".")
            extension = name_parts[-1]

        if extension in ('csv', 'txt'):
            reader_file = input_file
            csv_reader = csv.reader(reader_file, delimiter=',', quotechar='"')
        elif extension == "tab":
            reader_file = input_file 
            csv_reader = csv.reader(reader_file, delimiter='\t', quotechar='"')
        elif extension == "arff":
            # read arff data section and recreate header,
            # thus we obtain a valid header
            tmp = tempfile.NamedTemporaryFile()
            data_sec = False
            for row in input_file:
                if not row.startswith("%"):
                    if data_sec:
                        tmp.write(row)
                    else:
                        row_std = row.strip().lower()
                        if row_std.startswith("@data"):
                            data_sec = True
                        elif row_std.startswith("@attribute"):
                            col_names.append(row.split()[1])
            tmp.seek(0)
            reader_file = tmp
            csv_reader = csv.reader(reader_file, delimiter=',', quotechar='"')
        else:
            raise forms.ValidationError(
                _('File type is not supported. Please select a tab, csv, txt, arff or zip file.'))

        # parse file field as a number when possible
        content = []
        for in_row in csv_reader:
            row = []
            for in_col in in_row:
                col = in_col
                try:
                    col = int(in_col)
                except ValueError:
                    try:
                        col = float(in_col)
                    except ValueError:
                        pass
                row.append(col)
            content.append(row)

        reader_file.close()

        # save content to a temporary file
        # in order to process by arff function
        f = tempfile.NamedTemporaryFile()
        if col_names:
            arff.dump(f.name, content, names=col_names, relation=title)
        else:
            arff.dump(f.name, content, relation=title)
        f.seek(0)

        # transfer resulting arff file to memory
        # in order to return to django
        buff = StringIO.StringIO(f.read())
        f.close()
        arff_file = InMemoryUploadedFile(buff, 'file',
                                         slugify(unicode(title)) + ".arff",
                                         None, len(buff.getvalue()), None)

        return arff_file
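
As a quick illustration of the numeric coercion performed in the parsing loop above, here is a minimal, standalone sketch: each csv cell is tried as an int, then as a float, and is kept as a string otherwise, which mirrors the int/float fallback the form applies before dumping the content to arff. The coerce_cell helper and the sample rows are illustrative only and are not part of the project code.

    import csv
    import StringIO

    def coerce_cell(value):
        # return the cell as int or float when possible, otherwise unchanged
        for cast in (int, float):
            try:
                return cast(value)
            except ValueError:
                pass
        return value

    sample = StringIO.StringIO('5.1,3.5,setosa\n7,3.2,versicolor\n')
    content = [[coerce_cell(cell) for cell in row]
               for row in csv.reader(sample, delimiter=',', quotechar='"')]
    print content   # [[5.1, 3.5, 'setosa'], [7, 3.2, 'versicolor']]
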
Example #2
def read_classified_data(file_url, x, y, clsCol):
    f = open(BUILDOUT_DIR + '/var/www' + file_url)

    result = OrderedDict()
    minX = maxX = None
    minY = maxY = None
    minCls = maxCls = None
    clsType = None
    data_sec = False
    arff_cls = None # class attribute number
    attributes = []
    max_classes = 120
    error = None

    # first read
    for row in f:
        if data_sec:
            # analyse data portion of the file
            cells = row.rstrip().split(",")
            if minX is None or float(cells[x]) < minX:
                minX = float(cells[x])
            if maxX is None or float(cells[x]) > maxX:
                maxX = float(cells[x])
            if minY is None or float(cells[y]) < minY:
                minY = float(cells[y])
            if maxY is None or float(cells[y]) > maxY:
                maxY = float(cells[y])

            if clsType != "string":
                if minCls is None or float(cells[clsCol]) < minCls:
                    try:
                        minCls = int(cells[clsCol])
                    except ValueError:
                        minCls = float(cells[clsCol])
                if maxCls is None or float(cells[clsCol]) > maxCls:
                    try:
                        maxCls = int(cells[clsCol])
                    except ValueError:
                        maxCls = float(cells[clsCol])

            if clsType not in ("string", "integer"):
                continue

            # try to classify only if the column is string
            # other types are classified during second read when min/max
            # are known
            cls = cells[clsCol]
            if cls not in result:
                if len(result.keys()) >= max_classes:
                    error = _('More than <b>{0}</b> classes found in the class '
                            'attribute <b>"{1}"</b>. Please select another class '
                            'attribute.').format(max_classes, attributes[clsCol][0])
                    break
                else:
                    result[cls] = []
            result[cls].append([cells[x], cells[y]])
        else:
            # analyse file header
            row_std = row.strip().lower()
            if row_std.startswith("@data"):
                data_sec = True

                if clsCol is None:
                    if arff_cls is not None:
                        # use arff class attribute, if defined
                        clsCol = arff_cls
                    else:
                        # otherwise, use last column
                        if len(attributes) > 0:
                            clsCol = len(attributes) - 1
                if clsCol is not None:
                    clsType = attributes[clsCol][1]

                if x is None or y is None or clsCol is None:
                    error = _("Please specify columns for rendering, as default choices could not be used.")
                    break
            elif row_std.startswith("@attribute"):
                parts = row.split()
                col_name = parts[1]
                col_type = parts[2]
                attr_no = re.findall(r"^attr(\d+)$", col_name)
                if attr_no:
                    attributes.append([_("attr{0}").format(attr_no[0]), col_type])
                else:
                    attributes.append([col_name, col_type])
                attr_idx = len(attributes) - 1
                if x is None and col_type != "string":
                    x = attr_idx
                elif y is None and col_type != "string":
                    y = attr_idx
                if col_name == "class":
                    # save the number of the class column
                    arff_cls = attr_idx
    f.close()

    if not error and clsType not in ("string", "integer"):
        # second read
        f = open(BUILDOUT_DIR + '/var/www' + file_url)
        f = strip_arff_header(f)

        step = 1. * (maxCls - minCls) / max_classes
        groups = [str(t) + " - " + str(t + step) for t in arange(minCls, maxCls, step)]
        for row in f:
            cells = row.rstrip().split(",")
            val = float(cells[clsCol])
            group_no = int(floor((1.0 * (val - minCls) * max_classes) / (maxCls - minCls)))
            if group_no == len(groups):
                group_no -= 1
            cls = groups[group_no]
            if cls not in result:
                result[cls] = []
            result[cls].append([cells[x], cells[y]])
        f.close()

    try:
        result = OrderedDict(sorted(result.items(),
                                    key=lambda item: float(unicode(item[0]).split(" - ")[0])))
    except ValueError:
        result = OrderedDict(sorted(result.items(),
                                    key=lambda item: slugify(unicode(item[0]))))
    result = [{"group": cls, "data": data} for cls, data in result.items()]
    return error, attributes, {"data": result, "minX": minX, "maxX": maxX,
                               "minY": minY, "maxY": maxY, "minCls": minCls,
                               "maxCls": maxCls}, x, y, clsCol
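
The second read above distributes a numeric class attribute over at most max_classes equal-width groups labelled "low - high". The sketch below isolates that binning rule; the group_label helper and the sample numbers are illustrative only, not part of the project code.

    from math import floor

    def group_label(val, min_cls, max_cls, n_groups):
        # map val into one of n_groups equal-width bins between min_cls and
        # max_cls and return the bin's "low - high" label
        step = 1. * (max_cls - min_cls) / n_groups
        group_no = int(floor((1.0 * (val - min_cls) * n_groups) / (max_cls - min_cls)))
        if group_no == n_groups:
            # the maximum value lands on the upper edge; keep it in the last bin
            group_no -= 1
        low = min_cls + group_no * step
        return str(low) + " - " + str(low + step)

    print group_label(7.2, 0.0, 10.0, 5)    # 6.0 - 8.0
    print group_label(10.0, 0.0, 10.0, 5)   # 8.0 - 10.0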