def clean_file(self, *args, **kwargs):
    '''Converts the uploaded file to the arff file format, if possible.

    csv, txt and tab files are parsed as csv files with comma, comma and
    tab delimiter respectively. arff files are parsed and valid headers
    are recreated for them. zip files are checked to be valid and contain
    a single file; the file is extracted and handled as other
    uncompressed types.

    Returns a Django ``InMemoryUploadedFile`` holding the generated
    ".arff" content, or the unchanged (empty) field value when editing an
    instance that already has a stored file.

    Raises ``forms.ValidationError`` when no file is available at all, or
    when the extension is not one of zip/csv/txt/tab/arff.
    '''
    input_file = self.cleaned_data.get('file')
    if not input_file:
        if self.instance and self.instance.file:
            # Editing an existing instance without uploading a
            # replacement: keep the stored file untouched.
            return input_file
        else:
            raise forms.ValidationError(_("This field is required"))
    else:
        if self.instance:
            # A fresh upload resets the creation timestamp.
            self.instance.created = datetime.now()
    title = self.cleaned_data.get('title', "-")
    # determine file name and extension
    name_parts = input_file.name.split(".")
    extension = name_parts[-1]
    col_names = []
    if extension == "zip":
        # NOTE(review): extract_file is defined elsewhere; presumably it
        # validates the archive and returns its single member -- confirm.
        uncompressed_file = self.extract_file(input_file)
        input_file = uncompressed_file
        # Re-derive the extension from the extracted member's name so it
        # is dispatched like a plain upload below.
        name_parts = input_file.name.split(".")
        extension = name_parts[-1]
    if extension == 'csv' or extension == 'txt':
        reader_file = input_file
        csv_reader = csv.reader(reader_file, delimiter=',', quotechar='"')
    elif extension == "tab":
        reader_file = input_file
        csv_reader = csv.reader(reader_file, delimiter='\t', quotechar='"')
    elif extension == "arff":
        # read arff data section and recreate header,
        # thus we obtain a valid header
        tmp = tempfile.NamedTemporaryFile()
        data_sec = False
        for row in input_file:
            if not row.startswith("%"):  # skip arff comment lines
                if data_sec:
                    # Past "@data": copy raw data rows to the temp file,
                    # which is later re-read as plain csv.
                    tmp.write(row)
                else:
                    row_std = row.strip().lower()
                    if row_std.startswith("@data"):
                        data_sec = True
                    elif row_std.startswith("@attribute"):
                        # Second whitespace-separated token is the
                        # attribute name. NOTE(review): breaks for quoted
                        # names containing spaces -- confirm inputs.
                        col_names.append(row.split()[1]);
        tmp.seek(0)
        reader_file = tmp
        csv_reader = csv.reader(reader_file, delimiter=',', quotechar='"')
    else:
        raise forms.ValidationError(_('File type is not supported. '
                                      'Please select a tab, csv, txt, '
                                      'arff or zip file.'))
    # parse file field as a number when possible
    content = []
    for in_row in csv_reader:
        row = []
        for in_col in in_row:
            col = in_col
            # Prefer int, fall back to float, keep the raw string
            # when neither conversion succeeds.
            try:
                col = int(in_col)
            except ValueError:
                try:
                    col = float(in_col)
                except ValueError:
                    pass
            row.append(col)
        content.append(row)
    reader_file.close()
    # save content to a temporary file
    # in order to process by arff function
    f = tempfile.NamedTemporaryFile()
    if col_names:
        # Header names recovered from an uploaded arff file.
        arff.dump(f.name, content, names=col_names, relation=title)
    else:
        arff.dump(f.name, content, relation=title)
    f.seek(0)
    # transfer resulting arff file to memory
    # in order to return to django
    buff = StringIO.StringIO(f.read())  # Python 2 StringIO
    f.close()
    # buff.tell() is the size in bytes (buffer is positioned at its end
    # right after the read above).
    arff_file = InMemoryUploadedFile(buff, 'file',
                                     slugify(unicode(title)) + ".arff",
                                     None, buff.tell(), None)
    return arff_file
def clean_file(self, *args, **kwargs):
    '''Converts the uploaded file to the arff file format, if possible.

    csv, txt and tab files are parsed as csv files with comma, comma and
    tab delimiter respectively. arff files are parsed and valid headers
    are recreated for them. zip files are checked to be valid and contain
    a single file; the file is extracted and handled as other
    uncompressed types.

    Returns a Django ``InMemoryUploadedFile`` holding the generated
    ".arff" content, or the unchanged (empty) field value when editing an
    instance that already has a stored file.

    Raises ``forms.ValidationError`` when no file is available at all, or
    when the extension is not one of zip/csv/txt/tab/arff.

    NOTE(review): this is a second, near-identical definition of
    clean_file in the same scope; at runtime it shadows the earlier one.
    Looks like a merge/duplication artifact -- confirm and keep only one.
    '''
    input_file = self.cleaned_data.get('file')
    if not input_file:
        if self.instance and self.instance.file:
            # Editing an existing instance without uploading a
            # replacement: keep the stored file untouched.
            return input_file
        else:
            raise forms.ValidationError(_("This field is required"))
    else:
        if self.instance:
            # A fresh upload resets the creation timestamp.
            self.instance.created = datetime.now()
    title = self.cleaned_data.get('title', "-")
    # determine file name and extension
    name_parts = input_file.name.split(".")
    extension = name_parts[-1]
    col_names = []
    if extension == "zip":
        # NOTE(review): extract_file is defined elsewhere; presumably it
        # validates the archive and returns its single member -- confirm.
        uncompressed_file = self.extract_file(input_file)
        input_file = uncompressed_file
        # Re-derive the extension from the extracted member's name so it
        # is dispatched like a plain upload below.
        name_parts = input_file.name.split(".")
        extension = name_parts[-1]
    if extension == 'csv' or extension == 'txt':
        reader_file = input_file
        csv_reader = csv.reader(reader_file, delimiter=',', quotechar='"')
    elif extension == "tab":
        reader_file = input_file
        csv_reader = csv.reader(reader_file, delimiter='\t', quotechar='"')
    elif extension == "arff":
        # read arff data section and recreate header,
        # thus we obtain a valid header
        tmp = tempfile.NamedTemporaryFile()
        data_sec = False
        for row in input_file:
            if not row.startswith("%"):  # skip arff comment lines
                if data_sec:
                    # Past "@data": copy raw data rows to the temp file,
                    # which is later re-read as plain csv.
                    tmp.write(row)
                else:
                    row_std = row.strip().lower()
                    if row_std.startswith("@data"):
                        data_sec = True
                    elif row_std.startswith("@attribute"):
                        # Second whitespace-separated token is the
                        # attribute name. NOTE(review): breaks for quoted
                        # names containing spaces -- confirm inputs.
                        col_names.append(row.split()[1])
        tmp.seek(0)
        reader_file = tmp
        csv_reader = csv.reader(reader_file, delimiter=',', quotechar='"')
    else:
        raise forms.ValidationError(
            _('File type is not supported. Please select a tab, csv, txt, arff or zip file.'
              ))
    # parse file field as a number when possible
    content = []
    for in_row in csv_reader:
        row = []
        for in_col in in_row:
            col = in_col
            # Prefer int, fall back to float, keep the raw string
            # when neither conversion succeeds.
            try:
                col = int(in_col)
            except ValueError:
                try:
                    col = float(in_col)
                except ValueError:
                    pass
            row.append(col)
        content.append(row)
    reader_file.close()
    # save content to a temporary file
    # in order to process by arff function
    f = tempfile.NamedTemporaryFile()
    if col_names:
        # Header names recovered from an uploaded arff file.
        arff.dump(f.name, content, names=col_names, relation=title)
    else:
        arff.dump(f.name, content, relation=title)
    f.seek(0)
    # transfer resulting arff file to memory
    # in order to return to django
    buff = StringIO.StringIO(f.read())  # Python 2 StringIO
    f.close()
    # buff.tell() is the size in bytes (buffer is positioned at its end
    # right after the read above).
    arff_file = InMemoryUploadedFile(buff, 'file',
                                     slugify(unicode(title)) + ".arff",
                                     None, buff.tell(), None)
    return arff_file
def read_classified_data(file_url, x, y, clsCol):
    '''Read an arff file under BUILDOUT_DIR/var/www and group its data
    rows by class for plotting.

    file_url -- path of the file relative to BUILDOUT_DIR + '/var/www'
    x, y     -- column indices to plot; when None, the first two
                non-string attributes found in the header are used
    clsCol   -- index of the class column; when None, the attribute named
                "class" is used if present, otherwise the last column

    Returns a tuple
        (error, attributes, plot_dict, x, y, clsCol)
    where error is None or a translated message, attributes is a list of
    [name, type] pairs from the header, and plot_dict carries the grouped
    points plus min/max of x, y and the class column.

    String and integer class columns are grouped by exact value during
    the first pass; other (numeric) class columns are bucketed into up to
    max_classes equal-width ranges during a second pass, once min/max are
    known.
    '''
    f = open(BUILDOUT_DIR + '/var/www' + file_url)
    result = OrderedDict()
    # Running extremes of the plotted columns and of the class column.
    minX = None; maxX = None
    minY = None; maxY = None
    minCls = None; maxCls = None
    clsType = None      # arff type of the class column, from the header
    data_sec = False    # True once "@data" has been seen
    arff_cls = None     # class attribute number
    attributes = []     # [name, type] per "@attribute" header line
    max_classes = 120   # cap on distinct classes / number of buckets
    error = None
    # first read
    for row in f:
        if data_sec:
            # analyse data portion of the file
            cells = row.rstrip().split(",")
            if minX is None or float(cells[x]) < minX:
                minX = float(cells[x])
            if maxX is None or float(cells[x]) > maxX:
                maxX = float(cells[x])
            if minY is None or float(cells[y]) < minY:
                minY = float(cells[y])
            if maxY is None or float(cells[y]) > maxY:
                maxY = float(cells[y])
            if clsType != "string":
                # Track numeric class extremes, preferring int over
                # float when the cell parses as one.
                if minCls is None or float(cells[clsCol]) < minCls:
                    try:
                        minCls = int(cells[clsCol])
                    except ValueError:
                        minCls = float(cells[clsCol])
                if maxCls is None or float(cells[clsCol]) > maxCls:
                    try:
                        maxCls = int(cells[clsCol])
                    except ValueError:
                        maxCls = float(cells[clsCol])
            if not (clsType == "string" or clsType == "integer"):
                continue
            # try to classify only if the column is string
            # other types are classified during second read when min/max
            # are known
            cls = cells[clsCol]
            if not cls in result:
                if len(result.keys()) >= max_classes:
                    error = _('More than <b>{0}</b> classes found in the class '
                              'attribute <b>"{1}"</b>. Please select another class '
                              'attribute.').format(max_classes,
                                                   attributes[clsCol][0])
                    break
                else:
                    result[cls] = []
            result[cls].append([cells[x], cells[y]])
        else:
            # analyse file header
            row_std = row.strip().lower()
            if row_std.startswith("@data"):
                data_sec = True
                if clsCol is None:
                    if arff_cls is not None:
                        # use arff class attribute, if defined
                        clsCol = arff_cls
                    else:
                        # otherwise, use last column
                        if len(attributes) > 0:
                            clsCol = len(attributes) - 1
                if clsCol is not None:
                    clsType = attributes[clsCol][1]
                if x is None or y is None or clsCol is None:
                    # Defaults could not be derived from the header
                    # (e.g. all attributes are strings).
                    error = _("Please specify columns for rendering, as default choices could not be used.")
                    break
            elif row_std.startswith("@attribute"):
                parts = row.split()
                col_name = parts[1]
                col_type = parts[2]
                # Auto-generated names like "attr7" are re-rendered via
                # the translatable "attr{0}" template.
                attr_no = re.findall("^attr(\d+)$", col_name)
                if attr_no:
                    attributes.append([_("attr{0}").format(attr_no[0]),
                                       col_type])
                else:
                    attributes.append([col_name, col_type])
                attr_idx = len(attributes) - 1
                # Default x/y: the first two non-string attributes.
                if x is None and col_type != "string":
                    x = attr_idx
                elif y is None and col_type != "string":
                    y = attr_idx
                if col_name == "class":
                    # save the number of the class column
                    arff_cls = attr_idx
    f.close()
    if not error and clsType != "string" and clsType != "integer":
        # second read: numeric (non-integer) class column -- bucket the
        # rows into max_classes equal-width ranges labelled "lo - hi".
        f = open(BUILDOUT_DIR + '/var/www' + file_url)
        # NOTE(review): strip_arff_header is defined elsewhere;
        # presumably it returns an iterable of the data rows only.
        f = strip_arff_header(f)
        step = 1. * (maxCls - minCls) / max_classes
        groups = [str(t) + " - " + str(t + step)
                  for t in arange(minCls, maxCls, step)]
        for row in f:
            cells = row.rstrip().split(",")
            val = float(cells[clsCol])
            group_no = int(floor((1.0 * (val - minCls) * max_classes) /
                                 (maxCls - minCls)))
            if group_no == len(groups):
                # val == maxCls lands one past the end; clamp into the
                # last bucket.
                group_no -= 1
            cls = groups[group_no]
            if not cls in result:
                result[cls] = []
            result[cls].append([cells[x], cells[y]])
        f.close()
    # Order groups numerically by the lower bound of their label when
    # possible, otherwise lexicographically by slug.
    try:
        result = OrderedDict(sorted(result.items(),
                                    key=lambda x: float(unicode(x[0]).split(" - ")[0])))
    except ValueError:
        result = OrderedDict(sorted(result.items(),
                                    key=lambda x: slugify(unicode(x[0]))))
    result = [{"group": cls, "data": data} for cls, data in result.items()]
    return error, attributes, {"data": result,
                               "minX": minX, "maxX": maxX,
                               "minY": minY, "maxY": maxY,
                               "minCls": minCls, "maxCls": maxCls}, x, y, clsCol