def main(self):
    """
    Main method for base ArffConverter class.

    Calls create_data_frame() and collect_comments(), then writes the
    @RELATION line, the lines returned by convert_header(), the @DATA
    marker and every row yielded by output_rows() to ``self.output_file``.
    The output file is closed once writing finishes, including when a
    write raises. If ``self.validate`` is truthy, an ArffValidator is then
    run against the input and output files.

    Several helper methods must be defined by child classes.
    """
    self.create_data_frame()
    self.collect_comments()

    try:
        self.output_file.write('@RELATION {} \n\n'.format(
            quote_if_space(self.relation)))
        self.output_file.writelines(self.convert_header())
        self.output_file.write('\n@DATA\n')
        self.output_file.writelines(self.output_rows())
    finally:
        # Always release the handle, even when a write or a row
        # conversion fails part-way through.
        self.output_file.close()

    if self.validate:
        # NOTE(review): the validator receives the already-closed
        # output_file object -- confirm ArffValidator reopens it itself.
        validator = ArffValidator(input_file=self.input_file,
                                  arff_file=self.output_file)
        validator.validate()
def test_quote_if_space(self):
    """Values containing a space come back double-quoted; others unchanged."""
    expected_outcomes = {
        'abcd': 'abcd',
        ' defg': '" defg"',
        'hijk ': '"hijk "',
        'lm no': '"lm no"',
    }
    for raw_value, wanted in expected_outcomes.items():
        self.assertEqual(utils.quote_if_space(raw_value), wanted)
def compare_values(line, arff_line):
    """
    Compares entries in lines from the input & output files and raises a
    ValidationError on mismatch

    Two rules are checked for each input entry against the output entry at
    the same position:

    * an entry containing a space must appear in the output exactly as
      quote_if_space() renders it;
    * an entry found in ARFF_FIELD_MAPS['none'] must appear in the output
      as '?'.

    :param line: line from input file split into a list of entries
    :param arff_line: line from output file split into a list of entries
    :raises ValidationError: if an entry breaks one of the rules above, or
        if arff_line has fewer entries than line
    """
    # TODO: find a structure for these rules that does not grow into one
    # big if/elif chain as more special cases are added.
    msg = 'Line mismatch between input:\n{}\nand ARFF output:\n{}'.format(
        line, arff_line)

    # A truncated output row is a validation failure, not an IndexError.
    if len(arff_line) < len(line):
        raise ValidationError(msg)

    for entry, arff_entry in zip(line, arff_line):
        # Covers both "should match, but doesn't" and "shouldn't match,
        # but does": whether or not the raw values are equal, an entry
        # with a space is only valid in its quoted form.
        if ' ' in entry and quote_if_space(entry) != arff_entry:
            raise ValidationError(msg)

        # Missing-value markers must have been converted to '?'.
        if entry in ARFF_FIELD_MAPS['none'] and arff_entry != '?':
            raise ValidationError(msg)

        # NOTE(review): entries that differ but contain no space and are
        # not missing-value markers are accepted as-is -- presumably to
        # tolerate reformatted values; confirm this is intended.
def test_arff_data(self):
    """
    Checks each input data row against the matching row after @DATA in
    the ARFF output.

    The first input line is skipped. Every remaining input line is split
    on commas and passed through utils.quote_if_space() before being
    compared with the comma-split ARFF row. Rows whose ARFF entries
    include '?' are not compared yet.
    """
    # Skip the first input line. The builtin next() works on Python 2.6+
    # and Python 3, whereas the file.next() method exists only on Python 2.
    next(self.input_file)

    # Advance the ARFF file to just past the @DATA marker. readline()
    # returns '' at end of file, so stop there instead of looping forever
    # when the marker is missing.
    for arff_line in iter(self.output_file.readline, ''):
        if arff_line.startswith('@DATA'):
            break
    else:
        self.fail('No @DATA section found in ARFF output')

    for csv_line in self.input_file:
        expected = [utils.quote_if_space(item)
                    for item in csv_line.split(',')]
        actual = self.output_file.readline().split(',')
        if '?' in actual:
            # TODO: Implement special case testing
            continue
        self.assertEqual(expected, actual)
def convert_header(self):
    """
    Builds the ARFF attribute declarations for the data frame's columns.

    For each column, the string form of its dtype is passed to
    map_data_types() together with the column name to obtain the ARFF type.

    :return: list of newline-terminated '@ATTRIBUTE <name> <type>' lines,
        one per column, in column order
    """
    def attribute_line(name):
        # Resolve the ARFF type before formatting, as the name is only
        # quoted at the point the line is assembled.
        dtype_name = str(self.data_frame[name].dtype)
        arff_type = self.map_data_types(dtype_name, name)
        return '@ATTRIBUTE {} {}\n'.format(quote_if_space(name), arff_type)

    return [attribute_line(name) for name in self.data_frame.columns]