def __iadd__(self, record: MafRecord) -> 'MafWriter':
    """Validate ``record`` and emit it through this writer.

    When no scheme has been set yet, one is derived from the record's keys
    and the column-name line is written to the handle first.  The record is
    then handed to the sorter when one is set, otherwise it is written
    straight to the handle.

    :param record: the record to write.
    :return: this writer, so that ``writer += record`` rebinds to it.
    """
    if not self._scheme:
        # First record without a scheme: build one from the record's keys
        # and write the column names before any record data.
        names = [str(name) for name in record.keys()]
        self._scheme = NoRestrictionsScheme(column_names=names)
        column_line = MafRecord.ColumnSeparator.join(
            self._scheme.column_names())
        self._handle.write(column_line + "\n")
        self._set_checker_and_sorter()

    # Validate against the (possibly just created) scheme, discarding any
    # errors recorded on the record previously.
    record.validate(
        validation_stringency=self.validation_stringency,
        logger=self._logger,
        reset_errors=True,
        scheme=self._scheme,
    )

    if not self._sorter:
        # No sorter: the record goes out immediately.
        self._handle.write(str(record) + "\n")
        return self

    self._sorter += record  # type: ignore
    return self
def test_str(self):
    # str(record) should be the column values joined by the separator.
    record = MafRecord()
    for key, value in (("key1", "value1"), ("key2", "value2")):
        record.add(MafColumnRecord(key, value))

    errors = record.validate()
    self.assertEqual(len(errors), 0)

    rendered = str(record)
    expected = MafRecord.ColumnSeparator.join(["value1", "value2"])
    self.assertEqual(rendered, expected)
def test_delitem_various_columns(self):
    # The aim is to make sure that when there are columns that have None
    # values, they are removed when the next column is removed.
    record = MafRecord()
    initial_columns = (
        (0, "key0", "value0"),
        (1, "key1", "value1"),
        (4, "key4", "value4"),
    )
    for index, key, value in initial_columns:
        record[index] = MafColumnRecord(key, value, column_index=index)

    def assert_state(num_columns, num_errors):
        # Re-validate, then check the record length and error count.
        record.validate()
        self.assertEqual(len(record), num_columns)
        self.assertEqual(len(record.validation_errors), num_errors)

    def assert_only_no_value_errors():
        # Every recorded error must be of the single "no value" type.
        kinds = list({error.tpe for error in record.validation_errors})
        self.assertEqual(len(kinds), 1)  # only one error
        self.assertEqual(
            kinds[0], MafValidationErrorType.RECORD_COLUMN_WITH_NO_VALUE)

    assert_state(5, 2)  # two missing columns: 2 & 3
    assert_only_no_value_errors()

    del record[1]
    assert_state(5, 3)  # three missing columns: 1, 2, & 3
    assert_only_no_value_errors()

    del record[4]
    assert_state(1, 0)  # all good

    del record[0]
    assert_state(0, 0)  # all good
def test_add(self):
    # add() returns the record itself and stores the column at index 0.
    record = MafRecord()
    column = MafColumnRecord("key1", "value1", column_index=0)

    returned = record.add(column)
    self.assertEqual(returned, record)

    self.assertEqual(len(record), 1)
    errors = record.validate()
    self.assertEqual(len(errors), 0)
    self.assertEqual(record[0], column)
def _generate_overlaps(test_input_scheme, line_list, callers):
    """Parse ``line_list`` into groups of records wrapped in an OverlapSet.

    Each entry of ``line_list`` produces one group: a list entry is parsed
    item by item, any other truthy entry is parsed into a one-record group,
    and a falsy entry produces an empty group.  Every line is parsed with
    ``test_input_scheme`` under strict validation.
    """
    def parse(text):
        return MafRecord.from_line(
            text,
            scheme=test_input_scheme,
            validation_stringency=ValidationStringency.Strict,
        )

    groups = []
    for entry in line_list:
        if isinstance(entry, list):
            texts = entry
        elif entry:
            texts = [entry]
        else:
            texts = []
        groups.append([parse(text) for text in texts])
    return OverlapSet(groups, callers)
def test_from_line_neither_column_names_nor_scheme(self):
    # from_line() is given neither column names nor a scheme: ValueError.
    line = MafRecord.ColumnSeparator.join(["value1", "value2", "value3"])
    with self.assertRaises(ValueError):
        MafRecord.from_line(
            line=line,
            validation_stringency=ValidationStringency.Silent,
        )
def test_iadd(self):
    # ``record += column`` should leave the name bound to an equal record
    # that now holds the column at index 0.
    record = MafRecord()
    before = record
    column = MafColumnRecord("key1", "value1", column_index=0)

    record += column

    self.assertEqual(record, before)
    self.assertEqual(len(record), 1)
    errors = record.validate()
    self.assertEqual(len(errors), 0)
    self.assertEqual(record[0], column)
def test_setitem_in_order_and_delitem_in_reverse(self):
    record = MafRecord()
    total = 10

    def assert_holds_first(count):
        # The record must hold exactly columns 0..count-1, in order, and
        # validate cleanly.
        self.assertEqual(len(record), count)
        self.assertEqual(len(record.validate()), 0)
        self.assertListEqual(
            list(record.keys()),
            ["key%d" % j for j in range(0, count)],
        )
        self.assertListEqual(
            [entry.value for entry in record.values()],
            ["value%d" % j for j in range(0, count)],
        )

    # add the columns in order
    for i in range(total):
        name = "key%d" % i
        record[name] = MafColumnRecord(name, "value%d" % i)
    assert_holds_first(total)

    # delete in the reverse order, alternating between index and key access
    keys = list(record.keys())
    for i in reversed(range(total)):
        column = record[i]
        self.assertIn(column, record)
        if i % 2:
            del record[keys[i]]
        else:
            del record[i]
        self.assertNotIn(column, record)
        assert_holds_first(i)
def test_setitem_mismatching_column_indexes(self):
    # Assigning a column whose column_index (1) differs from that of the
    # column used as the key (0) must raise ValueError.
    column_old = MafColumnRecord("key1", "value1", column_index=0)
    column_new = MafColumnRecord("key1", "value1", column_index=1)

    record = MafRecord()
    record[column_old.key] = column_old

    with self.assertRaises(ValueError):
        record[column_old] = column_new
def test_access_with_different_types(self):
    record = MafRecord()
    column = MafColumnRecord(key="key1", value="value2", column_index=0)

    # Each accessor produces the key used for one round trip.  They are
    # callables so the key is re-read from the column at every use.
    key_accessors = (
        lambda: column.column_index,  # via int
        lambda: column,               # via MafColumnRecord
        lambda: column.key,           # via str
    )

    for current_key in key_accessors:
        record[current_key()] = column
        self.assertIn(current_key(), record)
        self.assertEqual(record[current_key()], column)
        del record[current_key()]
        self.assertEqual(len(record), 0)
        self.assertNotIn(current_key(), record)
def decode(self, data, start, length):
    """Decodes the data and re-parses the text, returning a MafRecord

    The ``length`` items of ``data`` beginning at ``start`` are decoded as
    UTF-8 and parsed with this object's column names, scheme and validation
    stringency.
    """
    stop = start + length
    raw = data[start:stop]
    text = raw.decode('utf-8')
    return MafRecord.from_line(
        line=text,
        column_names=self._column_names,
        scheme=self._scheme,
        validation_stringency=self.validation_stringency,
    )
def test_setitem_with_wrong_type(self):
    # Unsupported key types and non-column values must raise TypeError.
    record = MafRecord()

    # The value is built lazily so that it is still constructed inside the
    # assertRaises block.
    bad_assignments = (
        (42.42, lambda: MafColumnRecord("key", "value")),  # float key
        ("key2", lambda: 42.42),                           # float value
        (None, lambda: MafColumnRecord("key", "value")),   # None key
    )
    for key, make_value in bad_assignments:
        with self.assertRaises(TypeError):
            record[key] = make_value()
def test_getitem_with_missing_key(self):
    # Lookups on an empty record must raise KeyError for every key type.
    # The looked-up value is deliberately discarded: only the exception
    # matters, so nothing is bound to a named local.
    record = MafRecord()

    # via int
    with self.assertRaises(KeyError):
        _ = record[0]

    # via MafColumnRecord
    with self.assertRaises(KeyError):
        _ = record[MafColumnRecord(key="key1", value="value2")]

    # via str
    with self.assertRaises(KeyError):
        _ = record["key"]
def test_getitem_with_column_index_out_of_range(self):
    # Integer lookups outside the populated range must raise KeyError.
    # The looked-up value is deliberately discarded: only the exception
    # matters, so nothing is bound to a named local.
    record = MafRecord()

    # empty record: index 0 does not exist yet
    with self.assertRaises(KeyError):
        _ = record[0]

    record[0] = MafColumnRecord(key="key1", value="value2")

    # negative index
    with self.assertRaises(KeyError):
        _ = record[-1]

    # index past the last column
    with self.assertRaises(KeyError):
        _ = record[2]
def decode(self, data: bytes, start: int, length: int) -> MafRecord:
    """Decodes the data and re-parses the text, returning a MafRecord

    :param data: the buffer holding the encoded record.
    :param start: offset of the record's first byte within ``data``.
    :param length: number of bytes the record occupies.
    :return: the record parsed from the UTF-8 text of that byte range, using
        this object's column names, scheme and validation stringency.
    """
    encoded = data[start:start + length]
    text = encoded.decode('utf-8')
    parse_options = dict(
        column_names=self._column_names,
        scheme=self._scheme,
        validation_stringency=self.validation_stringency,
    )
    return MafRecord.from_line(line=text, **parse_options)
def test_from_line_valid(self):
    # Three values with three matching column names parse without errors.
    column_names = ["key1", "key2", "key3"]
    values = ["value1", "value2", "value3"]
    line = MafRecord.ColumnSeparator.join(values)

    record = MafRecord.from_line(
        line=line,
        column_names=column_names,
        validation_stringency=ValidationStringency.Silent,
    )

    self.assertEqual(len(record), 3)
    self.assertEqual(len(record.validation_errors), 0)
    parsed_names = list(record.keys())
    self.assertListEqual(parsed_names, column_names)
    self.assertListEqual(record.column_values(), values)
def test_from_line_mismatch_number_of_columns(self):
    # Three values against two column names: an empty record carrying a
    # single column-count mismatch error.
    line = MafRecord.ColumnSeparator.join(["value1", "value2", "value3"])
    record = MafRecord.from_line(
        line=line,
        column_names=["key1", "key2"],
        validation_stringency=ValidationStringency.Silent,
    )

    self.assertEqual(len(record), 0)
    errors = record.validation_errors
    self.assertEqual(len(errors), 1)
    self.assertEqual(
        record.validation_errors[0].tpe,
        MafValidationErrorType.RECORD_MISMATCH_NUMBER_OF_COLUMNS,
    )
def test_with_scheme_diff_num_columns(self):
    # Build a record that is one column short of the scheme and expect a
    # single column-count mismatch error from validation.
    scheme = TestMafRecord.TestScheme()
    record = MafRecord()

    # every scheme column except the last
    column_names = scheme.column_names()[:-1]
    column_values = ["string1", "3.14"]

    for column_index, column_name in enumerate(column_names):
        builder = scheme.column_class(column_name)
        record[column_name] = builder.build(
            name=column_name,
            value=column_values[column_index],
            column_index=column_index,
        )

    record.validate(scheme=scheme)

    self.assertEqual(len(scheme.column_names()), 3)
    self.assertEqual(len(record), 2)
    self.assertEqual(len(record.validation_errors), 1)
    error_types = [e.tpe for e in record.validation_errors]
    self.assertListEqual(
        error_types,
        [MafValidationErrorType.RECORD_MISMATCH_NUMBER_OF_COLUMNS],
    )
def test_record_validation_error(self):
    # Writes the same record twice under Lenient stringency; the record's
    # second value ("error") is expected to be written out as "None" and
    # the listed error types are expected on stderr.
    import os  # function-scope import: needed only for temp-file cleanup

    scheme = TestMafWriter.TestScheme()
    fd, path = tempfile.mkstemp()
    # mkstemp() returns an already-open descriptor.  Only the path is used
    # below, so close the descriptor right away rather than leaking it.
    os.close(fd)

    try:
        # Create the header
        header_lines = (
            MafHeader.scheme_header_lines(scheme)
            + ["#key1 value1", "#key2 value2"]
            + ["str1\tNone\tstr2"]
        )
        header = MafHeader.from_lines(
            lines=header_lines,
            validation_stringency=ValidationStringency.Silent,
        )

        # Create the record
        values = ["string2", "error", "string1"]
        record_line = MafRecord.ColumnSeparator.join(values)
        record = MafRecord.from_line(
            line=record_line,
            scheme=scheme,
            line_number=1,
            validation_stringency=ValidationStringency.Silent,
        )

        # Write the header, and the record twice
        with captured_output() as (stdout, stderr):
            writer = MafWriter.from_path(
                header=header,
                validation_stringency=ValidationStringency.Lenient,
                path=path,
            )
            writer += record
            writer.write(record)
            writer.close()
        stdout_lines = stdout.getvalue().rstrip('\r\n').split("\n")
        stderr_lines = stderr.getvalue().rstrip('\r\n').split("\n")

        self.assertListEqual(stdout_lines, [''])

        # The errors that should be written stderr
        errors = [
            "HEADER_UNSUPPORTED_VERSION",
            "HEADER_UNSUPPORTED_ANNOTATION_SPEC",
            "RECORD_COLUMN_WITH_NO_VALUE",
            "RECORD_COLUMN_WITH_NO_VALUE",
        ]
        self.assertListEqualAndIn(errors, stderr_lines)

        # The second column should be None
        err_record_line = record_line.replace("error", "None")
        self.assertListEqual(
            read_lines(path),
            header_lines + [err_record_line, err_record_line],
        )
    finally:
        # Remove the temp file whether or not the assertions passed.
        os.remove(path)
def test_setitem_and_delitem_every_other(self):
    record = MafRecord()

    # add every other
    for index in range(0, 10, 2):
        name = "key%d" % index
        record[name] = MafColumnRecord(
            name, "value%d" % index, column_index=index)
    self.assertEqual(len(record), 9)

    validation_errors = record.validate()
    self.assertEqual(len(validation_errors), 4)
    observed_types = [error.tpe for error in validation_errors]
    self.assertListEqual(
        observed_types,
        # records 1, 3, 5, 7
        [MafValidationErrorType.RECORD_COLUMN_WITH_NO_VALUE] * 4,
    )

    def expected_slots(template):
        # Even slots hold the formatted text, odd slots hold None.
        return [template % i if (i % 2 == 0) else None
                for i in range(0, 9)]

    self.assertListEqual(list(record.keys()), expected_slots("key%d"))
    self.assertListEqual(record.column_values(), expected_slots("value%d"))
def __next__(self):
    """Gets the next ``MafRecord``.

    The pending line is parsed with this reader's scheme, line number and
    validation stringency.  Any validation errors on the parsed record are
    also appended to this reader's ``validation_errors``, after which
    ``__next_line__()`` is called before the record is returned.

    Raises a ``StopIteration`` when no more records can be read.
    """
    if self.__next_line is None:
        raise StopIteration

    parse_options = dict(
        line=self.__next_line,
        scheme=self.__scheme,  # always use the scheme
        line_number=self.__line_number,
        validation_stringency=self.validation_stringency,
    )
    record = MafRecord.from_line(**parse_options)

    for problem in record.validation_errors:
        self.validation_errors.append(problem)

    self.__next_line__()
    return record
def test_from_line_with_scheme_failed_to_build(self):
    # The second value cannot be built under the test scheme: that column
    # ends up as None and two errors are recorded.
    scheme = TestMafRecord.TestScheme()
    line = MafRecord.ColumnSeparator.join(["string1", "string2", "string3"])

    record = MafRecord.from_line(
        line=line,
        scheme=scheme,
        validation_stringency=ValidationStringency.Silent,
    )

    self.assertEqual(len(record), 3)
    self.assertListEqual(
        record.column_values(), ["string1", None, "string3"])

    expected_types = [
        MafValidationErrorType.RECORD_INVALID_COLUMN_VALUE,
        MafValidationErrorType.RECORD_COLUMN_WITH_NO_VALUE,
    ]
    self.assertEqual(len(record.validation_errors), 2)
    self.assertListEqual(
        [e.tpe for e in record.validation_errors], expected_types)
def test_with_fasta_index(self):
    # Runs the "sort" subcommand on the test MAF and checks the record
    # count, the sort-order pragma and the relative order of chr13 / chr1.
    #
    # NOTE(review): the FASTA index temp file is created below but its path
    # (``fn``) is never passed to the subcommand -- confirm whether an
    # argument is missing from ``subcommand_args``.

    # change the order of chromosomes!
    fasta_index_lines = [
        "chr13\t114364328\t2106716512\t70\t71",
        "chr1\t248956422\t112\t70\t71",
    ]
    fd, fn = tmp_file(lines=fasta_index_lines)

    try:
        lines, header, records = self.read_test_maf()
        subcommand_args = [
            "--version", GdcV1_0_0_PublicScheme.version(),
            "--annotation", GdcV1_0_0_PublicScheme.annotation_spec(),
        ]
        out_lines, stdout, stderr = run_main(
            subcommand="sort",
            lines=lines,
            subcommand_args=subcommand_args,
        )

        # Check that we have the same # of records
        out_records = [
            line for line in out_lines
            if not line.startswith("#")
            and not line.startswith("Hugo_Symbol")
        ]
        self.assertEqual(len(out_records), len(records))

        # Check that we added the sort pragma
        sort_order_line = "%s%s %s" % (
            MafHeader.HeaderLineStartSymbol,
            MafHeader.SortOrderKey,
            BarcodesAndCoordinate.name(),
        )
        self.assertIn(sort_order_line, out_lines)

        scheme = find_scheme(
            version=GdcV1_0_0_PublicScheme.version(),
            annotation=GdcV1_0_0_PublicScheme.annotation_spec(),
        )

        # we should see chr13 before chr1
        self.assertEqual(len(out_lines) - 1, len(lines))  # added the pragma
        found_chr1 = False
        for line in out_lines:
            if line.startswith(MafHeader.HeaderLineStartSymbol):
                continue
            record = MafRecord.from_line(line=line, scheme=scheme)
            self.assertFalse(
                record["Chromosome"] == "chr13" and found_chr1)
            # Overwritten on every line, so this only remembers whether
            # the line just seen was chr1.
            found_chr1 = record["Chromosome"] == "chr1"
    finally:
        # Release the temp file even when an assertion above fails.
        fd.close()
        os.remove(fn)
def test_from_line_with_scheme_invalid_column_name(self):
    # The first column name is not one the scheme expects: that column ends
    # up as None and two errors are recorded.
    scheme = TestMafRecord.TestScheme()
    column_names = ["no-name", "float", "str2"]
    line = MafRecord.ColumnSeparator.join(["string1", "3.14", "string3"])

    record = MafRecord.from_line(
        line=line,
        column_names=column_names,
        scheme=scheme,
        validation_stringency=ValidationStringency.Silent,
    )

    self.assertEqual(len(record), 3)
    self.assertListEqual(record.column_values(), [None, 3.14, "string3"])

    expected_types = [
        MafValidationErrorType.SCHEME_MISMATCHING_COLUMN_NAMES,
        MafValidationErrorType.RECORD_COLUMN_WITH_NO_VALUE,
    ]
    self.assertEqual(len(record.validation_errors), 2)
    self.assertListEqual(
        [e.tpe for e in record.validation_errors], expected_types)
def add_records(self):
    # Writes the same record twice (once with ``+=``, once with ``write``)
    # and checks the file holds the header lines followed by both copies.
    import os  # function-scope import: needed only for temp-file cleanup

    scheme = TestMafWriter.TestScheme()
    fd, path = tempfile.mkstemp()
    # mkstemp() returns an already-open descriptor.  Only the path is used
    # below, so close the descriptor right away rather than leaking it.
    os.close(fd)

    try:
        header_lines = MafHeader.scheme_header_lines(scheme) + [
            "#key1 value1",
            "#key2 value2",
        ]
        header = MafHeader.from_lines(lines=header_lines)
        writer = MafWriter.from_path(header=header, path=path)

        values = ["string2", "3.14", "string1"]
        record_line = MafRecord.ColumnSeparator.join(values)
        record = MafRecord.from_line(
            line=record_line, scheme=scheme, line_number=1)

        writer += record
        writer.write(record)
        writer.close()

        self.assertListEqual(
            read_lines(path),
            header_lines + [record_line, record_line],
        )
    finally:
        # Remove the temp file whether or not the assertion passed.
        os.remove(path)
def test_from_line_with_scheme_column_out_of_order(self):
    # Column names are given in a different order than the scheme's: the
    # record ends up with two columns and three errors.
    scheme = TestMafRecord.TestScheme()
    column_names = ["str2", "float", "str1"]
    line = MafRecord.ColumnSeparator.join(["string2", "3.14", "string1"])

    record = MafRecord.from_line(
        line=line,
        column_names=column_names,
        scheme=scheme,
        validation_stringency=ValidationStringency.Silent,
    )

    self.assertEqual(len(record), 2)
    self.assertListEqual(record.column_values(), [None, 3.14])

    expected_types = [
        MafValidationErrorType.RECORD_COLUMN_OUT_OF_ORDER,
        MafValidationErrorType.RECORD_COLUMN_OUT_OF_ORDER,
        MafValidationErrorType.RECORD_COLUMN_WITH_NO_VALUE,
    ]
    self.assertEqual(len(record.validation_errors), 3)
    self.assertListEqual(
        [e.tpe for e in record.validation_errors], expected_types)
def _get_empty_maf_record(line_number=0, stringency=ValidationStringency.Strict):
    """Construct a MafRecord from only a line number and a stringency.

    :param line_number: line number given to the record (default ``0``).
    :param stringency: validation stringency given to the record (default
        ``ValidationStringency.Strict``).
    """
    new_record = MafRecord(
        validation_stringency=stringency,
        line_number=line_number,
    )
    return new_record
def __init__(self, record: MafRecord, contigs: List[str]):
    """Store the record's sample barcodes, then run the parent initializer.

    The barcode attributes are assigned before the parent ``__init__`` is
    called with ``record`` and ``contigs``.

    :param record: record whose ``Tumor_Sample_Barcode`` and
        ``Matched_Norm_Sample_Barcode`` values are read.
    :param contigs: passed through unchanged to the parent initializer.
    """
    tumor = record.value("Tumor_Sample_Barcode")
    normal = record.value("Matched_Norm_Sample_Barcode")
    self.tumor_barcode = tumor
    self.normal_barcode = normal
    super(_BarcodesAndCoordinateKey, self).__init__(record, contigs)
def init_empty_maf_record(line_number=None, stringency=ValidationStringency.Strict):
    """
    Initialize an empty maf record.

    :param line_number: line number given to the record (default ``None``).
    :param stringency: validation stringency given to the record (default
        ``ValidationStringency.Strict``).
    """
    record_options = {
        "line_number": line_number,
        "validation_stringency": stringency,
    }
    return MafRecord(**record_options)
def encode(self, record: MafRecord) -> bytearray:
    """Encodes a MafRecord

    If no column names have been stored on this object yet, the record's
    keys are stored first.  The record's string form is then returned as
    UTF-8 bytes.
    """
    if not self._column_names:
        # Remember the column names from the first record seen.
        self._column_names = record.keys()  # type: ignore
    text = str(record)
    return bytearray(source=text, encoding='utf-8')  # type: ignore