def read_source_file_data(self, source_file) -> Sequence[Dict[str, Any]]:
    # Look up an optional per-file format (e.g. a custom delimiter) in the sources config.
    file_format = (self.sources_config.file_format.get(source_file)
                   if self.sources_config.file_format else None)
    if file_format is not None:
        reader = TabularFileReader(path.join(self.input_dir, source_file), file_format.delimiter)
    else:
        reader = TabularFileReader(path.join(self.input_dir, source_file))
    source_file_data = reader.read_data()
    # If a codebook is configured for this source file, map coded values to their labels.
    if self.sources_config.codebooks is not None:
        codebook_filename = self.sources_config.codebooks.get(source_file)
        if codebook_filename is not None:
            codebook_mapper = CodeBookMapper(path.join(self.input_dir, codebook_filename))
            source_file_data = codebook_mapper.apply(source_file_data)
    return source_file_data
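
For context, a minimal self-contained sketch of the same pattern: read a delimited file into one dict per row and, when a codebook is configured, translate coded values into labels. The helper names, column layout and codebook structure below are illustrative assumptions, not the project's actual API.

import csv
from typing import Any, Dict, List


def read_tabular(file_path: str, delimiter: str = '\t') -> List[Dict[str, Any]]:
    # Read a delimited text file into a list of dicts keyed by the header names.
    with open(file_path, newline='') as handle:
        return list(csv.DictReader(handle, delimiter=delimiter))


def apply_codebook(rows: List[Dict[str, Any]],
                   codebook: Dict[str, Dict[str, str]]) -> List[Dict[str, Any]]:
    # codebook maps column name -> {code: label}, e.g. {'gender': {'1': 'female'}}.
    for row in rows:
        for column, mapping in codebook.items():
            if column in row and row[column] in mapping:
                row[column] = mapping[row[column]]
    return rows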
Example #2
    def read_data(self, filename: str) -> Optional[Sequence[NGS]]:
        """ Reads .txt file.
        Sample_id should be specified in the header.
        Assumes that all column names except Gene Symbol, Gene ID, Locus ID and Cytoband
        are sample identifiers.

        :param filename: name of the input file
        :return: Sequence of NGS objects
        """
        data = TabularFileReader(os.path.join(self.input_dir,
                                              filename)).read_data()
        biosource_biomaterial_dict = dict()
        if data:
            sample_id_col_num = 0
            for col_value in data[0]:  # the keys of the first row are the column names
                if col_value not in [
                        'Gene Symbol', 'Gene ID', 'Locus ID', 'Cytoband'
                ]:
                    sample_id_col_num += 1
                    biosource_biomaterial = self.biosource_biomaterial_from_sample_id(
                        col_value, filename)
                    biosource_biomaterial_dict.setdefault(
                        biosource_biomaterial[0],
                        []).append(biosource_biomaterial[1])
            if sample_id_col_num == 0:
                raise ReaderException(
                    "Cannot read NGS data from file: {}. No sample_id found in header"
                    .format(filename))
        else:
            raise ReaderException(
                "Cannot read NGS data from file: {}. Empty data.".format(
                    filename))
        return self.map_ngs(biosource_biomaterial_dict, filename)
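
To illustrate the header-based detection above: every key of the first row that is not one of the four annotation columns is treated as a sample id. The header below is made up, and the sample-id format is an assumption for illustration only.

first_row = {'Gene Symbol': 'TP53', 'Gene ID': '7157', 'Locus ID': '7157',
             'Cytoband': '17p13.1', 'PMCBS000AAA_PMCBM000AAB': '-0.31'}
annotation_columns = {'Gene Symbol', 'Gene ID', 'Locus ID', 'Cytoband'}
sample_ids = [column for column in first_row if column not in annotation_columns]
assert sample_ids == ['PMCBS000AAA_PMCBM000AAB']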
    def read_data(self, filename: str) -> Optional[Sequence[NGS]]:
        """ Reads .maf.gz. file.
        Sample_id should be specified in the :attr:`self.sample_id_column_name` column.

        :param filename: name of the input file
        :return: Sequence of NGS objects
        """
        data = TabularFileReader(os.path.join(self.input_dir,
                                              filename)).read_data()
        biosource_biomaterial_dict = dict()
        if len(data) > 1:
            for row in data:
                try:
                    col_value = row[self.sample_id_column_name]
                except KeyError:
                    raise ReaderException(
                        "Invalid {} file. No column with name {}. Cannot read sample ids."
                        .format(filename, self.sample_id_column_name))
                biosource_biomaterial = self.biosource_biomaterial_from_sample_id(
                    col_value, filename)
                biosource_biomaterial_dict.setdefault(
                    biosource_biomaterial[0],
                    []).append(biosource_biomaterial[1])
        else:
            raise ReaderException(
                "Cannot read NGS data from file: {}. Empty data.".format(
                    filename))
        return self.map_ngs(biosource_biomaterial_dict, filename)
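
A standalone sketch of the grouping step above: sample ids are taken from a configured column and collected per biosource with setdefault. The column name 'Tumor_Sample_Barcode' and the '<biosource>_<biomaterial>' id format are assumptions made for this example.

rows = [{'Tumor_Sample_Barcode': 'PMCBS000AAA_PMCBM000AAB'},
        {'Tumor_Sample_Barcode': 'PMCBS000AAA_PMCBM000AAC'}]
grouped = {}
for row in rows:
    # assumed id format: '<biosource>_<biomaterial>'
    biosource, biomaterial = row['Tumor_Sample_Barcode'].split('_', 1)
    grouped.setdefault(biosource, []).append(biomaterial)
assert grouped == {'PMCBS000AAA': ['PMCBM000AAB', 'PMCBM000AAC']}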
    def read_data(self, filename: str) -> Optional[Sequence[NGS]]:
        """ Reads .txt file.
        Sample_id should be specified in the header. Assumes that the IDs will start with 'PMC'.

        :param filename: name of the input file
        :return: Sequence of NGS objects
        """
        data = TabularFileReader(os.path.join(self.input_dir,
                                              filename)).read_data()
        biosource_biomaterial_dict = dict()
        if data:
            sample_id_col_num = 0
            for col_value in data[0]:
                if col_value.startswith('PMC'):
                    sample_id_col_num += 1
                    biosource_biomaterial = self.biosource_biomaterial_from_sample_id(
                        col_value, filename)
                    biosource_biomaterial_dict.setdefault(
                        biosource_biomaterial[0],
                        []).append(biosource_biomaterial[1])
            if sample_id_col_num == 0:
                raise ReaderException(
                    "Cannot read NGS data from file: {}. No sample_id found in header"
                    .format(filename))
        else:
            raise ReaderException(
                "Cannot read NGS data from file: {}. Empty data.".format(
                    filename))
        return self.map_ngs(biosource_biomaterial_dict, filename)
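
The same header scan as above, but with a prefix check instead of an exclusion list; the column names below are invented for the example.

first_row = {'probe': 'p1', 'PMCBS000AAB_PMCBM000AAD': '0.7', 'chrom': '1'}
pmc_columns = [column for column in first_row if column.startswith('PMC')]
assert pmc_columns == ['PMCBS000AAB_PMCBM000AAD']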
Example #5
def test_transformation(tmp_path):
    target_path = tmp_path.as_posix()
    runner = CliRunner()
    result = runner.invoke(sources2csr.run, [
        './test_data/input_data/CLINICAL',
        target_path,
        './test_data/input_data/config'
    ])
    assert result.exit_code == 0

    assert path.exists(target_path + '/individual.tsv')
    assert path.exists(target_path + '/diagnosis.tsv')
    assert path.exists(target_path + '/biosource.tsv')
    assert path.exists(target_path + '/biomaterial.tsv')
    assert path.exists(target_path + '/study.tsv')
    assert path.exists(target_path + '/individual_study.tsv')

    # test if codebook mapping has been applied
    individual_data = TabularFileReader(path.join(target_path, 'individual.tsv')).read_data()
    p1 = [ind for ind in individual_data if ind['individual_id'] == 'P1'][0]
    assert p1['gender'] == 'female'

    # test if derived values have been calculated when not read from the source
    assert p1['diagnosis_count'] == '2'
    assert p1['age_first_diagnosis'] == '23'  # 01-05-2016 - 01-02-1993

    # test if aggregate values have been correctly inserted from the source
    p2 = [ind for ind in individual_data if ind['individual_id'] == 'P2'][0]
    assert p2['diagnosis_count'] == '4'
    assert p2['age_first_diagnosis'] == '50'

    # check if data from the second input file is included
    p2 = [ind for ind in individual_data if ind['individual_id'] == 'P2'][0]
    assert p2['ic_withdrawn_date'] == '2018-06-02'

    # check that data from higher priority files is not overwritten
    p6 = [ind for ind in individual_data if ind['individual_id'] == 'P6'][0]
    assert p6['ic_withdrawn_date'] == '2017-10-14'

    biosource_data = TabularFileReader(path.join(target_path, 'biosource.tsv')).read_data()
    # test reading of biosources from CSV file
    bs1 = [biosource for biosource in biosource_data if biosource['biosource_id'] == 'BS1'][0]
    assert bs1['tissue'] == 'medula'
    assert bs1['biosource_date'] == '2017-03-12'
    assert bs1['tumor_percentage'] == '5'
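
The derived-value assertions above (diagnosis_count, age_first_diagnosis) are consistent with an age computed from a birth date and a first diagnosis date; a rough sketch of that kind of derivation, assuming DD-MM-YYYY source dates, not the pipeline's actual code:

from datetime import date


def age_at(event_date: date, birth_date: date) -> int:
    # Whole years between birth and the event, corrected if the birthday
    # in the event year has not yet passed.
    years = event_date.year - birth_date.year
    if (event_date.month, event_date.day) < (birth_date.month, birth_date.day):
        years -= 1
    return years


assert age_at(date(2016, 5, 1), date(1993, 2, 1)) == 23  # 01-05-2016 vs 01-02-1993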
    def read_data(self, filename: str) -> Optional[Sequence[NGS]]:
        """ Reads .seg file as tab separated file.
        Sample ID should be specified in the first column.

        :param filename: name of the input file
        :return: Sequence of NGS objects
        """
        data = TabularFileReader(os.path.join(self.input_dir, filename)).read_data()
        biosource_biomaterial_dict = dict()
        if len(data) > 1:
            for row in data:
                sample_id = list(row.values())[0]
                biosource_biomaterial = self.biosource_biomaterial_from_sample_id(sample_id, filename)
                biosource_biomaterial_dict.setdefault(biosource_biomaterial[0], []).append(biosource_biomaterial[1])
        else:
            raise ReaderException("Cannot read NGS data from file: {}. Empty data.".format(filename))
        return self.map_ngs(biosource_biomaterial_dict, filename)
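
Since the rows come back as ordered dicts, taking the first value picks the first column of the .seg file; the column names and sample id below are made up for the example.

row = {'ID': 'PMCBS000AAC_PMCBM000AAE', 'chrom': '1', 'loc.start': '3218610'}
sample_id = next(iter(row.values()))  # equivalent to list(row.values())[0]
assert sample_id == 'PMCBS000AAC_PMCBM000AAE'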
Example #7
    def read_entities(self, file_path: str,
                      entity_type: Type[BaseModel]) -> List[Any]:
        try:
            data = TabularFileReader(file_path).read_data()
        except FileNotFoundError:
            return []

        date_fields = self.get_date_fields(entity_type.schema())
        array_fields = self.get_array_fields(entity_type.schema())

        # Normalise cell values: blank and 'NA' cells become None, date columns are
        # parsed into datetime objects and array columns are JSON-decoded.
        for row in data:
            for field, value in row.items():
                if value in ('', 'NA'):
                    row[field] = None
                elif field in date_fields:
                    row[field] = datetime.strptime(value, '%Y-%m-%d')
                elif field in array_fields:
                    row[field] = json.loads(value)
        return [entity_type(**d) for d in data]
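
A standalone sketch of the per-row normalisation performed above, with made-up field names and the date/array field sets hard-coded instead of derived from the entity schema:

import json
from datetime import datetime

date_fields = {'birth_date'}
array_fields = {'diagnosis_ids'}
row = {'individual_id': 'P1', 'birth_date': '1993-02-01',
       'diagnosis_ids': '["D1", "D2"]', 'gender': 'NA'}

for field, value in row.items():
    if value in ('', 'NA'):
        row[field] = None                     # missing values become None
    elif field in date_fields:
        row[field] = datetime.strptime(value, '%Y-%m-%d')
    elif field in array_fields:
        row[field] = json.loads(value)

assert row['gender'] is None and row['birth_date'].year == 1993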