Beispiel #1
0
 def test_load_md5_from_file(self):
     '''Check utils.load_md5_from_file on well-formed and malformed md5 files.

     The "good_*" fixtures must yield the expected md5 string and the
     "bad_*" fixtures must raise utils.Error. The mac/linux suffixes
     presumably correspond to the two platforms' md5 output layouts
     (TODO confirm against the fixture files).
     '''
     prefix = os.path.join(data_dir, 'load_md5_from_file.')
     expected = '43247f482b82e38a190c4d3243f97ea8'

     # Well-formed files: the md5 is returned.
     for suffix in ('good_mac', 'good_linux'):
         got = utils.load_md5_from_file(prefix + suffix)
         self.assertEqual(expected, got)

     # Malformed files: loading must fail with the package's own error.
     for suffix in ('bad_mac', 'bad_linux'):
         with self.assertRaises(utils.Error):
             utils.load_md5_from_file(prefix + suffix)
Beispiel #2
0
 def test_load_md5_from_file(self):
     """Check utils.load_md5_from_file on well-formed and malformed md5 files.

     The "good_*" fixtures must yield the expected md5 string and the
     "bad_*" fixtures must raise an exception. The mac/linux suffixes
     presumably correspond to the two platforms' md5 output layouts
     (TODO confirm against the fixture files).
     """
     prefix = os.path.join(data_dir, "load_md5_from_file.")
     expected = "43247f482b82e38a190c4d3243f97ea8"

     # Well-formed files: the md5 is returned.
     for suffix in ("good_mac", "good_linux"):
         got = utils.load_md5_from_file(prefix + suffix)
         self.assertEqual(expected, got)

     # Malformed files: loading must raise.
     for suffix in ("bad_mac", "bad_linux"):
         with self.assertRaises(Exception):
             utils.load_md5_from_file(prefix + suffix)
    def _validate_data(cls, database, data, dropbox_dir):
        """Sanity check spreadsheet rows and return a list of error messages.

        Input should be data, made by
        spreadsheet_helper.load_data_from_spreadsheet(). If the returned
        list has length zero, then all is OK.

        Args:
            database: object providing _get_sample_and_replicate_uniqueness(),
                which is called once per row with that row's dict.
            data: iterable of dicts, one per spreadsheet row. Each dict must
                have the keys submission_date, reads_file_1, reads_file_2,
                reads_file_1_md5, reads_file_2_md5 and every replicate key.
            dropbox_dir: directory in which the reads files, and optional
                "<reads file>.md5" files, are looked for.

        Returns:
            List of error message strings, in the order they were found.

        Side effects:
            When a row has no md5 for a reads file but a matching .md5 file
            exists, the md5 loaded from that file is stored in the row's dict.
        """
        errors = []
        all_filenames = {}
        all_replicates = {}
        replicate_keys = (
            "subject_id",
            "site_id",
            "lab_id",
            "isolate_number",
            "sequence_replicate_number",
        )

        for data_dict in data:
            # Exact type check on purpose: datetime.datetime is a subclass of
            # datetime.date, and is reported as a date format error here.
            if type(data_dict["submission_date"]) is not datetime.date:
                errors.append(
                    "Date format error: " +
                    spreadsheet_helper.row_data_dict_to_string(data_dict))

            for i in [1, 2]:
                read_file_key = "reads_file_" + str(i)
                filename = data_dict[read_file_key]
                md5_key = read_file_key + "_md5"

                if not os.path.exists(os.path.join(dropbox_dir, filename)):
                    errors.append("Reads file not found: " + filename)

                all_filenames[filename] = all_filenames.get(filename, 0) + 1

                md5_file = os.path.join(dropbox_dir, filename + ".md5")
                if os.path.exists(md5_file):
                    md5sum_from_file = utils.load_md5_from_file(md5_file)
                else:
                    md5sum_from_file = None

                md5sum_from_dict = data_dict[md5_key]
                if md5sum_from_file is None and md5sum_from_dict is None:
                    errors.append("No md5 for reads file " + filename)
                elif (md5sum_from_file is not None
                      and md5sum_from_dict is not None):
                    if md5sum_from_file != md5sum_from_dict:
                        errors.append("Mismatch in md5 info for reads file " +
                                      filename)
                elif md5sum_from_dict is None:
                    # Only the .md5 file has a value: fill in the row.
                    data_dict[md5_key] = md5sum_from_file

            replicate = tuple(data_dict[x] for x in replicate_keys)
            all_replicates[replicate] = all_replicates.get(replicate, 0) + 1

            # The third returned value (sample id) is not needed here.
            patient_site_lab_unique, replicates_exist, _ = (
                database._get_sample_and_replicate_uniqueness(data_dict))

            # Values are converted with str()/format() when building messages,
            # so non-string values (eg integer isolate numbers) produce an
            # error message instead of a TypeError.
            if not patient_site_lab_unique:
                errors.append(
                    "Subject({}) + site({}) + lab({}) found more than once "
                    "in database. Something very wrong!".format(
                        data_dict["subject_id"], data_dict["site_id"],
                        data_dict["lab_id"]))

            if replicates_exist:
                errors.append("Replicate already found for " +
                              ",".join(replicate_keys) + ": " +
                              ",".join(str(x) for x in replicate))

        for filename, count in sorted(all_filenames.items()):
            if count > 1:
                errors.append("Reads file " + filename + " found " +
                              str(count) + " times")

        for replicate, count in sorted(all_replicates.items()):
            if count > 1:
                errors.append("Replicate " + ",".join(replicate_keys) + " " +
                              ",".join(str(x) for x in replicate) +
                              " found " + str(count) +
                              " times in spreadsheet")

        return errors
    def _validate_data(cls, database, data, dropbox_dir):
        '''Sanity check spreadsheet rows and return a list of error messages.

        Input should be data, made by
        spreadsheet_helper.load_data_from_spreadsheet(). If the returned
        list has length zero, then all is OK.

        Args:
            database: object providing _get_sample_and_replicate_uniqueness(),
                which is called once per row with that row's dict.
            data: iterable of dicts, one per spreadsheet row. Each dict must
                have the keys submission_date, reads_file_1, reads_file_2,
                reads_file_1_md5, reads_file_2_md5 and every replicate key.
            dropbox_dir: directory in which the reads files, and optional
                '<reads file>.md5' files, are looked for.

        Returns:
            List of error message strings, in the order they were found.

        Side effects:
            When a row has no md5 for a reads file but a matching .md5 file
            exists, the md5 loaded from that file is stored in the row's dict.
        '''
        errors = []
        all_filenames = {}
        all_replicates = {}
        replicate_keys = ('subject_id', 'site_id', 'lab_id', 'isolate_number',
                          'sequence_replicate_number')

        for data_dict in data:
            # Exact type check on purpose: datetime.datetime is a subclass of
            # datetime.date, and is reported as a date format error here.
            if type(data_dict['submission_date']) is not datetime.date:
                errors.append(
                    'Date format error: ' +
                    spreadsheet_helper.row_data_dict_to_string(data_dict))

            for i in [1, 2]:
                read_file_key = 'reads_file_' + str(i)
                filename = data_dict[read_file_key]
                md5_key = read_file_key + '_md5'

                if not os.path.exists(os.path.join(dropbox_dir, filename)):
                    errors.append('Reads file not found: ' + filename)

                all_filenames[filename] = all_filenames.get(filename, 0) + 1

                md5_file = os.path.join(dropbox_dir, filename + '.md5')
                if os.path.exists(md5_file):
                    md5sum_from_file = utils.load_md5_from_file(md5_file)
                else:
                    md5sum_from_file = None

                md5sum_from_dict = data_dict[md5_key]
                if md5sum_from_file is None and md5sum_from_dict is None:
                    errors.append('No md5 for reads file ' + filename)
                elif (md5sum_from_file is not None
                      and md5sum_from_dict is not None):
                    if md5sum_from_file != md5sum_from_dict:
                        errors.append('Mismatch in md5 info for reads file ' +
                                      filename)
                elif md5sum_from_dict is None:
                    # Only the .md5 file has a value: fill in the row.
                    data_dict[md5_key] = md5sum_from_file

            replicate = tuple(data_dict[x] for x in replicate_keys)
            all_replicates[replicate] = all_replicates.get(replicate, 0) + 1

            # The third returned value (sample id) is not needed here.
            patient_site_lab_unique, replicates_exist, _ = (
                database._get_sample_and_replicate_uniqueness(data_dict))

            # Values are converted with str()/format() when building messages,
            # so non-string values (eg integer isolate numbers) produce an
            # error message instead of a TypeError.
            if not patient_site_lab_unique:
                errors.append(
                    'Subject({}) + site({}) + lab({}) found more than once '
                    'in database. Something very wrong!'.format(
                        data_dict['subject_id'], data_dict['site_id'],
                        data_dict['lab_id']))

            if replicates_exist:
                errors.append('Replicate already found for ' +
                              ','.join(replicate_keys) + ': ' +
                              ','.join(str(x) for x in replicate))

        for filename, count in sorted(all_filenames.items()):
            if count > 1:
                errors.append('Reads file ' + filename + ' found ' +
                              str(count) + ' times')

        for replicate, count in sorted(all_replicates.items()):
            if count > 1:
                errors.append('Replicate ' + ','.join(replicate_keys) + ' ' +
                              ','.join(str(x) for x in replicate) +
                              ' found ' + str(count) +
                              ' times in spreadsheet')

        return errors