Beispiel #1
0
 def make_spreadsheet_for_ena_download(self):
     """Return a Spreadsheet with fixed header values and two RawRead entries.

     The reads carry 'T' and 'F' in ``reverse_read`` and no sample accession.
     """
     sheet = Spreadsheet()
     # Attributes are assigned in the same order as the original tuple unpacking.
     sheet.supplier = 'Supplier'
     sheet.organisation = 'Org'
     sheet.contact = 'Contact'
     sheet.technology = 'Illumina'
     sheet.name = 'AStudyName1'
     sheet.accession = None
     sheet.size = 1.90
     sheet.limit = '01/01/2025'
     first_read = RawRead(forward_read='PAIR1', reverse_read='T', sample_name='SAMPLE1',
                          taxon_id='1280', library_name='LIB1', sample_accession=None)
     second_read = RawRead(forward_read='Pair2.fastq.gz', reverse_read='F', sample_name='SAMPLE2',
                           taxon_id='1280', library_name='LIB2', sample_accession=None)
     sheet.reads = [first_read, second_read]
     return sheet
Beispiel #2
0
 def test_study_name_with_invalid_char_should_fail_validation(self):
     # The study name is a 33-character string of punctuation and a tab;
     # the validator is expected to return 33 errors for it.
     invalid_name = "!\"£$%^&*()+={}[]:@~;'#?/>.<,|\\`¬\t"
     sheet = Spreadsheet.new_instance(invalid_name)
     errors = validate_study_name(sheet)
     self.assertEqual(33, len(errors))
Beispiel #3
0
 def test_supplier_name_with_invalid_char_should_fail_validation(self):
     # The supplier is a 33-character string of punctuation and a tab;
     # the validator is expected to return 33 errors for it.
     invalid_supplier = "!\"£$%^&*()+={}[]:@~;'#?/>.<,|\\`¬\t"
     sheet = Spreadsheet.new_instance("name", supplier=invalid_supplier)
     errors = validate_no_abnormal_characters_in_supplier_name(sheet)
     self.assertEqual(33, len(errors))
Beispiel #4
0
 def test_invalid_name_for_external_data_part_of_internal_study(self):
     # A study name without the '_external' suffix must produce exactly this message.
     expected_errors = [
         "Data part of internal sequencing study should have the suffix '_external' in the name: "
         "ValidName12345__"
     ]
     sheet = Spreadsheet.new_instance("ValidName12345__", [])
     actual_errors = validate_external_data_part_of_internal_sequencing_study_name(sheet)
     self.assertEqual(expected_errors, actual_errors)
Beispiel #5
0
 def test_sample_name_with_valid_char_should_pass_validation(self):
     # A sample name made of letters, digits and an underscore yields no errors.
     read = RawRead(sample_accession=None,
                    forward_read='PAIR1_1.fastq.gz',
                    reverse_read='PAIR1_2.fastq.gz',
                    sample_name="SAMPLE_1",
                    taxon_id="1280",
                    library_name='LIB1')
     sheet = Spreadsheet.new_instance("ValidName12345__", [read])
     errors = validate_sample_names(sheet)
     self.assertEqual([], errors)
Beispiel #6
0
 def test_copy_files_single_strand(self, copyfile_patch):
     # Verifies that copy_files('source') issues one copy call per read file:
     # both files of the paired read, and only the forward file of the read
     # whose reverse_read is None.
     # copyfile_patch is the mock injected by a patch decorator outside this block.
     paired_read = RawRead(forward_read='PAIR1_1.fastq.gz', reverse_read='PAIR1_2.fastq.gz',
                           sample_name='SAMPLE1', taxon_id='1280', library_name='LIB1',
                           sample_accession=None)
     single_read = RawRead(forward_read='SINGLE.fastq.gz', reverse_read=None,
                           sample_name='SAMPLE2', taxon_id='1280', library_name='LIB2',
                           sample_accession=None)
     under_test = Preparation.new_instance(
         Spreadsheet.new_instance("MyStudy", [paired_read, single_read]), 'destination', 0, 0)
     under_test.copy_files('source')
     # assertEqual replaces the deprecated assertEquals alias (removed in Python 3.12).
     self.assertEqual(copyfile_patch.call_args_list,
                      [call('source/PAIR1_1.fastq.gz', 'destination/0/PAIR1_1.fastq.gz'),
                       call('source/PAIR1_2.fastq.gz', 'destination/0/PAIR1_2.fastq.gz'),
                       call('source/SINGLE.fastq.gz', 'destination/0/SINGLE.fastq.gz')])
Beispiel #7
0
 def test_pair_naming_convention_is_valid_for_single_read(self):
     # A read with no reverse file must not trigger pair-naming errors.
     single_read = RawRead(sample_accession=None,
                           forward_read='PAIR1_1.fastq.gz',
                           reverse_read=None,
                           sample_name='SAMPLE1',
                           taxon_id="1280",
                           library_name='LIB1')
     sheet = Spreadsheet.new_instance("1234567890123456", [single_read])
     errors = validate_pair_naming_convention(sheet)
     self.assertEqual([], errors)
Beispiel #8
0
 def test_single_read_is_compressed(self):
     # A lone '.fastq.gz' forward read passes the compression check.
     single_read = RawRead(sample_accession=None,
                           forward_read='PAIR1.fastq.gz',
                           reverse_read=None,
                           sample_name='SAMPLE1',
                           taxon_id="1280",
                           library_name='LIB1')
     sheet = Spreadsheet.new_instance("1234567890123456", [single_read])
     errors = validate_files_are_compressed(sheet)
     self.assertEqual([], errors)
Beispiel #9
0
 def test_no_hyphen_in_filename(self):
     # Hyphen-free forward and reverse filenames produce no errors.
     read = RawRead(sample_accession=None,
                    forward_read='PAIR1_1.fastq.gz',
                    reverse_read='PAIR1_2.fastq.gz',
                    sample_name='SAMPLE1',
                    taxon_id="1280",
                    library_name='LIB1')
     sheet = Spreadsheet.new_instance("1234567890123456", [read])
     errors = validate_no_hyphen_in_filename(sheet)
     self.assertEqual([], errors)
Beispiel #10
0
 def test_valid_taxon_id(self):
     # Taxon id "1280" is expected to pass taxon validation without errors.
     read = RawRead(sample_accession=None,
                    forward_read='PAIR1_1.fastq.gz',
                    reverse_read='PAIR1_2.fastq.gz',
                    sample_name='SAMPLE1',
                    taxon_id="1280",
                    library_name='LIB1')
     sheet = Spreadsheet.new_instance("ValidName12345__", [read])
     errors = validate_taxon_ids(sheet)
     self.assertEqual([], errors)
Beispiel #11
0
 def test_mandatory_fields_for_reads_are_populated_single_read(self):
     # A single read with forward file, sample, taxon and library set is complete.
     single_read = RawRead(sample_accession=None,
                           forward_read='READ.fastq.gz',
                           reverse_read=None,
                           sample_name='SAMPLE1',
                           taxon_id="1280",
                           library_name='LIB1')
     sheet = Spreadsheet.new_instance("1234567890123456", [single_read])
     errors = validate_mandatory_read_fields(sheet)
     self.assertEqual([], errors)
Beispiel #12
0
 def test_forward_read_not_populated(self):
     # Everything is built inside the context manager so that an exception
     # raised at any step (construction or validation) satisfies the assertion.
     with self.assertRaises(Exception):
         read_without_files = RawRead(sample_accession=None,
                                      forward_read=None,
                                      reverse_read=None,
                                      sample_name='SAMPLE1',
                                      taxon_id="1280",
                                      library_name='LIB1')
         sheet = Spreadsheet.new_instance("1234567890123456", [read_without_files])
         validate_mandatory_read_fields(sheet)
Beispiel #13
0
 def test_none_is_not_valid(self):
     # reverse_read=None is neither 'T' nor 'F', so one formatting error is expected.
     expected_errors = ["Double-ended is incorrectly formatted, must be T or F"]
     read = RawRead(sample_accession=None,
                    forward_read='PAIR1_1.fastq.gz',
                    reverse_read=None,
                    sample_name='SAMPLE1',
                    taxon_id="1280",
                    library_name='LIB1')
     sheet = Spreadsheet.new_instance("1234567890123456", [read])
     actual_errors = check_double_ended_column_is_T_or_F(sheet)
     self.assertEqual(expected_errors, actual_errors)
Beispiel #14
0
 def setUp(self):
     """Create a temp directory with three fixture files and three Preparation objects.

     ``under_test1``, ``under_test2`` and ``under_test3`` are built from
     identical spreadsheets and differ only in the third argument passed to
     ``Preparation.new_instance`` (0, 1 and 2).
     """
     self.tempdir = TempDirectory()
     for relative_path in ('1/Accession1.fastq.gz',
                           '2/Accession1_1.fastq.gz',
                           '2/Accession1_2.fastq.gz'):
         self.tempdir.write(relative_path, b'the text')
     self.tempdir_path = self.tempdir.path

     def new_preparation(index):
         # Each Preparation gets its own Spreadsheet and RawRead objects so the
         # three fixtures share no state.
         # NOTE(review): `index` presumably selects the numbered sub-directory
         # ('1/', '2/') written above -- confirm against Preparation.
         sheet = Spreadsheet.new_instance("MyStudy", [
             RawRead(forward_read='Accession1', reverse_read='T', sample_name='SAMPLE1',
                     taxon_id='1280', library_name='LIB1', sample_accession=None),
             RawRead(forward_read='Accession2', reverse_read='T', sample_name='SAMPLE2',
                     taxon_id='1280', library_name='LIB2', sample_accession=None)])
         return Preparation.new_instance(sheet, self.tempdir_path, index, 0)

     self.under_test1 = new_preparation(0)
     self.under_test2 = new_preparation(1)
     self.under_test3 = new_preparation(2)
Beispiel #15
0
 def test_library_name_not_populated(self):
     # A read with library_name=None must be reported with this exact message.
     expected_errors = [
         "Missing library name for RawRead(forward_read='READ.fastq.gz', reverse_read=None, "
         "sample_name='SAMPLE1', sample_accession=None, taxon_id='1280', library_name=None)"
     ]
     read_without_library = RawRead(sample_accession=None,
                                    forward_read='READ.fastq.gz',
                                    reverse_read=None,
                                    sample_name='SAMPLE1',
                                    taxon_id="1280",
                                    library_name=None)
     sheet = Spreadsheet.new_instance("1234567890123456", [read_without_library])
     actual_errors = validate_mandatory_read_fields(sheet)
     self.assertEqual(expected_errors, actual_errors)
Beispiel #16
0
 def test_sample_name_with_invalid_char_should_fail_validation(self):
     # The sample name is a 34-character string (punctuation, a tab and a space);
     # the validator is expected to return 34 errors for it.
     invalid_sample_name = "!\"£$%^&*()+={}[]:@~;'#?/>.<,|\\`¬\t "
     read = RawRead(sample_accession=None,
                    forward_read='PAIR1_1.fastq.gz',
                    reverse_read='PAIR1_2.fastq.gz',
                    sample_name=invalid_sample_name,
                    taxon_id="1280",
                    library_name='LIB1')
     sheet = Spreadsheet.new_instance("ValidName12345__", [read])
     errors = validate_sample_names(sheet)
     self.assertEqual(34, len(errors))
Beispiel #17
0
 def test_path_in_filename_is_invalid(self):
     # Both filenames contain a directory part, so each one must be reported.
     expected_errors = [
         "Path present in filename: /some/path/PAIR1_1.fastq.gz",
         "Path present in filename: /some/path/PAIR1_2.fastq.gz",
     ]
     read = RawRead(sample_accession=None,
                    forward_read='/some/path/PAIR1_1.fastq.gz',
                    reverse_read='/some/path/PAIR1_2.fastq.gz',
                    sample_name='SAMPLE1',
                    taxon_id="1280",
                    library_name='LIB1')
     sheet = Spreadsheet.new_instance("1234567890123456", [read])
     actual_errors = validate_no_path_in_filename(sheet)
     self.assertEqual(expected_errors, actual_errors)
Beispiel #18
0
 def test_invalid_pair_naming_convention(self):
     # Forward and reverse filenames that do not share a prefix must be reported.
     expected_errors = [
         "Inconsistent naming convention of forward and reverse reads for RawRead("
         "forward_read='PAIR1xxx_1.fastq.gz', reverse_read='PAIR1_2.fastq.gz', "
         "sample_name='SAMPLE1', sample_accession=None, taxon_id='1280', library_name='LIB1')"
     ]
     mismatched_pair = RawRead(sample_accession=None,
                               forward_read='PAIR1xxx_1.fastq.gz',
                               reverse_read='PAIR1_2.fastq.gz',
                               sample_name='SAMPLE1',
                               taxon_id="1280",
                               library_name='LIB1')
     sheet = Spreadsheet.new_instance("1234567890123456", [mismatched_pair])
     actual_errors = validate_pair_naming_convention(sheet)
     self.assertEqual(expected_errors, actual_errors)
Beispiel #19
0
 def test_T_or_F_is_valid(self):
     # 'T' and 'F' in reverse_read are both accepted, so no errors are expected.
     flagged_true = RawRead(sample_accession=None,
                            forward_read='PAIR1_1.fastq.gz',
                            reverse_read='T',
                            sample_name='SAMPLE1',
                            taxon_id="1280",
                            library_name='LIB1')
     flagged_false = RawRead(sample_accession=None,
                             forward_read='PAIR1_1.fastq.gz',
                             reverse_read='F',
                             sample_name='SAMPLE1',
                             taxon_id="1280",
                             library_name='LIB1')
     sheet = Spreadsheet.new_instance("1234567890123456", [flagged_true, flagged_false])
     errors = check_double_ended_column_is_T_or_F(sheet)
     self.assertEqual([], errors)
Beispiel #20
0
 def test_uniqueness_of_files_sample_and_library_ENA_download(self):
     # Two reads with distinct forward reads, samples and libraries raise no
     # uniqueness errors.
     first_read = RawRead(sample_accession=None,
                          forward_read='PAIR1',
                          reverse_read='T',
                          sample_name='SAMPLE1',
                          taxon_id="1280",
                          library_name='LIB1')
     second_read = RawRead(sample_accession=None,
                           forward_read='PAIR2',
                           reverse_read='F',
                           sample_name='SAMPLE2',
                           taxon_id="1280",
                           library_name='LIB2')
     sheet = Spreadsheet.new_instance("1234567890123456", [first_read, second_read])
     errors = validate_uniqueness_of_reads(sheet)
     self.assertEqual([], errors)
Beispiel #21
0
 def test_forward_read_not_unique(self):
     # Two reads that share the same forward file must be reported once.
     expected_errors = ["Forward read is not unique: PAIR1_1.fastq.gz"]
     first_read = RawRead(sample_accession=None,
                          forward_read='PAIR1_1.fastq.gz',
                          reverse_read='PAIR1_2.fastq.gz',
                          sample_name='SAMPLE1',
                          taxon_id="1280",
                          library_name='LIB1')
     duplicate_forward = RawRead(sample_accession=None,
                                 forward_read='PAIR1_1.fastq.gz',
                                 reverse_read='PAIR2_2.fastq.gz',
                                 sample_name='SAMPLE2',
                                 taxon_id="1280",
                                 library_name='LIB2')
     sheet = Spreadsheet.new_instance("1234567890123456", [first_read, duplicate_forward])
     actual_errors = validate_uniqueness_of_reads(sheet)
     self.assertEqual(expected_errors, actual_errors)
    def test_header_initialization_no_accession(self):
        # Loads the .xls fixture and compares it with a spreadsheet whose
        # accession is None and whose reads have no reverse file.
        xls_path = os.path.join(self.data_dir,
                                'test_upload_no_pair_no_lib_no_accession.xls')
        loader = SpreadsheetLoader(xls_path)

        expected_reads = [
            self._raw_read('PAIR1_1.fastq.gz', None, 'SAMPLE1', 'LIB1', None),
            self._raw_read('PAIR2_1.fastq.gz', None, 'SAMPLE2', 'LIB2', None),
        ]
        expected = Spreadsheet.new_instance("MyStudy", expected_reads,
                                            contact="Some Name",
                                            organisation="ENA",
                                            supplier='ENA',
                                            technology='Illumina',
                                            size=123456.0,
                                            accession=None,
                                            limit='30/09/2020')

        self.assertSpreadsheet(expected, loader.load_xls())
Beispiel #23
0
 def test_reads_are_not_fastq(self):
     # '.gz' files without the '.fastq' part must be reported for both the
     # forward and the reverse read.
     expected_errors = [
         "Forward read file is not correctly formatted for RawRead(forward_read='PAIR1_1.gz', "
         "reverse_read='PAIR1_2.gz', sample_name='SAMPLE1', sample_accession=None, "
         "taxon_id='1280', library_name='LIB1')",
         "Reverse read file is not correctly formatted for RawRead(forward_read='PAIR1_1.gz', "
         "reverse_read='PAIR1_2.gz', sample_name='SAMPLE1', sample_accession=None, "
         "taxon_id='1280', library_name='LIB1')"
     ]
     read = RawRead(sample_accession=None,
                    forward_read='PAIR1_1.gz',
                    reverse_read='PAIR1_2.gz',
                    sample_name='SAMPLE1',
                    taxon_id="1280",
                    library_name='LIB1')
     sheet = Spreadsheet.new_instance("1234567890123456", [read])
     actual_errors = validate_files_are_compressed(sheet)
     self.assertEqual(expected_errors, actual_errors)
    def test_no_filename_only_run_accession(self):
        # Loads the run-accession .xls fixture; the expected reads carry 'T'/'F'
        # in the reverse-read position instead of a mate filename.
        xls_path = os.path.join(self.data_dir, 'test_run_accession.xls')
        loader = SpreadsheetLoader(xls_path)

        expected_reads = [
            self._raw_read('PAIR1', 'T', 'SAMPLE1', 'LIB1', 'ACCESSION1'),
            self._raw_read('PAIR2', 'T', 'SAMPLE2', 'LIB2', 'ACCESSION2'),
            self._raw_read('PAIR3', 'F', 'SAMPLE3', 'LIB3', 'ACCESSION3'),
        ]
        expected = Spreadsheet.new_instance("MyStudy", expected_reads,
                                            contact="Some Name",
                                            organisation="ENA",
                                            supplier='ENA',
                                            technology='Illumina',
                                            size=123456.0,
                                            accession='accession',
                                            limit='30/09/2020')

        self.assertSpreadsheet(expected, loader.load_xls())
    def test_sample_and_library_names_as_integers(self):
        # Loads an .xls fixture and expects the numeric-looking sample and
        # library names to come back as strings.
        xls_path = os.path.join(self.data_dir, 'test_sample_name_as_int.xls')
        loader = SpreadsheetLoader(xls_path)

        expected_reads = [
            self._raw_read('ERR0000001_1.fastq.gz', 'ERR0000001_2.fastq.gz',
                           '101260', '1000000001', 'ERR0000001', '485'),
            self._raw_read('ERR0000002_1.fastq.gz', 'ERR0000002_2.fastq.gz',
                           '101264', '2000000002', 'ERR0000002', '485'),
        ]
        expected = Spreadsheet.new_instance("AStudyName1", expected_reads,
                                            contact="Me",
                                            organisation="Org",
                                            supplier='Supplier',
                                            technology='Illumina',
                                            size=1.90,
                                            accession=None,
                                            limit='01/01/2025')

        self.assertSpreadsheet(expected, loader.load_xls())
    def test_cells_read_xlsx(self):
        # Loads the .xlsx fixture through load_xlsx and compares every field
        # with the expected spreadsheet.
        xlsx_path = os.path.join(self.data_dir, 'test_upload.xlsx')
        loader = SpreadsheetLoader(xlsx_path)

        expected_reads = [
            self._raw_read('PAIR1_1.fastq.gz', 'PAIR1_2.fastq.gz', 'SAMPLE1',
                           'LIB1', 'ACCESSION1'),
            self._raw_read('PAIR2_1.fastq.gz', 'PAIR2_2.fastq.gz', 'SAMPLE2',
                           'LIB2', 'ACCESSION2'),
        ]
        expected = Spreadsheet.new_instance("MyStudy", expected_reads,
                                            contact="Some Name",
                                            organisation="ENA",
                                            supplier='ENA',
                                            technology='Illumina',
                                            size=123456.0,
                                            accession='accession',
                                            limit='30/09/2020')

        self.assertSpreadsheet(expected, loader.load_xlsx())
Beispiel #27
0
 def load_xlsx(self):
     """Build a Spreadsheet from the worksheet held in ``self._sheet``.

     The first 10 rows are scanned for metadata labels in column 1, with the
     value taken from column 2. Scanning stops at the row whose first cell is
     'Filename' or 'Run Accession'; that row is the header of the reads table
     and every row below it, up to ``max_row``, is turned into a RawRead.

     A column that is absent from the header row yields ``None`` for the
     corresponding field instead of raising UnboundLocalError, and the
     'Mate File' / 'Double-ended Reads' columns are recognised wherever they
     appear in the header row.

     Returns:
         The populated Spreadsheet.

     Raises:
         ValueError: if no reads-table header is found in the first 10 rows.
     """
     extract_text = self.__extract_text_value_xlsx
     extract_float = self.__extract_float_value_xlsx

     def value_at(extract, row, column):
         # A header column that was not found contributes no value.
         return None if column is None else extract(row, column)

     result = Spreadsheet()
     header_row = None
     # Rows and columns are 1-based here; each label cell is read once.
     for row in range(1, 11):
         label = self._sheet.cell(row=row, column=1).value
         if label == 'Study Name':
             result.name = self._sheet.cell(row=row, column=2).value
         elif label == 'Supplier Name':
             result.supplier = self._sheet.cell(row=row, column=2).value
         elif label == 'Supplier Organisation':
             result.organisation = self._sheet.cell(row=row, column=2).value
         elif label == 'Sanger Contact Name':
             result.contact = self._sheet.cell(row=row, column=2).value
         elif label == 'Sequencing Technology':
             result.technology = self._sheet.cell(row=row, column=2).value
         elif label == 'Study Accession number':
             result.accession = extract_text(row, 2)
         elif label == 'Total size of files in GBytes':
             result.size = float(self._sheet.cell(row=row, column=2).value)
         elif label == 'Data to be kept until':
             result.limit = self._sheet.cell(row=row, column=2).value.strftime('%d/%m/%Y')
         elif label in ('Filename', 'Run Accession'):
             header_row = row
             break
     if header_row is None:
         # Previously this fell through to cell(row=0, ...) with an unhelpful error.
         raise ValueError(
             "No 'Filename' or 'Run Accession' header row found in the first 10 rows")

     # Map every header title to its column; a repeated title keeps the last column.
     columns = {
         self._sheet.cell(row=header_row, column=column).value: column
         for column in range(1, self._sheet.max_column + 1)
     }
     filename_column = columns.get('Filename')
     run_accession_column = columns.get('Run Accession')
     mate_filename_column = columns.get('Mate File')
     double_ended_reads_column = columns.get('Double-ended Reads')
     sample_name_column = columns.get('Sample Name')
     sample_accession_column = columns.get('Sample Accession number')
     taxon_id_column = columns.get('Taxon ID')
     library_name_column = columns.get('Library Name')

     reads = []
     for row in range(header_row + 1, self._sheet.max_row + 1):
         sample_name = value_at(extract_float, row, sample_name_column)
         library_name = value_at(extract_float, row, library_name_column)
         if library_name is None:
             # The library name falls back to the sample name.
             library_name = sample_name
         if filename_column is not None:
             reads.append(RawRead(
                 extract_text(row, filename_column),
                 value_at(extract_text, row, mate_filename_column),
                 sample_name,
                 value_at(extract_text, row, sample_accession_column),
                 value_at(extract_float, row, taxon_id_column),
                 library_name))
         if run_accession_column is not None:
             reads.append(RawRead(
                 extract_text(row, run_accession_column),
                 value_at(extract_text, row, double_ended_reads_column),
                 sample_name,
                 value_at(extract_text, row, sample_accession_column),
                 value_at(extract_float, row, taxon_id_column),
                 library_name))
     result.reads = reads
     return result
Beispiel #28
0
 def load_xls(self):
     """Build a Spreadsheet from the sheet held in ``self._sheet``.

     Rows are scanned from the top for metadata labels in column 0, with the
     value taken from column 1. Scanning stops at the row whose first cell is
     'Filename' or 'Run Accession'; that row is the header of the reads table
     and every row below it is turned into a RawRead.

     A column that is absent from the header row yields ``None`` for the
     corresponding field instead of raising UnboundLocalError, and the
     'Mate File' / 'Double-ended Reads' columns are recognised wherever they
     appear in the header row.

     Returns:
         The populated Spreadsheet.

     Raises:
         ValueError: if no row starts with 'Filename' or 'Run Accession'.
     """
     extract_text = self.__extract_text_value_xls
     extract_float = self.__extract_float_value_xls

     def value_at(extract, row, column):
         # A header column that was not found contributes no value.
         # Column 0 is valid, so the check is for None, not falsiness.
         return None if column is None else extract(row, column)

     result = Spreadsheet()
     header_row = None
     # Rows and columns are 0-based here; each label cell is read once.
     for row in range(self._sheet.nrows):
         label = self._sheet.cell_value(row, 0)
         if label == 'Study Name':
             result.name = self._sheet.cell_value(row, 1)
         elif label == 'Supplier Name':
             result.supplier = self._sheet.cell_value(row, 1)
         elif label == 'Supplier Organisation':
             result.organisation = self._sheet.cell_value(row, 1)
         elif label == 'Sanger Contact Name':
             result.contact = self._sheet.cell_value(row, 1)
         elif label == 'Sequencing Technology':
             result.technology = self._sheet.cell_value(row, 1)
         elif label == 'Study Accession number':
             result.accession = extract_text(row, 1)
         elif label == 'Total size of files in GBytes':
             result.size = self._sheet.cell_value(row, 1)
         elif label == 'Data to be kept until':
             # The cell holds a spreadsheet date number; only the date part is kept.
             year, month, day, _hour, _minute, _second = xlrd.xldate_as_tuple(
                 self._sheet.cell_value(row, 1), self._workbook.datemode)
             result.limit = "%02d/%02d/%04d" % (day, month, year)
         elif label in ('Filename', 'Run Accession'):
             header_row = row
             break
     if header_row is None:
         # Previously row 0 was silently treated as both header and first data row.
         raise ValueError("No 'Filename' or 'Run Accession' header row found")

     # Map every header title to its column; a repeated title keeps the last column.
     columns = {
         self._sheet.cell_value(header_row, column): column
         for column in range(self._sheet.ncols)
     }
     filename_column = columns.get('Filename')
     run_accession_column = columns.get('Run Accession')
     mate_filename_column = columns.get('Mate File')
     double_ended_reads_column = columns.get('Double-ended Reads')
     sample_name_column = columns.get('Sample Name')
     sample_accession_column = columns.get('Sample Accession number')
     taxon_id_column = columns.get('Taxon ID')
     library_name_column = columns.get('Library Name')

     reads = []
     for row in range(header_row + 1, self._sheet.nrows):
         sample_name = value_at(extract_float, row, sample_name_column)
         library_name = value_at(extract_float, row, library_name_column)
         if library_name is None:
             # The library name falls back to the sample name.
             library_name = sample_name
         if filename_column is not None:
             reads.append(RawRead(
                 extract_text(row, filename_column),
                 value_at(extract_text, row, mate_filename_column),
                 sample_name,
                 value_at(extract_text, row, sample_accession_column),
                 value_at(extract_float, row, taxon_id_column),
                 library_name))
         if run_accession_column is not None:
             reads.append(RawRead(
                 extract_text(row, run_accession_column),
                 value_at(extract_text, row, double_ended_reads_column),
                 sample_name,
                 value_at(extract_text, row, sample_accession_column),
                 value_at(extract_float, row, taxon_id_column),
                 library_name))
     result.reads = reads
     return result
Beispiel #29
0
 def test_supplier_name_with_valid_char_should_pass_validation(self):
     # A supplier made of letters and spaces yields no errors.
     sheet = Spreadsheet.new_instance("name", supplier="This should work")
     errors = validate_no_abnormal_characters_in_supplier_name(sheet)
     self.assertEqual([], errors)
Beispiel #30
0
 def test_valid_name_for_external_data_part_of_internal_study(self):
     # A study name ending in '_external' yields no errors.
     sheet = Spreadsheet.new_instance("345_external", [])
     errors = validate_external_data_part_of_internal_sequencing_study_name(sheet)
     self.assertEqual([], errors)