Example #1
    def test_novaseq_runinfo(self):
        """ A (more recent) NovaSeq example
        """
        rip = RunInfoXMLParser( DATA_DIR + '/180619_A00291_0044_BH5WJJDMXX' )
        self.assertEqual(rip.run_info, {
                            'Cycles': '51 [8] [8] 51',
                            'Flowcell': 'H5WJJDMXX',
                            'FCType': 'S2',
                            'Instrument': 'novaseq_A00291',
                            'LaneCount': 2,
                            'RunDate': '2018-06-19',
                            'RunId': '180619_A00291_0044_BH5WJJDMXX'
        })
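The values asserted here largely mirror the standard Illumina run-folder naming scheme, '<YYMMDD>_<instrument>_<run number>_<flowcell side + ID>'; the parser additionally prefixes the instrument with the platform (e.g. 'novaseq_A00291'). A minimal sketch of that mapping, using a hypothetical split_run_id helper rather than anything from the library itself:

def split_run_id(run_id):
    """ Split '<YYMMDD>_<instrument>_<run number>_<side + flowcell>'
        (illustration only - RunInfoXMLParser reads these from RunInfo.xml).
    """
    date, instrument, run_number, flowcell = run_id.split('_')
    return {'RunDate': '20{}-{}-{}'.format(date[0:2], date[2:4], date[4:6]),
            'Instrument': instrument,
            'Flowcell': flowcell[1:]}    # strip the leading A/B side indicator

assert split_run_id('180619_A00291_0044_BH5WJJDMXX') == \
        {'RunDate': '2018-06-19', 'Instrument': 'A00291', 'Flowcell': 'H5WJJDMXX'}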
Example #2
    def test_miseq_runinfo2(self):
        """ A newer MiSeq example
        """
        rip = RunInfoXMLParser( DATA_DIR + '/180430_M05898_0007_000000000-BR92R' )
        self.assertEqual(rip.run_info, {
                            'Cycles': '26 [8] [8] 26',
                            'Flowcell': 'BR92R',
                            'FCType': 'Normal v2',
                            'Instrument': 'miseq_M05898',
                            'LaneCount': 1,
                            'RunDate': '2018-04-30',
                            'RunId': '180430_M05898_0007_000000000-BR92R'
        })
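The 'Cycles' string lists the read lengths in run order with index reads in square brackets. A sketch (assumed, not the library's own code) of how such a string can be built from the read_and_length and read_and_indexed mappings that the parser exposes (see Examples #5 and #8):

def format_cycles(read_and_length, read_and_indexed):
    # Index reads ('Y') are bracketed; reads are taken in numeric order.
    return ' '.join('[{}]'.format(read_and_length[r]) if read_and_indexed[r] == 'Y'
                    else str(read_and_length[r])
                    for r in sorted(read_and_length, key=int))

assert format_cycles({'1': '26', '2': '8', '3': '8', '4': '26'},
                     {'1': 'N', '2': 'Y', '3': 'Y', '4': 'N'}) == '26 [8] [8] 26'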
Example #3
    def test_miseq_runinfo1(self):
        """ A MiSeq example
        """
        rip = RunInfoXMLParser( DATA_DIR + '/150602_M01270_0108_000000000-ADWKV' )
        self.assertEqual(rip.run_info, {
                            'Cycles': '301 [8] 301',
                            'Flowcell': 'ADWKV',
                            'FCType': 'Normal v3',
                            'Instrument': 'miseq_M01270',
                            'LaneCount': 1,
                            'RunDate': '2015-06-02',
                            'RunId': '150602_M01270_0108_000000000-ADWKV'
        })
Example #4
    def test_rixp_date_bug(self):
        """ Turns out that dates before the 10th of the month were being mangled.
            Oops.
            Note this flowcell is XP but the RIXP reports it as S1 because it doesn't
            look at the RunParameters file - this is expected.
        """
        rip = RunInfoXMLParser( DATA_DIR + '/210601_A00291_0371_AHF2HCDRXY' )
        self.assertEqual(rip.run_info, {
                            'Cycles': '51 [8] [8] 51',
                            'Flowcell': 'HF2HCDRXY',
                            'FCType': 'S1', # But not really
                            'Instrument': 'novaseq_A00291',
                            'LaneCount': 2,
                            'RunDate': '2021-06-01',
                            'RunId': '210601_A00291_0371_AHF2HCDRXY'
        })
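The fix itself isn't shown in this test, but here is a hypothetical illustration of how days before the 10th can get mangled - formatting the date components as integers rather than keeping the zero-padded digits from the RunId:

date = '210601'                                  # YYMMDD taken from the RunId
bad  = '20{}-{}-{}'.format(int(date[0:2]), int(date[2:4]), int(date[4:6]))
good = '20{}-{}-{}'.format(date[0:2], date[2:4], date[4:6])
assert bad  == '2021-6-1'                        # mangled
assert good == '2021-06-01'                      # what the test above expects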
Example #5
    def __init__(self, run_folder, opts=''):

        # here the RunInfo.xml is parsed into an object
        self.run_path_folder = run_folder
        # In the case where we're looking at a fastqdata directory, examine the
        # seqdata link
        if os.path.isdir(
                os.path.join(self.run_path_folder, 'seqdata', 'pipeline')):
            self.run_path_folder = os.path.join(self.run_path_folder,
                                                'seqdata')

        self.quick_mode = 'q' in opts

        runinfo_xml_location = os.path.join(self.run_path_folder,
                                            'RunInfo.xml')
        self._exists_cache = {}

        self.trigger_cycles = [1]
        self.last_read1_read = 1

        try:
            if self.quick_mode:
                # We only care about instrument (and pipelinestatus)
                self.runinfo_xml = QuickInfo(self.run_path_folder)
            else:
                self.runinfo_xml = RunInfoXMLParser(runinfo_xml_location)

                #Get a list of the first cycle number of each read
                for r, l in sorted(self.runinfo_xml.read_and_length.items()):
                    self.trigger_cycles.append(self.trigger_cycles[-1] +
                                               int(l))

                #At some point, we might redefine read1 as ending after the last index read.
                #For now, we have it ending after the actual first read.

                # try:
                #     self.last_read1_read = max( k for k, v in self.runinfo_xml.read_and_indexed.items()
                #                                 if v == 'Y' )
                # except ValueError:
                #     # No index reads. Keep the default value of 1.
                #     pass

        except Exception:
            #If we can't read it, we can't get much info
            if os.environ.get('DEBUG', '0') != '0': raise
            self.runinfo_xml = None
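To make the trigger_cycles loop above concrete: starting from [1], each read appends the first cycle number of the next read (the previous value plus the read length). With reads of 51, 8, 8 and 51 cycles, as in Example #1 (the int(l) call suggests the lengths are stored as strings; the values here are illustrative):

read_and_length = {'1': '51', '2': '8', '3': '8', '4': '51'}
trigger_cycles = [1]
for r, l in sorted(read_and_length.items()):
    trigger_cycles.append(trigger_cycles[-1] + int(l))
assert trigger_cycles == [1, 52, 60, 68, 119]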
Example #6
    def __init__(self, run_dir, lane, revcomp):

        #Read run_dir and check that the requested lane is in the SampleSheet.
        self.run_dir = run_dir

        self.samplesheet = os.path.join(self.run_dir, "SampleSheet.csv")
        self.runinfo_file = os.path.join(self.run_dir, "RunInfo.xml")

        # This code is a little crufty but is tested and working.
        self.bme = BaseMaskExtractor(self.samplesheet, self.runinfo_file)

        # Get the run name from the RunInfo.xml
        self.run_info = RunInfoXMLParser(self.run_dir).run_info

        # Check the lane is valid. Note lane must be a str
        assert self.bme.get_lanes(), \
            "SampleSheet.csv does not seem to list any lanes."
        assert str(lane) == lane, \
            "{!r} is not a string".format(lane)
        assert lane in [ str(l) for l in self.bme.get_lanes() ], \
            "{!r} not in {!r}".format(lane, self.bme.get_lanes())
        self.lane = lane

        # This sets up self.ini_settings()
        self.get_ini_settings()

        # See if we want to revcomp at all
        if revcomp == 'auto':
            self.revcomp = self.infer_revcomp()
            self.revcomp_label = 'auto ' + (self.revcomp or 'none')
        elif not revcomp:
            self.revcomp = ''
            self.revcomp_label = 'none'
        elif revcomp == 'none':
            # Explicitly none as opposed to implicitly none
            self.revcomp = ''
            self.revcomp_label = 'override none'
        else:
            self.revcomp = revcomp
            self.revcomp_label = 'override ' + revcomp
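For clarity, the revcomp branch above can be read as a pure mapping from the revcomp argument to a (revcomp, revcomp_label) pair; infer_revcomp() is only consulted for 'auto'. A restatement (not part of the class), with 'index2' as a made-up example value:

def resolve_revcomp(revcomp, inferred=''):
    if revcomp == 'auto':
        return inferred, 'auto ' + (inferred or 'none')
    elif not revcomp:
        return '', 'none'
    elif revcomp == 'none':
        return '', 'override none'
    else:
        return revcomp, 'override ' + revcomp

assert resolve_revcomp('')               == ('', 'none')
assert resolve_revcomp('none')           == ('', 'override none')
assert resolve_revcomp('index2')         == ('index2', 'override index2')
assert resolve_revcomp('auto', 'index2') == ('index2', 'auto index2')
assert resolve_revcomp('auto', '')       == ('', 'auto none')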
Example #7
    def test_4k_runinfo(self):
        """ Since we no longer use the 4000, I have not added the flowcell type to the list.
        """
        rip = RunInfoXMLParser( DATA_DIR + '/160614_K00368_0023_AHF724BBXX' )
        self.assertEqual(rip.run_info['FCType'], '8/2/2/28')
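The '8/2/2/28' string looks like a fallback built from the FlowcellLayout counts in RunInfo.xml when the layout is not in the list of known flowcell types; for a HiSeq 4000 the layout is LaneCount=8, SurfaceCount=2, SwathCount=2, TileCount=28. A sketch under that assumption (not confirmed by the parser code shown here):

layout = {'LaneCount': '8', 'SurfaceCount': '2', 'SwathCount': '2', 'TileCount': '28'}
fctype = '/'.join(layout[k] for k in ('LaneCount', 'SurfaceCount', 'SwathCount', 'TileCount'))
assert fctype == '8/2/2/28'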
Example #8
def scan_for_info(run_dir, project_name_list=''):
    """Hoovers up the info and builds a data structure which can
       be serialized to YAML.
    """
    # Load both the RunInfo.xml and (a little later) the SampleSheet.csv
    ri_xml = RunInfoXMLParser(run_dir)

    # Build run info data structure (rids). First just inherit the info
    # from ri_xml (RunId, Instrument, Flowcell, ...)
    rids = ri_xml.run_info.copy()

    # We need this to reliably get the NovaSeq flowcell type.
    # Also, we now care about the experiment name, which is here and lets us link to BaseSpace.
    try:
        run_params = RunParametersXMLParser(run_dir).run_parameters
        if 'Flowcell Type' in run_params:
            rids['FCType'] = run_params['Flowcell Type']
        rids['ExperimentName'] = run_params.get('Experiment Name')
        # This is a CTime based on file timestamps. RunDate on the NovaSeq also
        # gives a timestamp but not on the MiSeq, even post-upgrade. And I don't
    # trust the MiSeq clock in any case.
        rids['RunStartTime'] = run_params.get('Start Time')

        rids['Chemistry'] = get_chemistry(run_params, rids['Instrument'])
    except Exception:
        # Not to worry we can do without this.
        pass

    # Reads are pairs (length, index?)
    rids['CyclesAsList'] = [
        (ri_xml.read_and_length[i], ri_xml.read_and_indexed[i] == 'Y')
        for i in sorted(ri_xml.read_and_length.keys(), key=int)
    ]

    #Which file is actually providing the SampleSheet?
    try:
        rids['SampleSheet'] = os.path.basename(
            os.readlink(run_dir + "/SampleSheet.csv"))
    except OSError:
        # Weird - maybe not a link?
        rids['SampleSheet'] = "SampleSheet.csv"
    try:
        ss_csv = SampleSheetReader(run_dir + "/SampleSheet.csv")
    except Exception:
        # We can live without this if the sample sheet is invalid
        ss_csv = None

    #When is this report being made?
    rids['ReportDateTime'] = printable_date()

    #Slice the sample sheet by lane
    rids['Lanes'] = []
    rids['ProjectInfo'] = {}

    if ss_csv:
        # Snag the 'real' experiment name
        rids['ExperimentSS'] = ss_csv.headers.get('Experiment Name')

        #Translate all the project numbers to names in one go
        #If you try to feed this script an old 2500 Sample Sheet, this is where it will fail.
        assert 'sampleproject' not in ss_csv.column_mapping, \
            "A sampleproject (without the underscore) column was found. Is this an old 2500 SampleSheet?"
        rids['ProjectInfo'] = project_real_name(
            set([
                line[ss_csv.column_mapping['sample_project']]
                for line in ss_csv.samplesheet_data
            ]), project_name_list)

        # NOTE - if a samplesheet has no 'lane' column then we shouldn't really be processing it,
        # but as far as bcl2fastq is concerned this just means all lanes are identical, so for
        # the purposes of this script I'll go with that.
        if 'lane' in ss_csv.column_mapping:
            ss_lanes = [
                line[ss_csv.column_mapping['lane']]
                for line in ss_csv.samplesheet_data
            ]
        else:
            ss_lanes = [str(x + 1) for x in range(int(rids['LaneCount']))]

        for lanenum in sorted(set(ss_lanes)):
            thislane = {'LaneNumber': lanenum}

            #Add lane loading. In reality we probably need to get all lanes in one fetch,
            #but here's a placeholder.
            thislane['Loading'] = get_lane_loading(rids['Flowcell'])

            lines_for_lane = [
                line for line in ss_csv.samplesheet_data
                if 'lane' not in ss_csv.column_mapping
                or line[ss_csv.column_mapping['lane']] == lanenum
            ]

            thislane['Contents'] = summarize_lane(lines_for_lane,
                                                  ss_csv.column_mapping)

            #If the lane contains a single sample, is that one barcode or is it unindexed?
            #We'd like to report which.
            if len(lines_for_lane) == 1:
                index_lengths = ss_csv.get_index_lengths_by_lane()[lanenum]
                #It's unindexed if there are no indices or if they contain only N's.
                thislane['Unindexed'] = not any(index_lengths)
            else:
                thislane['Unindexed'] = False

            rids['Lanes'].append(thislane)

    return rids
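As the docstring says, the returned dict is plain data, so it can be dumped straight to YAML. A usage sketch, assuming run_dir points at a run folder containing RunInfo.xml and SampleSheet.csv:

import yaml

rids = scan_for_info('/path/to/180619_A00291_0044_BH5WJJDMXX')
print(yaml.safe_dump(rids, default_flow_style=False))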