Example #1
def from_gwu_chem_UVVIS(filelist,
                        sortnames=False,
                        shortname=True,
                        cut_extension=False,
                        name=''):
    ''' Format for comma-delimited, two-column data from GWU chemistry's UVVis.  These files have no useful
    metadata or dark data, so it is important that users either pass in a correctly sorted filelist or
    reorder the columns afterwards.  Once the dataframe is created, one can do df=df.reindex(columns=[correct order]).

    It uses read_csv() to create a list of dataframes, which concat() then merges.

    Kwds:
       sortnames- Will attempt to autosort the filelist.  Otherwise, the order of the files passed in is
                  used directly as the column order.
       shortname- If False, the full file path is used as the column name.  If True, only the filename is used.

       cut_extension- If using the shortname, determines whether the file extension is kept or cut from the
                      column name.'''

    if shortname:
        fget = lambda x: get_shortname(x, cut_extension=cut_extension)
    else:
        fget = lambda x: x

    ### Either full names or short names of filelist
    working_names = [fget(afile) for afile in filelist]

    dflist = [
        read_csv(
            afile,
            sep=',',
            header=None,
            index_col=0,
            skiprows=2,
            na_values=' ',  #Used to be ' \r', or is this from IR?
            names=[fget(afile)]) for afile in filelist
    ]

    ### Known issue: this concat can put NaNs everywhere except one file when the indices differ, but dflist itself was fine.
    dataframe = concat(dflist, axis=1)

    ### concat may reorder the columns, so reindex to enforce the sorted order
    if sortnames:
        dataframe = dataframe.reindex(columns=sorted(working_names))

    dataframe = TimeSpectra(dataframe)  # Wrap in TimeSpectra; this step works fine

    dataframe.metadata = None
    dataframe.filedict = None
    dataframe.baseline = None
    dataframe.specunit = 'nm'  # This is autodetected in plots
    if name:
        dataframe.name = name

    return dataframe
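A minimal usage sketch for the function above (assumed to be in scope); the ./uvvis_runs/ directory, the file pattern, and the run name are hypothetical:

import glob

# Hypothetical directory of exported two-column UV-Vis CSV files
filelist = sorted(glob.glob('./uvvis_runs/*.csv'))

# Column order follows the already-sorted file list, so sortnames is left False
ts = from_gwu_chem_UVVIS(filelist,
                         sortnames=False,
                         shortname=True,      # bare filenames as column labels
                         cut_extension=True,  # drop '.csv' from the labels
                         name='uvvis_run_01')

print(ts.columns)  # one column per input file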
Example #2
def from_gwu_chem_UVVIS(filelist, sortnames=False, shortname=True, cut_extension=False, name=''):
    ''' Format for comma-delimited, two-column data from GWU chemistry's UVVis.  These files have no useful
    metadata or dark data, so it is important that users either pass in a correctly sorted filelist or
    reorder the columns afterwards.  Once the dataframe is created, one can do df=df.reindex(columns=[correct order]).

    It uses read_csv() to create a list of dataframes, which concat() then merges.

    Kwds:
       sortnames- Will attempt to autosort the filelist.  Otherwise, the order of the files passed in is
                  used directly as the column order.
       shortname- If False, the full file path is used as the column name.  If True, only the filename is used.

       cut_extension- If using the shortname, determines whether the file extension is kept or cut from the
                      column name.'''

    if shortname:
        fget=lambda x:get_shortname(x, cut_extension=cut_extension)
    else:
        fget=lambda x: x
    
    ### Either full names or short names of filelist    
    working_names=[fget(afile) for afile in filelist]
        

    dflist=[read_csv(afile, sep=',', header=None, index_col=0, skiprows=2, na_values=' ',  #Used to be ' \r', or is this from IR?
                               names=[fget(afile)]) for afile in filelist]
    
    ### Known issue: this concat can put NaNs everywhere except one file when the indices differ, but dflist itself was fine.
    dataframe=concat(dflist, axis=1)
                        
    ### concat may reorder the columns, so reindex to enforce the sorted order
    if sortnames:
        dataframe=dataframe.reindex(columns=sorted(working_names))

    dataframe=TimeSpectra(dataframe) #Wrap in TimeSpectra; this step works fine

    dataframe.metadata=None
    dataframe.filedict=None
    dataframe.baseline=None
    dataframe.specunit='nm' #This is autodetected in plots
    if name:
        dataframe.name=name
    
    return dataframe
Example #3
def from_spec_files(file_list, name='', skiphead=17, skipfoot=1, check_for_overlapping_time=True, extract_dark=True):
    ''' Takes in raw files directly from Ocean Optics USB2000 and USB650 spectrometers and returns a
    skspec TimeSpectra.  If the spectral data are stored without a header, this can be called with skiphead=0.

    Parameters
    ----------
       name: Sets the name of the returned TimeSpectra.

       check_for_overlapping_time: Raises an error if any files have identical times.  Otherwise, the time
                                   is overwritten.  Really only useful for testing or other corner-case instances.

       extract_dark: Attempts to find a filename with a case-insensitive string match to "dark".  If no dark
                     spectrum is found, a warning is printed.  If multiple darks are found, an error is raised.

       skiphead/skipfoot: Mostly a reminder that this filetype has a 17-line header and a 1-line footer.

    Notes
    -----
        Built to work with 2-column data only.

        The dataframe is constructed from a dictionary of Series keyed by datetime.
        The returned TimeSpectra gets a filedict attribute (a dictionary keyed by datetime that maps each
        column to its source file), plus baseline and metadata attributes, so no information is lost
        between files.

        Constructed to work for non-equally spaced datafiles, or non-identical data (i.e., wavelengths can have NaNs).
    '''

    dict_of_series={} #Dict of series eventually merged to dataframe   
    time_file_dict={} #Dict of time:filename (darkfile intentionally excluded)
    
    _overlap_count = 0 # Tracks if overlapping occurs

    ### If looking for a darkfile, this will find it.  Bit redundant but I'm lazy..###
    darkfile=None   #Ensure these exist even if extract_dark is False
    baseline=None
    if extract_dark:
        darkfile=extract_darkfile(file_list, return_null=True)

        if darkfile:
            with open(darkfile) as f:
                header=[next(f).strip() for x in range(skiphead)]

            wavedata=np.genfromtxt(darkfile, dtype=spec_dtype, skip_header=skiphead, skip_footer=skipfoot)
            darktime=_get_datetime_specsuite(header)
            baseline=Series(wavedata['intensity'], index=wavedata['wavelength'], name=darkfile)

            file_list.remove(darkfile)  #Drop the dark file so it isn't read again below
            
    file_list = [f for f in file_list 
                 if os.path.basename(f) != '.gitignore']

    for infile in file_list:

        ###Read in only the header lines, not all the lines of the file
        ###Strips and splits in one go
        with open(infile) as f:
            header=[next(f).strip() for x in range(skiphead)]

        #Store wavelength, intensity data in a 2-column structured array for easy item lookup
        #Eg wavedata['wavelength']
        wavedata=np.genfromtxt(infile, dtype=spec_dtype, skip_header=skiphead, skip_footer=skipfoot) 

        # Extract time data from header
        datetime=_get_datetime_specsuite(header) 
        
        if datetime in time_file_dict:
            _overlap_count += 1        

        # Make sure timepoints aren't overlapping with any others
        if check_for_overlapping_time and _overlap_count:
            raise IOError('Duplicate time %s found between files %s, %s.'
                          ' To overwrite, set check_for_overlapping_time = False.'
                          % (datetime, infile, time_file_dict[datetime]))
            

        time_file_dict[datetime]=infile
        dict_of_series[datetime]=Series(wavedata['intensity'], index=wavedata['wavelength'])

    ### Make timespec, add filenames, baseline and metadata attributes (note, DateTimeIndex auto sorts!!)
    timespec=TimeSpectra(DataFrame(dict_of_series), name=name) #DataFrame because TS doesn't handle a dict of Series
    timespec.specunit='nm'
    timespec.filedict=time_file_dict
    timespec.baseline=baseline  #Keep this as the dark Series; recall it is separate from the reference

    ### Take metadata from first file in filelist that isn't darkfile
    for infile in file_list:
        if infile != darkfile:
            with open(infile) as f:
                header=[next(f).strip() for x in range(skiphead)]
            meta_partial=_get_metadata_fromheader(header)
            break      

    meta_general=get_headermetadata_dataframe(timespec, time_file_dict) 
    meta_general.update(meta_partial)
    timespec.metadata=meta_general   

    if _overlap_count:
        logger.warning('Time duplication found in %s of %s files; duplicate entries were '
            'overwritten.' % (_overlap_count, len(file_list)))
            
    return timespec
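A minimal usage sketch for from_spec_files above (assumed to be in scope); the ./ocean_optics/ directory and the run name are hypothetical:

import glob

# Hypothetical directory of raw Ocean Optics exports (17-line header, 1-line footer)
file_list = glob.glob('./ocean_optics/*.txt')

ts = from_spec_files(file_list,
                     name='kinetics_run',
                     skiphead=17,
                     skipfoot=1,
                     check_for_overlapping_time=True,  # fail loudly on duplicate timestamps
                     extract_dark=True)                # dark file, if any, becomes ts.baseline

print(ts.shape)     # (wavelengths, timepoints); the DatetimeIndex sorts the columns
print(ts.baseline)  # dark Series, or None if no dark file was found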
Example #4
def from_timefile_datafile(datafile, timefile, extract_dark=True, name=''): 
    ''' Converts old-style spectral data from the GWU physics lab into
    a dataframe with a timestamp column index and a wavelength row index.

    Builds the TimeSpectra from the data matrix, with columns keyed by datetime.
    name becomes the name of the dataframe.'''

    with open(timefile, 'r') as f:
        tlines=[line.strip().split() for line in f.readlines()]
    tlines.pop(0)  #Drop the first line (header)

    time_file_dict=dict((_get_datetime_timefile(tline),tline[0]) for tline in tlines)

    ### Read in data matrix, separate first column (wavelengths) from the rest of the data
    wavedata=np.genfromtxt(datafile, dtype='float', skip_header=1)
    data, wavelengths=wavedata[:,1::], wavedata[:,0] #Separate wavelength column

    ### Sort datetimes here before assigning/removing dark spec etc...
    sorted_tfd=sorted(time_file_dict.items())
    sorted_times, sorted_files=zip(*sorted_tfd)

    ### Seek darkfile.  If found, take it out of dataframe. ###
    darkfile=None  #Ensure defined even when extract_dark is False
    if extract_dark:
        darkfile=extract_darkfile(sorted_files, return_null=True)

    if darkfile:
        ####Find baseline by reverse lookup (lookup by value) and get index position

        #darkindex, darktime=[(idx, time) for idx, (time, afile) in enumerate(sorted_tfd) if afile == darkfile][0]
        darkindex=sorted_files.index(darkfile)
        darktime=sorted_times[darkindex]
        baseline=Series(data[:,darkindex], index=wavelengths, name=darkfile) 


        del time_file_dict[darktime] #Intentionally remove
        sorted_times=list(sorted_times) #Need to do in two steps
        sorted_times.remove(darktime)
        data=np.delete(data, darkindex, 1)  #Delete dark column from numpy data           
    else:
        baseline=None

    dataframe=TimeSpectra(data, columns=sorted_times, index=wavelengths)      
    

    ### Add field attributes to dataframe
    dataframe.baseline=baseline 
    dataframe.filedict=time_file_dict
    if name:
        dataframe.name=name

    ### Get headermeta data from first line in timefile that isn't darkfile.  Only checks one line
    ### Does not check for consistency
    for line in tlines:
        if line[0] != darkfile:
            meta_partial=_get_headermetadata_timefile(line[0])  #DOUBLE CHECK THIS WORKS
            break

    ### Extract remaining metadata (file/time info) and return ###
    meta_general=get_headermetadata_dataframe(dataframe, time_file_dict) 
    meta_general.update(meta_partial)
    dataframe.metadata=meta_general
    dataframe.specunit='nm'  #This is autodetected in plots

    ### Sort dataframe by ascending time (could also sort spectral data) ###
    dataframe.sort_index(axis=1, inplace=True) #axis=1 sorts the (time) columns

    return dataframe
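A minimal usage sketch for from_timefile_datafile above (assumed to be in scope); spectra.txt, times.txt, and the run name are hypothetical:

# spectra.txt: data matrix whose first column holds wavelengths (one header row)
# times.txt:   companion time file, one line per spectrum
ts = from_timefile_datafile('spectra.txt', 'times.txt',
                            extract_dark=True,   # dark column removed and kept as ts.baseline
                            name='gwu_phys_run')

print(ts.columns[:3])  # timestamps, sorted ascending
print(ts.index[:3])    # wavelengths (ts.specunit == 'nm')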
Example #5
def from_timefile_datafile(datafile, timefile, extract_dark=True, name=''):
    ''' Converts old-style spectral data from the GWU physics lab into
    a dataframe with a timestamp column index and a wavelength row index.

    Builds the TimeSpectra from the data matrix, with columns keyed by datetime.
    name becomes the name of the dataframe.'''

    with open(timefile, 'r') as f:
        tlines = [line.strip().split() for line in f.readlines()]
    tlines.pop(0)  # Drop the first line (header)

    time_file_dict = dict(
        (_get_datetime_timefile(tline), tline[0]) for tline in tlines)

    ### Read in data matrix, separate first column (wavelengths) from the rest of the data
    wavedata = np.genfromtxt(datafile, dtype='float', skip_header=1)
    data, wavelengths = wavedata[:, 1:], wavedata[:, 0]  #Separate wavelength column

    ### Sort datetimes here before assigning/removing dark spec etc...
    sorted_tfd = sorted(time_file_dict.items())
    sorted_times, sorted_files = zip(*sorted_tfd)

    ### Seek darkfile.  If found, take it out of dataframe. ###
    darkfile = None  # Ensure defined even when extract_dark is False
    if extract_dark:
        darkfile = extract_darkfile(sorted_files, return_null=True)

    if darkfile:
        ####Find baseline by reverse lookup (lookup by value) and get index position

        #darkindex, darktime=[(idx, time) for idx, (time, afile) in enumerate(sorted_tfd) if afile == darkfile][0]
        darkindex = sorted_files.index(darkfile)
        darktime = sorted_times[darkindex]
        baseline = Series(data[:, darkindex], index=wavelengths, name=darkfile)

        del time_file_dict[darktime]  #Intentionally remove
        sorted_times = list(sorted_times)  #Need to do in two steps
        sorted_times.remove(darktime)
        data = np.delete(data, darkindex, 1)  #Delete dark column from numpy data
    else:
        baseline = None

    dataframe = TimeSpectra(data, columns=sorted_times, index=wavelengths)

    ### Add field attributes to dataframe
    dataframe.baseline = baseline
    dataframe.filedict = time_file_dict
    if name:
        dataframe.name = name

    ### Get headermeta data from first line in timefile that isn't darkfile.  Only checks one line
    ### Does not check for consistency
    for line in tlines:
        if line[0] != darkfile:
            meta_partial = _get_headermetadata_timefile(line[0])  #DOUBLE CHECK THIS WORKS
            break

    ### Extract remaining metadata (file/time info) and return ###
    meta_general = get_headermetadata_dataframe(dataframe, time_file_dict)
    meta_general.update(meta_partial)
    dataframe.metadata = meta_general
    dataframe.specunit = 'nm'  #This is autodetected in plots

    ### Sort dataframe by ascending time (could also sort spectral data) ###
    dataframe.sort_index(axis=1, inplace=True)  #axis=1 sorts the (time) columns

    return dataframe
Example #6
def from_spec_files(file_list,
                    name='',
                    skiphead=17,
                    skipfoot=1,
                    check_for_overlapping_time=True,
                    extract_dark=True):
    ''' Takes in raw files directly from Ocean Optics USB2000 and USB650 spectrometers and returns a
    skspec TimeSpectra.  If the spectral data are stored without a header, this can be called with skiphead=0.

    Parameters
    ----------
       name: Sets the name of the returned TimeSpectra.

       check_for_overlapping_time: Raises an error if any files have identical times.  Otherwise, the time
                                   is overwritten.  Really only useful for testing or other corner-case instances.

       extract_dark: Attempts to find a filename with a case-insensitive string match to "dark".  If no dark
                     spectrum is found, a warning is printed.  If multiple darks are found, an error is raised.

       skiphead/skipfoot: Mostly a reminder that this filetype has a 17-line header and a 1-line footer.

    Notes
    -----
        Built to work with 2-column data only.

        The dataframe is constructed from a dictionary of Series keyed by datetime.
        The returned TimeSpectra gets a filedict attribute (a dictionary keyed by datetime that maps each
        column to its source file), plus baseline and metadata attributes, so no information is lost
        between files.

        Constructed to work for non-equally spaced datafiles, or non-identical data (i.e., wavelengths can have NaNs).
    '''

    dict_of_series = {}  #Dict of series eventually merged to dataframe
    time_file_dict = {}  #Dict of time:filename (darkfile intentionally excluded)

    _overlap_count = 0  # Tracks if overlapping occurs

    ### If looking for a darkfile, this will find it.  Bit redundant but I'm lazy..###
    darkfile = None  # Ensure these exist even if extract_dark is False
    baseline = None
    if extract_dark:
        darkfile = extract_darkfile(file_list, return_null=True)

        if darkfile:
            with open(darkfile) as f:
                header = [next(f).strip() for x in range(skiphead)]

            wavedata = np.genfromtxt(darkfile,
                                     dtype=spec_dtype,
                                     skip_header=skiphead,
                                     skip_footer=skipfoot)
            darktime = _get_datetime_specsuite(header)
            baseline = Series(wavedata['intensity'],
                              index=wavedata['wavelength'],
                              name=darkfile)

            file_list.remove(darkfile)  # Drop the dark file so it isn't read again below

    file_list = [f for f in file_list if os.path.basename(f) != '.gitignore']

    for infile in file_list:

        ###Read in only the header lines, not all the lines of the file
        ###Strips and splits in one go
        with open(infile) as f:
            header = [next(f).strip() for x in range(skiphead)]

        #Store wavelength, intensity data in a 2-column structured array for easy item lookup
        #Eg wavedata['wavelength']
        wavedata = np.genfromtxt(infile,
                                 dtype=spec_dtype,
                                 skip_header=skiphead,
                                 skip_footer=skipfoot)

        # Extract time data from header
        datetime = _get_datetime_specsuite(header)

        if datetime in time_file_dict:
            _overlap_count += 1

        # Make sure timepoints aren't overlapping with any others
        if check_for_overlapping_time and _overlap_count:
            raise IOError(
                'Duplicate time %s found between files %s, %s.'
                ' To overwrite, set check_for_overlapping_time = False.' %
                (datetime, infile, time_file_dict[datetime]))

        time_file_dict[datetime] = infile
        dict_of_series[datetime] = Series(wavedata['intensity'],
                                          index=wavedata['wavelength'])

    ### Make timespec, add filenames, baseline and metadata attributes (note, DateTimeIndex auto sorts!!)
    timespec = TimeSpectra(
        DataFrame(dict_of_series),
        name=name)  #DataFrame because TS doesn't handle a dict of Series
    timespec.specunit = 'nm'
    timespec.filedict = time_file_dict
    timespec.baseline = baseline  # Keep this as the dark Series; recall it is separate from the reference

    ### Take metadata from first file in filelist that isn't darkfile
    for infile in file_list:
        if infile != darkfile:
            with open(infile) as f:
                header = [next(f).strip() for x in range(skiphead)]
            meta_partial = _get_metadata_fromheader(header)
            break

    meta_general = get_headermetadata_dataframe(timespec, time_file_dict)
    meta_general.update(meta_partial)
    timespec.metadata = meta_general

    if _overlap_count:
        logger.warning(
            'Time duplication found in %s of %s files; duplicate entries were '
            'overwritten.' % (_overlap_count, len(file_list)))

    return timespec