# Example #1
def from_gwu_chem_UVVIS(filelist, sortnames=False, shortname=True, cut_extension=False, name=''):
    ''' Build a TimeSpectra from comma-delimited, two-column GWU chemistry UV-Vis files.

    These files carry no useful metadata or dark data, so it is important that
    users pass in a correctly sorted filelist.  Once the dataframe is created,
    one can do df=df.reindex(columns=[correct order]).

    Each file is read with read_csv() into a one-column dataframe; afterwards,
    concat() merges the list of dataframes into one.

    Kwds:
       sortnames- Will attempt to autosort the filelist. Otherwise, order of files passed in is
                  directly used as columns.
       shortname- If false, full file path is used as the column name.  If true, only the filename is used.

       cut_extension- If using the shortname, this will determine if the file extension is saved or cut from the data.'''

    if shortname:
        fget=lambda x: get_shortname(x, cut_extension=cut_extension)
    else:
        fget=lambda x: x

    ### Either full names or short names of filelist (computed once, reused below)
    working_names=[fget(afile) for afile in filelist]

    ### One single-column dataframe per file; column name taken from working_names
    ### instead of recomputing fget(afile) a second time.
    dflist=[read_csv(afile, sep=',', header=None, index_col=0, skiprows=2,
                     na_values=' ',  #Used to be ' \r', or is this from IR?
                     names=[colname])
            for afile, colname in zip(filelist, working_names)]

    ### NOTE(review): concat can introduce NaNs where wavelength indices differ
    ### between files; dflist itself is fine.
    dataframe=concat(dflist, axis=1)

    ### concat tries to sort these, so this will preserve the sort order
    if sortnames:
        dataframe=dataframe.reindex(columns=sorted(working_names))

    dataframe=TimeSpectra(dataframe)

    dataframe.metadata=None
    dataframe.filedict=None
    dataframe.baseline=None
    dataframe.specunit='nm' #This autodetected in plots
    if name:
        dataframe.name=name

    return dataframe
def from_spec_files(file_list, specframe=None, skiphead=17, skipfoot=1,
                    check_for_overlapping_time=True, extract_dark=True, name=None):
    ''' Takes in raw files directly from Ocean optics spectrometer and creates a spectral dataframe.
    This is somewhat customized to my analysis, but has general arguments for future adaptations.

    Built to work with 2-column data only!!!

    Dataframe is constructed from a list of dictionaries.
    Each dataframe gets an appended headerdata attribute (dataframe.headerdata) which is a dictionary,
    keyed by columns and stores (infile, header, footer) data so no info is lost between files.

    Constructed to work for non-equally spaced datafiles, or non-identical data (aka wavelengths can have nans).
    **kwargs
       - check_for_overlapping will raise errors if any files have identical times. Otherwise, time
       is overwritten.  Really only useful for testing or otherwise cornercase instances.
       - Extract dark will attempt to find a filename with caseinsenstive string match to "dark".  If dark
       not found, will print warning.  If multiple darks found, will raise error.
       -skiphead/skipfoot are mostly for reminder that this filetype has a 17 line header and a 1 line footer.'''

    ### Need to add functionality for spectral dataframe (join/concat)
    if specframe:
        # BUGFIX: was `raise NotImplemented` -- NotImplemented is not an
        # exception class, so raising it is a TypeError at runtime.
        raise NotImplementedError

    dict_of_series={} #Dict of series eventually merged to dataframe
    time_file_dict={} #Dict of time:filename (darkfile intentionally excluded)

    # BUGFIX: define these up-front so extract_dark=False no longer raises a
    # NameError when darkfile/darkseries are referenced further down.
    darkfile=None
    darkseries=None

    ### If looking for a darkfile, this will find it.  Bit redundant but I'm lazy..###
    if extract_dark:
        darkfile=extract_darkfile(file_list, return_null=True)

        if darkfile:
            with open(darkfile) as f:
                header=[f.next().strip() for x in xrange(skiphead)]

            wavedata=np.genfromtxt(darkfile, dtype=spec_dtype, skip_header=skiphead, skip_footer=skipfoot)
            darktime=_get_datetime_specsuite(header)
            darkseries=Series(wavedata['intensity'], index=wavedata['wavelength'], name=darkfile)

            file_list.remove(darkfile)
            # BUGFIX: removed redundant f.close(); the `with` block already closed the file.

    for infile in file_list:

        ###Read in only the header lines, not all the lines of the file
        ###Strips and splits in one go
        with open(infile) as f:
            header=[f.next().strip() for x in xrange(skiphead)]

        ###Store wavelength, intensity data in a 2-column datatime for easy itemlookup
        ###Eg wavedata['wavelength']
        wavedata=np.genfromtxt(infile, dtype=spec_dtype, skip_header=skiphead, skip_footer=skipfoot)

        ### Extract time data from header
        datetime=_get_datetime_specsuite(header)

        ### Make sure timepoints aren't overlapping with any others
        if check_for_overlapping_time and datetime in time_file_dict:
            raise KeyError('Duplicate time %s found in between files %s, %s'\
                           %(datetime,infile, time_file_dict[datetime]) )

        time_file_dict[datetime]=infile
        dict_of_series[datetime]=Series(wavedata['intensity'], index=wavedata['wavelength'])

        # BUGFIX: removed redundant f.close(); the `with` block already closed the file.

    ### Make dataframe, add filenames, darkseries and metadata attributes (note, DateTimeIndex auto sorts!!)
    dataframe=TimeSpectra(dict_of_series)
    dataframe.specunit='nm'
    dataframe.filedict=time_file_dict
    dataframe.darkseries=darkseries  #KEEP THIS AS DARK SERIES RECALL IT IS SEPARATE FROM BASELINE OR REFERENCE..
    dataframe.name=name

    ### Take metadata from first file in filelist that isn't darkfile
    meta_partial={}  # BUGFIX: guard against file_list containing no non-dark files
    for infile in file_list:
        if infile!=darkfile:
            with open(infile) as f:
                header=[f.next().strip() for x in xrange(skiphead)]
            meta_partial=_get_metadata_fromheader(header)
            break

    meta_general=get_headermetadata_dataframe(dataframe, time_file_dict)
    meta_general.update(meta_partial)
    dataframe.metadata=meta_general

    return dataframe
def from_timefile_datafile(datafile, timefile, extract_dark=True, name=None):
    ''' Converts old-style spectral data from GWU phys lab into
    a dataframe with timestamp column index and wavelength row indicies.

    Creates the DataFrame from a dictionary of Series, keyed by datetime.
    **name becomes name of dataframe'''

    # BUGFIX: read timefile via a context manager so the handle is always
    # closed (the old open(...).readlines() leaked the file object).
    with open(timefile, 'r') as tf:
        tlines=[line.strip().split() for line in tf.readlines()]
    tlines.pop(0)  # Drop the header row of the timefile

    time_file_dict=dict((_get_datetime_timefile(tline),tline[0]) for tline in tlines)

    ### Read in data matrix, separate first row (wavelengths) from the rest of the data
    wavedata=np.genfromtxt(datafile, dtype='float', skip_header=1)
    data, wavelengths=wavedata[:,1::], wavedata[:,0] #Separate wavelength column

    ### Sort datetimes here before assigning/removing dark spec etc...
    sorted_tfd=sorted(time_file_dict.items())
    sorted_times, sorted_files=zip(*sorted_tfd)

    ### Seek darkfile.  If found, take it out of dataframe. ###
    # BUGFIX: default to None so extract_dark=False no longer raises a
    # NameError at the `if darkfile:` check below.
    darkfile=None
    if extract_dark:
        darkfile=extract_darkfile(sorted_files, return_null=True)

    if darkfile:
        ####Find darkseries by reverse lookup (lookup by value) and get index position
        darkindex=sorted_files.index(darkfile)
        darktime=sorted_times[darkindex]
        darkseries=Series(data[:,darkindex], index=wavelengths, name=darkfile)

        del time_file_dict[darktime] #Intentionally remove
        sorted_times=list(sorted_times) #Need to do in two steps
        sorted_times.remove(darktime)
        data=np.delete(data, darkindex, 1)  #Delete dark column from numpy data
    else:
        darkseries=None

    dataframe=TimeSpectra(data, columns=sorted_times, index=wavelengths)

    ### Add field attributes to dataframe
    dataframe.darkseries=darkseries
    dataframe.filedict=time_file_dict
    dataframe.name=name

    ### Get headermeta data from first line in timefile that isn't darkfile.  Only checks one line
    ### Does not check for consistency
    meta_partial={}  # BUGFIX: guard against every timefile line matching the darkfile
    for line in tlines:
        if line[0]!=darkfile:
            meta_partial=_get_headermetadata_timefile(line[0])  #DOUBLE CHECK THIS WORKS
            break

    ### Extract remaining metadata (file/time info) and return ###
    meta_general=get_headermetadata_dataframe(dataframe, time_file_dict)
    meta_general.update(meta_partial)
    dataframe.metadata=meta_general
    dataframe.specunit='nm'  #This autodetected in plots

    ### Sort dataframe by ascending time (could also sort spectral data) ###
    dataframe.sort(axis=1, inplace=True) #axis1=columns (legacy pandas API kept intentionally)

    return dataframe
# Example #4
def from_spec_files(file_list, name='', skiphead=17, skipfoot=1, check_for_overlapping_time=True, extract_dark=True):
    ''' Takes in raw files directly from Ocean optics USB2000 and USB650 spectrometers and returns a
    pyuvvis TimeSpectra. If spectral data stored without header, can be called with skiphead=0.

    Parameters
    ----------
       name: Set name of returned TimeSpectra.

       check_for_overlapping_time: will raise errors if any files have identical times. Otherwise, time
                                   is overwritten.  Really only useful for testing or otherwise cornercase instances.

       extract_dark: Attempt to find a filename with caseinsenstive string match to "dark".  If dark spectrum
                     not found, will print warning.  If multiple darks found, will raise error.

       skiphead/skipfoot: Mostly for reminder that this filetype has a 17 line header and a 1 line footer.

    Notes
    -----
        Built to work with 2-column data only!!!

        Dataframe is constructed from a list of dictionaries.
        Each dataframe gets an appended headerdata attribute (dataframe.headerdata) which is a dictionary,
        keyed by columns and stores (infile, header, footer) data so no info is lost between files.

        Constructed to work for non-equally spaced datafiles, or non-identical data (aka wavelengths can have nans).
    '''

    dict_of_series={} #Dict of series eventually merged to dataframe
    time_file_dict={} #Dict of time:filename (darkfile intentionally excluded)

    _overlap_count = 0 # Tracks if overlapping occurs

    # BUGFIX: define these up-front so extract_dark=False no longer raises a
    # NameError when darkfile/baseline are referenced further down.
    darkfile=None
    baseline=None

    ### If looking for a darkfile, this will find it.  Bit redundant but I'm lazy..###
    if extract_dark:
        darkfile=extract_darkfile(file_list, return_null=True)

        if darkfile:
            with open(darkfile) as f:
                header=[f.next().strip() for x in xrange(skiphead)]

            wavedata=np.genfromtxt(darkfile, dtype=spec_dtype, skip_header=skiphead, skip_footer=skipfoot)
            darktime=_get_datetime_specsuite(header)
            baseline=Series(wavedata['intensity'], index=wavedata['wavelength'], name=darkfile)

            file_list.remove(darkfile)
            # BUGFIX: removed redundant f.close(); the `with` block already closed the file.

    # Skip version-control placeholder files that may live in the data directory
    file_list = [f for f in file_list
                 if os.path.basename(f) != '.gitignore']

    for infile in file_list:

        ###Read in only the header lines, not all the lines of the file
        ###Strips and splits in one go
        with open(infile) as f:
            header=[f.next().strip() for x in xrange(skiphead)]

        #Store wavelength, intensity data in a 2-column datatime for easy itemlookup
        #Eg wavedata['wavelength']
        wavedata=np.genfromtxt(infile, dtype=spec_dtype, skip_header=skiphead, skip_footer=skipfoot)

        # Extract time data from header
        datetime=_get_datetime_specsuite(header)

        if datetime in time_file_dict:
            _overlap_count += 1

        # Make sure timepoints aren't overlapping with any others
        if check_for_overlapping_time and _overlap_count:
            raise IOError('Duplicate time %s found in between files %s, %s.'
                          ' To overwrite, set check_for_overlapping_time = False.'
                           %( datetime, infile, time_file_dict[datetime] ))

        time_file_dict[datetime]=infile
        dict_of_series[datetime]=Series(wavedata['intensity'], index=wavedata['wavelength'])

        # BUGFIX: removed redundant f.close(); the `with` block already closed the file.

    ### Make timespec, add filenames, baseline and metadata attributes (note, DateTimeIndex auto sorts!!)
    timespec=TimeSpectra(DataFrame(dict_of_series), name=name) #Dataframe beacuse TS doesn't handle dict of series
    timespec.specunit='nm'
    timespec.filedict=time_file_dict
    timespec.baseline=baseline  #KEEP THIS AS DARK SERIES RECALL IT IS SEPARATE FROM reference OR REFERENCE..

    ### Take metadata from first file in filelist that isn't darkfile
    meta_partial={}  # BUGFIX: guard against file_list containing no non-dark files
    for infile in file_list:
        if infile != darkfile:
            with open(infile) as f:
                header=[f.next().strip() for x in xrange(skiphead)]
            meta_partial=_get_metadata_fromheader(header)
            break

    meta_general=get_headermetadata_dataframe(timespec, time_file_dict)
    meta_general.update(meta_partial)
    timespec.metadata=meta_general

    if _overlap_count:
        logger.warn('Time duplication found in %s of %s files.  Duplicates were '
            'removed!' % (_overlap_count, len(file_list)))

    return timespec
# Example #5
def from_timefile_datafile(datafile, timefile, extract_dark=True, name=''):
    ''' Converts old-style spectral data from GWU phys lab into
    a dataframe with timestamp column index and wavelength row indicies.

    Creates the DataFrame from a dictionary of Series, keyed by datetime.
    **name becomes name of dataframe'''

    # BUGFIX: read timefile via a context manager so the handle is always
    # closed (the old open(...).readlines() leaked the file object).
    with open(timefile, 'r') as tf:
        tlines=[line.strip().split() for line in tf.readlines()]
    tlines.pop(0)  # Drop the header row of the timefile

    time_file_dict=dict((_get_datetime_timefile(tline),tline[0]) for tline in tlines)

    ### Read in data matrix, separate first row (wavelengths) from the rest of the data
    wavedata=np.genfromtxt(datafile, dtype='float', skip_header=1)
    data, wavelengths=wavedata[:,1::], wavedata[:,0] #Separate wavelength column

    ### Sort datetimes here before assigning/removing dark spec etc...
    sorted_tfd=sorted(time_file_dict.items())
    sorted_times, sorted_files=zip(*sorted_tfd)

    ### Seek darkfile.  If found, take it out of dataframe. ###
    # BUGFIX: default to None so extract_dark=False no longer raises a
    # NameError at the `if darkfile:` check below.
    darkfile=None
    if extract_dark:
        darkfile=extract_darkfile(sorted_files, return_null=True)

    if darkfile:
        ####Find baseline by reverse lookup (lookup by value) and get index position
        darkindex=sorted_files.index(darkfile)
        darktime=sorted_times[darkindex]
        baseline=Series(data[:,darkindex], index=wavelengths, name=darkfile)

        del time_file_dict[darktime] #Intentionally remove
        sorted_times=list(sorted_times) #Need to do in two steps
        sorted_times.remove(darktime)
        data=np.delete(data, darkindex, 1)  #Delete dark column from numpy data
    else:
        baseline=None

    dataframe=TimeSpectra(data, columns=sorted_times, index=wavelengths)

    ### Add field attributes to dataframe
    dataframe.baseline=baseline
    dataframe.filedict=time_file_dict
    if name:
        dataframe.name=name

    ### Get headermeta data from first line in timefile that isn't darkfile.  Only checks one line
    ### Does not check for consistency
    meta_partial={}  # BUGFIX: guard against every timefile line matching the darkfile
    for line in tlines:
        if line[0]!=darkfile:
            meta_partial=_get_headermetadata_timefile(line[0])  #DOUBLE CHECK THIS WORKS
            break

    ### Extract remaining metadata (file/time info) and return ###
    meta_general=get_headermetadata_dataframe(dataframe, time_file_dict)
    meta_general.update(meta_partial)
    dataframe.metadata=meta_general
    dataframe.specunit='nm'  #This autodetected in plots

    ### Sort dataframe by ascending time (could also sort spectral data) ###
    dataframe.sort(axis=1, inplace=True) #axis1=columns (legacy pandas API kept intentionally)

    return dataframe