def abstracts_to_vector(abstracts, word_base):
    """Convert every abstract in *abstracts* to its vector form, in place.

    Parameters
    ----------
    abstracts : dict
        Maps keys to raw abstracts; each value is replaced by the vector
        produced by ``abstract_to_vector``.
    word_base : iterable of str
        Vocabulary; each word is mapped to its position in the iteration
        order and that index dict is handed to ``abstract_to_vector``.

    Returns
    -------
    dict
        The same ``abstracts`` dict (mutated in place), for convenience.
    """
    # Parenthesized print works identically on Python 2 and Python 3
    # (the original used the py2-only statement form).
    print('converting abstracts...')
    # word -> position lookup; replaces the original manual counter loop.
    word_base_dict = {word: i for i, word in enumerate(word_base)}
    fish = ProgressFish(total=len(abstracts))
    # Only the values are reassigned, so iterating .items() while
    # writing back through the same keys is safe.
    for done, (key, abstract) in enumerate(abstracts.items(), start=1):
        abstracts[key] = abstract_to_vector(abstract, word_base_dict)
        fish.animate(amount=done)
    return abstracts
def main(config_path, desc_path, target_path):
    """Fetch the git-lfs object described by *desc_path* into *target_path*.

    config_path: ini-style lfs config; the server URL is read from
                 section ``[lfs]``, key ``url``.
    desc_path:   git-lfs pointer file with ``version``/``oid``/``size`` lines.
    target_path: destination path for the downloaded object.

    The download is skipped when an existing target already matches the
    expected size and sha256 oid.
    """
    # The config may have indented lines that RawConfigParser rejects;
    # strip leading whitespace into an in-memory copy first.
    massaged = io.StringIO()
    with io.open(config_path, 'rU') as infile:
        massaged.writelines(line.lstrip() for line in infile)
    massaged.seek(0)
    config = RawConfigParser()
    # NOTE(review): readfp is the deprecated py2-compatible spelling of
    # read_file — presumably kept for Python 2 support; confirm before changing.
    config.readfp(massaged)
    lfs_url = config.get('lfs', 'url').strip('"')
    api_url = posixpath.join(lfs_url, 'objects', 'batch')
    # Pointer file: "key value" per line -> dict via partition(' ')[::2].
    with io.open(desc_path, 'rU') as infile:
        target = dict(line.strip().partition(' ')[::2] for line in infile)
    if target.get('version') != 'https://git-lfs.github.com/spec/v1':
        raise ValueError("can't handle lfs", target['version'])
    oid_type, sep, oid = target['oid'].partition(':')
    if oid_type != 'sha256':
        raise ValueError("can't handle oid", target['oid'])
    size = int(target['size'])
    sys.stderr.write('Fetching {!r} from lfs...\n'.format(
        os.path.basename(target_path)))
    # If the target already exists and matches size+oid, nothing to do.
    try:
        infile = open(target_path, 'rb')
    except IOError as e:
        if e.errno != errno.ENOENT:
            raise
    else:
        if file_matches(infile, size, oid):
            sys.stderr.write('Lucky! It was already up to date.\n')
            return
    # Ask the lfs batch API where to download the object from.
    req = Request(api_url, json.dumps({
        'operation': 'download',
        'objects': [{
            'oid': oid,
            'size': size,
        }],
    }).encode(), {
        'Accept': JSON_TYPE,
        'Content-Type': JSON_TYPE,
    })
    with contextlib.closing(urlopen(req)) as respfile:
        # WRAP_RESPFILE presumably flags interpreter versions whose
        # json.load needs a text stream — TODO confirm where it is set.
        if WRAP_RESPFILE:
            respfile = io.TextIOWrapper(respfile)
        resp = json.load(respfile)
    url = next(obj['actions']['download']['href']
               for obj in resp['objects'] if obj['oid'] == oid)
    # Stream the object to a temp file next to the target, hashing as we go.
    with contextlib.closing(urlopen(url)) as respfile:
        hasher = hashlib.sha256()
        with tempfile.NamedTemporaryFile(
                dir=os.path.dirname(target_path)) as outfile:
            fish = ProgressFish(total=size)
            fetched = 0
            for chunk in iter(lambda: respfile.read(8192), b''):
                fetched += len(chunk)
                fish.animate(amount=fetched)
                hasher.update(chunk)
                outfile.write(chunk)
            if hasher.hexdigest() != oid:
                raise ValueError('hash failure', hasher.hexdigest(), oid)
            # Atomically move the verified download into place, then
            # recreate the temp path so NamedTemporaryFile's cleanup on
            # __exit__ still finds a file to delete.
            os.rename(outfile.name, target_path)
            open(outfile.name, 'w').close()
def analyze_long_pulse_data_file(filepath, save=0, plot_steps=0, new=1,
                                 starttime=0, endtime=0):
    """Analyze the timeseries of a pulse fish EOD recording.

    Detects and classifies EODs in recordings of weakly electric pulse
    fish (Dexter Frueh, 2018). Results are saved in
    ``workingdirectory/recording/``.

    Input:
      - recording.WAV: the recorded timeseries.
    Optional outputs:
      - eods_recording.npy: numpy array of shape
        (number of EODs, 4 attributes), the attributes being
        x-location (datapoint in recording), y-location (amplitude of the
        positive peak), height (largest peak-trough distance), and class.
      - plots of each analysis step for each block (deltat seconds) of
        the recording.

    Parameters
    ----------
    save : int/bool — save results to a numpy file (may overwrite).
    plot_steps : int/bool — plot results of each analysis step.
    new : int/bool — redo the analysis even if a result .npy exists.
    starttime, endtime : int — optional interval (seconds) to restrict
        the analysis; used only when endtime > starttime >= 0.
    """
    # Function-local imports (kept as in the original script).
    import sys
    import numpy as np
    import copy
    from scipy.stats import gmean
    from scipy import stats
    from scipy import signal
    from scipy import optimize
    import matplotlib
    from fish import ProgressFish
    import matplotlib.pyplot as plt
    from thunderfish.dataloader import open_data
    from thunderfish.peakdetection import detect_peaks
    from scipy.interpolate import interp1d
    from scipy.signal import savgol_filter
    from collections import deque
    import ntpath
    import nixio as nix
    import time
    import os
    from shutil import copy2
    from ownDataStructures import Peak, Tr, Peaklist
    import DextersThunderfishAddition as dta
    from IPython import embed
    # parameters for the analysis
    deltat = 30.0  # seconds of buffer size
    thresh = 0.04  # minimal threshold for peakdetection
    peakwidth = 20  # width of a peak and minimal distance between two EODs
    # basic parameters for thunderfish.dataloader.open_data
    verbose = 0
    channel = 0
    ultimate_threshold = thresh + 0.01
    startblock = 0
    # timeinterval to analyze other than the whole recording
    #starttime = 0
    #endtime = 0
    #timegiven = 0
    home = os.path.expanduser('~')
    os.chdir(home)
    # NOTE(review): these overwrite the function parameters with command
    # line arguments, so the save/plot_steps/new parameters are ignored
    # when called this way — confirm this is intended.
    new = int(sys.argv[4])
    save = int(sys.argv[2])
    plot = int(sys.argv[3])
    starttime = int(starttime)
    endtime = int(endtime)
    timegiven = False
    if endtime > starttime >= 0:
        timegiven = True
    peaks = np.array([])
    troughs = np.array([])
    filename = path_leaf(filepath)
    datasavepath = filename[:-4]  # recording name without the ".WAV" suffix
    proceed = input(
        'Currently operates in home directory. If given a pulsefish '
        'recording filename.WAV, then a folder filename/ will be created '
        'in the home directory and all relevant files will be stored '
        'there. continue? [y/n] ').lower()
    if proceed != 'y':
        quit()
    if not os.path.exists(datasavepath):
        os.makedirs(datasavepath)
    if save == 1:
        print('files will be saved to: ', datasavepath)
    eods_len = 0
    # starting analysis: only run when requested or no previous result exists
    if new == 1 or not os.path.exists(filename[:-4] + "/eods5_" +
                                      filename[:-3] + "npy"):
        # offer to copy the recording next to its result files
        if filepath != home + '/' + datasavepath + '/' + filename:
            print(filepath, datasavepath + '/' + filename)
            proceed = input(
                'Copy datafile to ' + datasavepath +
                ' where all the other files will be stored? [y/n] ').lower()
            if proceed == 'y':
                copy2(filepath, datasavepath)
        # import data
        with open_data(filepath, channel, deltat, 0.0, verbose) as data:
            samplerate = data.samplerate
            nblock = int(deltat * data.samplerate)
            # selected time interval
            if timegiven == True:
                parttime1 = starttime * samplerate
                parttime2 = endtime * samplerate
                data = data[parttime1:parttime2]
            # split data into blocks
            if len(data) % nblock != 0:
                blockamount = len(data) // nblock + 1
            else:
                blockamount = len(data) // nblock
            # progress bar
            print('blockamount: ', blockamount)
            progress = 0
            print(progress, '%', flush=True, end=" ")
            fish = ProgressFish(total=blockamount)
            # blockwise analysis
            for idx in range(0, blockamount):
                blockdata = data[idx * nblock:(idx + 1) * nblock]
                # progressbar
                if progress < (idx * 100 // blockamount):
                    progress = (idx * 100) // blockamount
                    progressstr = ' Filestatus: '
                    fish.animate(amount=idx, dexextra=progressstr)
                # ---analysis----------------------------------------------
                # step1: detect peaks in timeseries
                pk, tr = detect_peaks(blockdata, thresh)
                troughs = tr
                # continue with analysis only if multiple peaks are detected
                if len(pk) > 3:
                    peaks = dta.makeeventlist(pk, tr, blockdata, peakwidth)
                    #dta.plot_events_on_data(peaks, blockdata)
                    peakindices, peakx, peakh = dta.discardnearbyevents(
                        peaks[0], peaks[1], peakwidth)
                    peaks = peaks[:, peakindices]
                    if len(peaks) > 0:
                        # used to connect the results of the current block
                        # with the previous
                        if idx > startblock:
                            peaklist = dta.connect_blocks(peaklist)
                        else:
                            peaklist = Peaklist([])
                        aligned_snips = dta.cut_snippets(blockdata,
                                                         peaks[0], 15,
                                                         int_met="cubic",
                                                         int_fact=10,
                                                         max_offset=1.5)
                        pcs = dta.pc(aligned_snips)  #pc_refactor(aligned_snips)
                        order = 5
                        minpeaks = 3 if deltat < 2 else 10
                        labels = dta.cluster_events(pcs, peaks, order, 0.4,
                                                    minpeaks, False,
                                                    method='DBSCAN')
                        peaks = np.append(peaks, [labels], axis=0)
                        #dta.plot_events_on_data(peaks, blockdata)
                        num = 1
                        if idx > startblock:
                            dta.alignclusterlabels(labels, peaklist, peaks,
                                                   data=blockdata)
                        peaks, peaklist = dta.ampwalkclassify3_refactor(
                            peaks, peaklist)  # classification by amplitude
                        minlen = 6  # >=1
                        peaks = dta.discard_short_classes(peaks, minlen)
                        if len(peaks[0]) > 0:
                            peaks = dta.discard_wave_pulses(peaks, blockdata)
                        # plots the data part and its detected and
                        # classified peaks
                        if plot_steps == True:
                            dta.plot_events_on_data(peaks, blockdata)
                            pass
                        worldpeaks = np.copy(peaks)
                        # change peaks location in the buffered part to the
                        # location relative to the whole recording
                        peaklist.len = nblock
                        # peaklocations relative to whole recording
                        worldpeaks[0] = worldpeaks[0] + (idx * nblock)
                        # NOTE(review): this deletes from `peaks`, not
                        # `worldpeaks`, so the saved x-locations stay
                        # block-relative — the sibling analyze_pulse_data
                        # uses worldpeaks here; confirm which is intended.
                        thisblock_eods = np.delete(peaks, 3, 0)
                        # save the peaks of the current buffered part to a
                        # numpy-memmap on the disk
                        mmpname = "eods_" + filename[:-3] + "npmmp"
                        save_EOD_events_to_npmmp(thisblock_eods, eods_len,
                                                 idx == startblock,
                                                 datasavepath, mmpname)
                        eods_len += len(thisblock_eods[0])
            # after the last buffered part has finished, save the memory
            # mapped numpy file of the detected and classified EODs to a
            # .npy file to the disk
            eods = np.memmap(datasavepath + "/eods_" + filename[:-3] +
                             "npmmp", dtype='float64', mode='r+',
                             shape=(4, eods_len), order='F')
            if save == 1:
                path = datasavepath + "/"
                if not os.path.exists(path):
                    os.makedirs(path)
                if eods_len > 0:
                    print('Saved!')
                    # NOTE(review): uses datasavepath in the filename where
                    # the existence check above used filename[:-3] — the
                    # saved and checked names may never match; verify.
                    np.save(datasavepath + "/eods8_" + datasavepath + "npy",
                            eods)
                else:
                    #np.save(filename[:-4]+"/eods5_"+filename[:-3]+"npy", thisblock_eods)
                    print('not saved')
    else:
        # if there already has been a certain existing result file and
        # 'new' was set to False
        print('already analyzed')
    print(
        'returnes analyzed EODS. Calculate frequencies using all of these but discard the data from the EODS within the lowest few percent of amplitude'
    )
    # NOTE(review): in the 'already analyzed' branch `eods` is never
    # assigned, so this return raises NameError — confirm and fix upstream.
    return eods
# --- truncated fragment: tail of a value-mapping helper; its `def` line and
# the opening `try:` are outside this chunk, so only the body below is
# visible. It coerces x to float, falling back to a `mapping` lookup, and
# returns -1 for values that fit neither. ---
        x = float(x)
    except ValueError as e:
        return -1
    else:
        try:
            x = mapping[x]
        except KeyError as e:
            # print "skipping due to not in mapping"
            return -1
        return x


# Top-level export script (Python 2: cPickle, and `map` must return a list
# for the row assignments below — this would need list(map(...)) on py3).
# Pulls up to LIMIT records from the `coll` collection (presumably a
# MongoDB collection — TODO confirm) into fixed-width feature matrices.
num = coll.count() if coll.count() < LIMIT else LIMIT
X = np.zeros((num, 266))  # NOTE(review): 266/183 look schema-derived — confirm
Y = np.zeros((num, 183))
data = coll.find().limit(LIMIT)
fish = ProgressFish(total=LIMIT)
for i, record in enumerate(data):
    fish.animate(amount=i)
    # x_to_num receives (index, value) pairs; y_to_num bare values.
    X[i] = map(x_to_num, enumerate(record['x']))
    Y[i] = map(y_to_num, record['y'])
# Persist both matrices with cPickle (text mode is py2-only; py3 pickling
# needs binary mode).
data_x = open("DATA_X", "w")
cPickle.dump(X, data_x)
data_x.close()
data_y = open("DATA_Y", "w")
cPickle.dump(Y, data_y)
data_y.close()
def analyze_pulse_data(filepath, absolutepath=True, deltat=30, thresh=0.04,
                       starttime=0, endtime=0, savepath=False, save=False,
                       npmmp=False, plot_steps=False, plot_result=False):
    '''
    Analyze the timeseries of a pulse fish EOD recording.

    Parameters
    ----------
    filepath:
        WAV-file with the recorded timeseries.
    absolutepath: bool, optional
        True to interpret filepath relative to the user's home directory.
    deltat: int, optional
        time for a single analysis block (recommended less than a minute,
        due to principal component clustering on the EOD-waveforms).
    thresh: float, optional
        minimum threshold for the peakdetection (if computing frequencies,
        recommended a tiny bit lower than the wished threshold; instead
        discard the EODs below the wished threshold after computing the
        frequencies for each EOD.)
    starttime: int or str of int, optional
        time into the data from where to start the analysis, in seconds.
    endtime: int or str of int, optional
        time into the data where to end the analysis, in seconds;
        must be larger than starttime to take effect.
    savepath: bool or str, optional
        Where to save results and intermediate results; only needed if
        save or npmmp is True. A string gives a relative path to the
        results directory; False uses the preset savepath ~/filepath/;
        True prompts for the savepath while the script is running.
    save: bool, optional
        True to save the results into a npy file at the savepath.
    npmmp: bool, optional
        True to save intermediate results into a numpy memmap at the
        savepath; only recommended in case of memory overflow.
    plot_steps: bool, optional
        True to plot the results of each analysis block.
    plot_result: bool, optional
        True to plot the results of the final analysis. Not recommended
        for long recordings.

    Returns
    -------
    eods: numpy array
        2D numpy array. First axis: attributes of an EOD
        (x (datapoints), y (recorded voltage), height (difference from
        maximum to minimum), class). Second axis: EODs in chronological
        order.
    '''
    # Function-local imports (kept as in the original script).
    import sys
    import numpy as np
    import copy
    from scipy.stats import gmean
    from scipy import stats
    from scipy import signal
    from scipy import optimize
    import matplotlib
    from fish import ProgressFish
    import matplotlib.pyplot as plt
    from thunderfish.dataloader import open_data
    from thunderfish.peakdetection import detect_peaks
    from scipy.interpolate import interp1d
    from scipy.signal import savgol_filter
    from collections import deque
    import ntpath
    import nixio as nix
    import time
    import os
    from shutil import copy2
    from ownDataStructures import Peak, Tr, Peaklist
    import DextersThunderfishAddition as dta
    from IPython import embed
    # parameters for the analysis
    # NOTE(review): this overwrites the thresh parameter with the default
    # value, so the caller's thresh is ignored — confirm intent.
    thresh = 0.04  # minimal threshold for peakdetection
    peakwidth = 20  # width of a peak and minimal distance between two EODs
    # basic parameters for thunderfish.dataloader.open_data
    verbose = 0
    channel = 0
    ultimate_threshold = thresh + 0.01
    startblock = 0
    # timeinterval to analyze other than the whole recording
    #starttime = 0
    #endtime = 0
    #timegiven = 0
    home = os.path.expanduser('~')
    if absolutepath:
        filepath = home + '/' + filepath
    #os.chdir(home)
    #save = int(save)
    #plot_steps = int(plot_steps)
    starttime = int(starttime)
    endtime = int(endtime)
    timegiven = False
    if endtime > starttime >= 0:
        timegiven = True
    peaks = np.array([])
    troughs = np.array([])
    filename = path_leaf(filepath)
    eods_len = 0
    # resolve where results (and the optional memmap) are stored
    if savepath == False:
        datasavepath = home + '/' + filename[:-4]
    elif savepath == True:
        datasavepath = input(
            'With the option npmmp enabled, a numpy memmap will be saved to: '
        ).lower()
    else:
        datasavepath = savepath
    # refuse to overwrite an existing analysis result
    if save and (
            os.path.exists(datasavepath + "/eods8_" + filename[:-3] + "npy")
            or os.path.exists(datasavepath + "/eods5_" + filename[:-3] +
                              "npy")):
        print(
            'there already exists an analyzed file, aborting. Change the code if you don\'t want to abort'
        )
        quit()
    if npmmp:
        #proceed = input('With the option npmmp enabled, a numpy memmap will be saved to ' + datasavepath + '. continue? [y/n] ').lower()
        proceed = 'y'
        if proceed != 'y':
            quit()
    # starting analysis
    with open_data(filepath, channel, deltat, 0.0, verbose) as data:
        samplerate = data.samplerate
        # selected time interval
        if timegiven == True:
            parttime1 = starttime * samplerate
            parttime2 = endtime * samplerate
            data = data[parttime1:parttime2]
        # split data into blocks
        nblock = int(deltat * samplerate)
        if len(data) % nblock != 0:
            blockamount = len(data) // nblock + 1
        else:
            blockamount = len(data) // nblock
        print('blockamount: ', blockamount)
        progress = 0
        print(progress, '%', flush=True, end=" ")
        fish = ProgressFish(total=blockamount)
        # blockwise analysis
        for idx in range(0, blockamount):
            blockdata = data[idx * nblock:(idx + 1) * nblock]
            # progressbar
            if progress < (idx * 100 // blockamount):
                progress = (idx * 100) // blockamount
                progressstr = ' Filestatus: '
                fish.animate(amount=idx, dexextra=progressstr)
            # step1: detect peaks in timeseries
            pk, tr = detect_peaks(blockdata, thresh)
            troughs = tr
            # continue only if multiple peaks were detected
            if len(pk) > 3:
                peaks = dta.makeeventlist(pk, tr, blockdata, peakwidth)
                peakindices, peakx, peakh = dta.discardnearbyevents(
                    peaks[0], peaks[1], peakwidth)
                peaks = peaks[:, peakindices]
                if len(peaks) > 0:
                    # connect results of the current block with the previous
                    if idx > startblock:
                        peaklist = dta.connect_blocks(peaklist)
                    else:
                        peaklist = Peaklist([])
                    aligned_snips = dta.cut_snippets(blockdata, peaks[0], 15,
                                                     int_met="cubic",
                                                     int_fact=10,
                                                     max_offset=1.5)
                    pcs = dta.pc(aligned_snips)  #pc_refactor(aligned_snips)
                    order = 5
                    minpeaks = 3 if deltat < 2 else 10
                    labels = dta.cluster_events(pcs, peaks, order, 0.4,
                                                minpeaks, False,
                                                method='DBSCAN')
                    peaks = np.append(peaks, [labels], axis=0)
                    #dta.plot_events_on_data(peaks, blockdata)
                    num = 1
                    if idx > startblock:
                        dta.alignclusterlabels(labels, peaklist, peaks,
                                               data=blockdata)
                    peaks, peaklist = dta.ampwalkclassify3_refactor(
                        peaks, peaklist)  # classification by amplitude
                    minlen = 6
                    peaks = dta.discard_short_classes(peaks, minlen)
                    if len(peaks[0]) > 0:
                        peaks = dta.discard_wave_pulses(peaks, blockdata)
                    if plot_steps == True:
                        dta.plot_events_on_data(peaks, blockdata)
                        pass
                    peaklist.len = nblock
                    # peak locations relative to the whole recording
                    worldpeaks = np.copy(peaks)
                    worldpeaks[0] = worldpeaks[0] + (idx * nblock)
                    thisblock_eods = np.delete(worldpeaks, 3, 0)
                    if npmmp:
                        if idx == startblock:
                            if not os.path.exists(datasavepath):
                                os.makedirs(datasavepath)
                            mmpname = "eods_" + filename[:-3] + "npmmp"
                        # save the peaks of the current buffered part to a
                        # numpy-memmap on the disk
                        save_EOD_events_to_npmmp(thisblock_eods, eods_len,
                                                 idx == startblock,
                                                 datasavepath, mmpname)
                        eods_len += len(thisblock_eods[0])
                    else:
                        # NOTE(review): if the first block yields no peaks,
                        # all_eods is unbound here when idx > 0 — confirm
                        # whether the guard should track "first result"
                        # rather than idx.
                        if idx > 0:
                            all_eods = np.concatenate(
                                (all_eods, thisblock_eods), axis=1)
                        else:
                            all_eods = thisblock_eods
    #dta.plot_events_on_data(all_eods,data)
    print(
        'returnes analyzed EODS. Calculate frequencies using all of these but discard the data from the EODS within the lowest few percent of amplitude'
    )
    if npmmp:
        all_eods = np.memmap(datasavepath + '/' + mmpname, dtype='float64',
                             mode='r+', shape=(4, eods_len), order='F')
    # NOTE(review): save is documented as a bool but compared to 1 —
    # works for True == 1, yet inconsistent with the `if save and ...`
    # check above.
    if save == 1:
        path = filename[:-4] + "/"
        if not os.path.exists(path):
            os.makedirs(path)
        if eods_len > 0:
            np.save(datasavepath + "/eods8_" + filename[:-3] + "npy",
                    all_eods)
            print('Saved!')
        else:
            print('not saved')
    return all_eods