def is_in_path(sha):
    """Return True if a file with the given sha is in the file list.

    Arguments:
        sha - the sha of the file
    """
    files = get_files()
    for local_file in files:
        if local_file.get('sha') == sha:
            return True
    return False
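# is_in_path() above assumes a get_files() helper that returns dicts carrying
# a 'sha' key. A minimal sketch under that assumption (the directory walk and
# SHA-1 hashing are illustrative, not taken from the source):
import hashlib
import os


def get_files(root="."):
    files = []
    for dirpath, _dirnames, filenames in os.walk(root):
        for name in filenames:
            path = os.path.join(dirpath, name)
            with open(path, "rb") as handle:
                sha = hashlib.sha1(handle.read()).hexdigest()
            files.append({"path": path, "sha": sha})
    return files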
import queue

# parse_options, get_files and Worker are defined elsewhere in this module.


def main():
    opts, word, args = parse_options()
    filelist = get_files(args[0], opts.recurse)
    work_queue = queue.Queue()
    for i in range(opts.count):
        # Prefix worker output with a thread number only in debug mode.
        number = '{0}:'.format(i + 1) if opts.debug else ""
        worker = Worker(work_queue, word, number)
        worker.daemon = True
        worker.start()
    for filename in filelist:
        work_queue.put(filename)
    work_queue.join()
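# main() above relies on a Worker thread class that is not shown. A plausible
# minimal sketch: a daemon thread that pulls filenames off the shared queue,
# searches them for the word, and calls task_done() so that work_queue.join()
# can return. The grep-like search body is an assumption.
import threading


class Worker(threading.Thread):
    def __init__(self, work_queue, word, number):
        super().__init__()
        self.work_queue = work_queue
        self.word = word
        self.number = number

    def run(self):
        while True:
            filename = self.work_queue.get()
            try:
                self.process(filename)
            finally:
                # Unblocks work_queue.join() once every filename is handled.
                self.work_queue.task_done()

    def process(self, filename):
        # Hypothetical body: report lines containing the search word.
        with open(filename, errors="ignore") as handle:
            for line in handle:
                if self.word in line:
                    print('{0}{1}: {2}'.format(self.number, filename,
                                               line.rstrip()))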
def download_links(channel_name, vods_clips, rename, try_muted, file_path,
                   file_name_vods, file_name_clips):
    """Download clips and/or vods listed in the data file."""
    logger = logging.getLogger(__name__)
    abs_file_path = (Path.cwd() / file_path).resolve()
    if vods_clips in ("vods", "both"):
        logger.info("starting vods download ...")
        get_files.get_files(file_name_vods, rename, "vods", try_muted,
                            loglevel="INFO")
        logger.info(
            f"vods downloaded at '{abs_file_path / channel_name / 'vods'}'\n")
    if vods_clips in ("clips", "both"):
        logger.info("starting clips download ...")
        get_files.get_files(file_name_clips, rename, "clips", loglevel="INFO")
        logger.info(
            f"clips downloaded at '{abs_file_path / channel_name / 'clips'}'\n")
def __init__(self, *args):
    self.project = os.getenv('PROJECT', 'jay_files')
    self.workdir = os.getenv('DESIGN_DIR',
                             os.path.dirname(os.path.abspath(sys.argv[0])))
    self.cwp = os.path.dirname(os.path.abspath(os.path.realpath(sys.argv[0])))
    self.filelist = os.path.join(self.cwp, self.project + ".list")
    self.file_search = get_files(list_name=self.filelist, path=self.workdir)
    self.database = os.path.join(self.cwp, self.project + ".db")
    if args:
        self.filetarget = args
    else:
        self.filetarget = ['c', 'h', 'v', 'vh', 'sv', 'pl', 'py', 'pm',
                           'vhd', 's', 'sh', 'arg']
    print("target file types are: " + " ".join(self.filetarget))
    print("target project is: " + self.project)
    print("target directory: " + self.workdir)
    print("current directory: " + self.cwp)
import os
import subprocess as sp

import config      # local settings module (provides sex_directory)
import get_files   # local helper module


def sex_call():
    """Run SExtractor on every FITS file found by get_files().

    :return: False if no .fit/.FIT files are found, otherwise True
    """
    # Work inside the configured SExtractor data directory.
    os.chdir(config.sex_directory)
    # Command template for the SExtractor invocation.
    script = 'sex -CATALOG_NAME %s %s'
    files = get_files.get_files()
    if not files:
        print('No .fit or .FIT files in /data_in folder.')
        return False
    for i in files:
        cat_name = i.split('.')[0] + '.cat'
        # A non-zero exit status propagates as CalledProcessError.
        sp.check_call(script % (cat_name, i), shell=True)
    return True
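# get_files.get_files() above is assumed to list the .fit/.FIT files in the
# current directory; a minimal sketch under that assumption:
import glob


def get_files():
    return sorted(glob.glob('*.fit') + glob.glob('*.FIT'))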
    all_trials = np.delete(all_trials, erase_trials, axis=1)
    print('shape of all_trials =', all_trials.shape)
    return all_trials


if __name__ == "__main__":
    fs = 30e3  # sampling rate in Hz; module-level, so no 'global' needed

    # data = np.random.rand(int(18e4), 10)
    # length = len(data)
    # time = np.linspace(-0.5 * length / fs, 0.5 * length / fs, length)
    # time = downsample(time, 10)
    # plot_raw_traces(data, time, 18, 12, './psth/channel_13')

    raw_files = get_files('kwd', os.getcwd())  # collect the kwd files
    print('raw files =', raw_files)
    metadata = get_file_metadata(raw_files)  # get their lengths + timestamps
    print('metadata[data_len] =', [meta['data_len'] for meta in metadata])
    print('first =', metadata[0]['data_len'])

    # NOTE: channel numbering starts with 0!
    channels = np.array([15])  # np.arange(32)
    newephys = concatenate_kwd(raw_files, channels)
    print('newephys shape =', newephys.shape)

    # Read the CSV file with stimulus times and orientations.
    stim_data = pd.read_csv('oe_stim_times.csv')
    stim_data.times = np.rint(stim_data.times * fs)
parser.add_argument('--sets', choices=('all', 'train', 'valid', 'test'),
                    help='Which sets to report on: train, valid, test, or all')
opts = parser.parse_args()

ROOT_DIR = opts.root_dir
ROUNDNAME = opts.round
Round = int(ROUNDNAME[-1])
maxPoint = opts.maxPoint
penality = opts.penality
trainid = opts.trainid
SET = opts.sets
print('Report of ' + SET + ' set')

train_sets, valid_sets, test_sets = get_files(ROOT_DIR, Round)
if SET == 'all':
    sets = np.concatenate((train_sets, valid_sets, test_sets))
elif SET == 'train':
    sets = train_sets
elif SET == 'valid':
    sets = valid_sets
elif SET == 'test':
    sets = test_sets

RealNum25 = 0
RealNum50 = 0
RealNum100 = 0
ground_truth = 0
if update and ((not search_dir) or (not ana_type)):
    raise ValueError("Need directory and ana_type for update")
if (not update) and ((not search_type) or (not target_name)):
    raise ValueError("Need search_type and target_name")

if ana_type == "sv":
    file_type = ("sv", "svh")
    table_name = "classes"
    ana_len = 3
else:
    raise ValueError("ana_type '%s' is not supported yet" % ana_type)

if update:
    if not os.path.isdir(search_dir):
        raise ValueError("Illegal directory: %s" % search_dir)
    jay_get_files = get_files(list_name=temp_file)
    jay_get_files.search_files(*file_type, path=search_dir)
    db_file = os.path.join(work_dir, "db", db_files[0])
    jay_analysis_files = analysis_files(output_db=db_file)
    jay_analysis_files.update_db.set_verbose()
    jay_analysis_files.update_db(arg_table_name=table_name,
                                 arg_ana_type=ana_type,
                                 arg_file_list=temp_file,
                                 arg_ana_len=ana_len)
else:
    jay_search_item = search_item()
    for db_i in db_files:
        db_file = os.path.join(work_dir, "db", db_i)
        jay_search_item.config(target_type=ana_type, db=db_file,
                               table=table_name, level=1, open_result=True)
        jay_search_item.patch_search(target_name, target_type=ana_type,
                                     search_type=search_type)
        # Stop at the first database that yields a hit.
        if jay_search_item.hit:
            break
def find_foreign_keys(data_source, data_sample):
    foreign_keys = {}
    # get information needed to read files
    dat_path, dat_source, sample_data, files = gf.get_files(
        data_source, data_sample)
    json_path = os.path.join(
        os.path.normpath(dat_path + os.sep + os.pardir), 'results')
    json_name_primary = data_sample + '_primarykeys' + '.json'
    # get the primary keys from the json file created with the primary key
    # detection module
    with open(os.path.join(json_path, json_name_primary)) as file:
        primary_keys = json.load(file)
    # extract the files that contain primary keys
    keys_files = list(primary_keys.keys())
    # loop through the files that do not contain primary keys
    for f in list(files):
        if f not in keys_files:
            dat_foreign = pd.read_csv(os.path.join(sample_data, f),
                                      na_values=missing_values,
                                      low_memory=False,
                                      encoding='latin1')
            # get the columns
            columns = dat_foreign.columns
            # extract the primary key values from the primary key dictionary
            for key in keys_files:
                key_files_columns = primary_keys[key].keys()
                for key_column in key_files_columns:
                    temp_primary_keys = primary_keys[key][key_column]
                    # loop through the columns and get unique values
                    for col in columns:
                        temp_foreign_keys = dat_foreign[col].unique()
                        # find the proportion of primary key values that also
                        # appear among the candidate foreign key values
                        foreign_key_bool = set(
                            temp_foreign_keys).intersection(temp_primary_keys)
                        prop_foreign = len(foreign_key_bool) / len(
                            temp_primary_keys)
                        # if the proportion > 0, we can infer a foreign key
                        if prop_foreign > 0:
                            foreign_keys.setdefault(f, {})[col] = prop_foreign
    # create the name for the json file
    json_name_foreign = data_sample + '_foreignkeys' + '.json'
    # write the json file
    with open(os.path.join(json_path, json_name_foreign), 'w') as file:
        json.dump(foreign_keys, file, cls=ne.NumpyEncoder, indent=2)
    # return the foreign keys
    return foreign_keys
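# A toy check of the inclusion test used above: prop_foreign is the fraction
# of a file's primary key values that also occur in the candidate foreign key
# column (values below are illustrative):
temp_primary_keys = [1, 2, 3, 4]
temp_foreign_keys = [2, 4, 9]
foreign_key_bool = set(temp_foreign_keys).intersection(temp_primary_keys)
prop_foreign = len(foreign_key_bool) / len(temp_primary_keys)  # 2 / 4 = 0.5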
import sys
import time

import numpy as np
import pandas as pd
import xarray as xr

import get_files as gf
import open_tropomi as ot

# VARIABLES
year = 2020
# month = 5
# day = 5
week_num = 17

##################

# Load files
start, end, calendar_week = gf.get_files(year=year, calendar_week=week_num)

try:
    file_list = open('inventory.txt', 'r')
except OSError:
    print('Did not find a text file containing file names '
          '(perhaps the name does not match)')
    sys.exit()

ds_list = []
startiest_time = time.time()
for test_file in file_list:
    test_file = test_file.strip()
    start_time = time.time()
    ds_list.append(ot.dsread(test_file))
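# gf.get_files(year=..., calendar_week=...) above is assumed to map an ISO
# calendar week onto its start and end dates (and presumably also builds
# inventory.txt); a minimal sketch of just the date mapping:
import datetime


def get_files(year, calendar_week):
    start = datetime.date.fromisocalendar(year, calendar_week, 1)  # Monday
    end = start + datetime.timedelta(days=6)                       # Sunday
    return start, end, calendar_week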
    for idx, unique_exp in enumerate(uni_exps):
        if idx > 0:
            spike_times[all_exps == unique_exp] += experiment_lengths[idx]
    print('spikes shape =', spikes[clust_name]['times'][0][0][0][0].shape)
    print('uni times =',
          spikes[clust_name]['times'][0][0][0][0][0, uni_exps_indices])
    print('unique exp nums =', uni_exps_indices)
    return spike_times


if __name__ == "__main__":
    if len(sys.argv) > 1:
        mat_files = [sys.argv[1]]
    else:
        # Fall back to every .mat file in the current directory.
        mat_files = get_files('mat', os.getcwd())
    print(mat_files)
    for file in mat_files:
        spikes = sio.loadmat(file)
        clust_name = sio.whosmat(file)[0][0]
        print('whosmat!!', spikes[clust_name].shape)
        plot_waveforms(spikes, clust_name)
        # spike_trials(spikes, clust_name)
def main():
    args = parse_args()
    init_log(args.log)
    with open(args.app) as app_file:
        # parse and validate the requested data application JSON file
        application = Application(app_file)
    logging.info("Input data application parsed: {}".format(args.app))
    # Create output directory for the results
    application_dir = create_app_dir(application)
    # check what data types are allowed for this application
    allowed_data_types = application.allowed_data_types()
    logging.info("Allowed data types: {}".format(
        ' '.join(allowed_data_types)))
    if len(allowed_data_types) > 0:
        # Get all the sample metadata for all requested cohorts
        requested_cohorts = application.cohorts()
        metadata = Metadata(args.data, requested_cohorts)
        logging.info("Metadata collected for requested cohorts: {}".format(
            ' '.join(requested_cohorts)))
        metadata_sample_ids = sorted(metadata.get_sample_ids())
        logging.info("Metadata for sample IDs: {}".format(
            ' '.join(metadata_sample_ids)))
        # Filter the sample metadata based on patient consent
        metadata.filter_consent(args.consent, allowed_data_types)
        logging.warning("Consent not handled yet. FIXME")
        # Find all the file paths for requested file types for each
        # consented sample
        requested_file_types = application.file_types()
        logging.info("Requested file types: {}".format(
            ' '.join(requested_file_types)))
        fastqs, bams, bais, vcfs = get_files(args.data, requested_file_types,
                                             metadata)
        logging.info("VCF files selected:\n{}".format('\n'.join(vcfs)))
        logging.info("BAM files selected:\n{}".format('\n'.join(bams)))
        logging.info("BAI files selected:\n{}".format('\n'.join(bais)))
        logging.info("FASTQ files selected:\n{}".format('\n'.join(fastqs)))
        output_files = []
        if 'Anonymised' in allowed_data_types:
            # generate random IDs for all output samples
            randomised_ids = make_random_ids(args.usedids,
                                             metadata.sample_ids)
            metadata.anonymise(randomised_ids)
            metadata.write(args.metaout)
            logging.info("Anonymised metadata written to: {}".format(
                args.metaout))
            new_vcfs = anonymise_files(vcfs, randomised_ids, application_dir,
                                       VCF_filename, vcf_edit)
            new_bams = anonymise_files(bams, randomised_ids, application_dir,
                                       BAM_filename, bam_edit)
            # BAIs and FASTQs are just sym-linked to output with randomised
            # names
            new_bais = anonymise_files(bais, randomised_ids, application_dir,
                                       BAI_filename)
            new_fastqs = anonymise_files(fastqs, randomised_ids,
                                         application_dir, FASTQ_filename)
            output_files.extend(new_vcfs + new_bams + new_bais + new_fastqs)
            logging.info("Output files are anonymised")
        elif 'Re-identifiable' in allowed_data_types:
            new_links = link_files(application_dir,
                                   vcfs + bams + bais + fastqs)
            output_files.extend(new_links)
            logging.info(
                "Files linked in directory: {}".format(application_dir))
            metadata.write(args.metaout)
            logging.info("Output files are re-identifiable")
        else:
            print_error(
                "Allowed data is neither anonymised nor re-identifiable")
            exit(ERROR_BAD_ALLOWED_DATA)
        logging.info("Generating MD5 checksums on output files")
        md5_files(args.md5, output_files)
    else:
        logging.warning("No data available for this application")
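# make_random_ids(args.usedids, metadata.sample_ids) above is not shown; a
# plausible sketch, assuming it maps every sample ID to a fresh random ID that
# collides neither with previously used IDs nor with each other:
import random


def make_random_ids(used_ids, sample_ids, width=8):
    used = set(used_ids)
    mapping = {}
    for sample_id in sample_ids:
        candidate = '{0:0{1}d}'.format(random.randrange(10 ** width), width)
        while candidate in used:
            candidate = '{0:0{1}d}'.format(random.randrange(10 ** width),
                                           width)
        used.add(candidate)
        mapping[sample_id] = candidate
    return mapping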
def find_primary_keys(data_source, data_sample):
    # define an empty dict for the keys
    primary_keys = {}
    # get information needed to read files
    dat_path, dat_source, sample_data, files = gf.get_files(
        data_source, data_sample)
    # read in each file and attempt to find any primary keys
    for f in files:
        if os.path.splitext(f)[1] == '.csv':
            # define an empty dictionary for the columns
            cols = {}
            # read in each csv file
            dat = pd.read_csv(os.path.join(sample_data, f),
                              na_values=missing_values, low_memory=False)
            # get the total number of rows
            rows = dat.shape[0]
            # get list of columns
            columns = dat.columns
            # loop through each column and determine the proportion of
            # unique values
            for col in columns:
                # try to convert potential date columns
                # if dat[col].dtype == 'object':
                #     try:
                #         dat[col] = pd.to_datetime(dat[col])
                #     except:
                #         pass
                unique_vals = list(dat[col].unique())
                unique_vals_length = len(dat[col].unique())
                proportion_unique = unique_vals_length / rows
                # a proportion of exactly 1 implies a primary key; the
                # slightly lower threshold allows for data errors
                if proportion_unique >= .95:
                    cols[col] = unique_vals
            primary_keys[f] = cols
    # create the file path to write a json file of the results to
    json_path = os.path.join(
        os.path.normpath(dat_path + os.sep + os.pardir), 'results')
    # create the name for the json file
    json_name = data_sample + '_primarykeys' + '.json'
    # write the json file
    with open(os.path.join(json_path, json_name), 'w') as file:
        json.dump(primary_keys, file, cls=ne.NumpyEncoder, indent=2)
    # return the keys
    return primary_keys
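# find_foreign_keys() (defined earlier) reads the '<sample>_primarykeys.json'
# file that find_primary_keys() writes, so the two run in this order. A
# hypothetical invocation ('open_data' and 'census' are illustrative names):
primary_keys = find_primary_keys('open_data', 'census')
foreign_keys = find_foreign_keys('open_data', 'census')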
def find_relationships(data_source, data_sample):
    # ignore_columns is assumed to be a module-level list shared across calls
    wharf_dict = {}
    # get information needed to read files
    dat_path, dat_source, sample_data, files = gf.get_files(
        data_source, data_sample)
    # read in each file and attempt to find any column relationships
    for f in files:
        # read in each csv file
        dat = pd.read_csv(os.path.join(sample_data, f),
                          na_values=missing_values)
        dat.columns = dat.columns.str.replace(' ', '_')
        rows = dat.shape[0]
        # find the cardinality of each column
        for col in dat.columns:
            # find potential keys
            unique_vals = len(dat[col].unique())
            proportion_unique = unique_vals / rows
            # if the column is a key, ignore it
            if proportion_unique == 1 and col not in ignore_columns:
                ignore_columns.append(col)
            # get the column type
            # col_type = dat[col].dtypes
            # ignore columns that don't appear to be boolean integers
            # if str(col_type) in ['float', 'float64', 'int', 'int64'] and \
            #         unique_vals > 2 and col not in ignore_columns:
            #     ignore_columns.append(col)
        # get all column combinations
        column_combinations = permutations(dat.columns, 2)
        # extract column names and get them into list form
        column_combo_lists = [' '.join(i) for i in column_combinations]
        column_combo_lists0 = list(OrderedDict.fromkeys([
            i.split()[0] for i in column_combo_lists
            if i.split()[0] not in ignore_columns
        ]))
        column_combo_lists1 = list(OrderedDict.fromkeys([
            i.split()[1] for i in column_combo_lists
            if i.split()[1] not in ignore_columns
        ]))
        # sort the lists
        column_combo_lists0.sort()
        column_combo_lists1.sort()
        # create an empty matrix
        # wharf_matrix = np.zeros((len(column_combo_lists0),
        #                          len(column_combo_lists0)))
        # fill nan values
        # dat.fillna(0)
        # compute the wharf coefficient for each column combination;
        # iterate over a copy so that removing processed columns below
        # does not skip entries in the list being traversed
        for col1 in list(column_combo_lists0):
            for col2 in column_combo_lists1:
                if col1 == col2:
                    continue
                if col2 in column_combo_lists0:
                    # compute the wharf coefficient
                    w = (dat.groupby([col1, col2])[col2].count().max(
                        level=0)).sum() / dat.shape[0]
                    # add the column combination to the dictionary
                    if w >= .9:
                        wharf_dict[col1 + '|||' + col2] = w
            # remove col1 so no duplicate (reversed) pairs are returned
            column_combo_lists0.remove(col1)
    # create the name for the json file
    json_name = data_sample + '_relationships' + '.json'
    # find the file path for the json file to be written to
    json_path = os.path.join(
        os.path.normpath(dat_path + os.sep + os.pardir), 'results')
    # write the json file
    with open(os.path.join(json_path, json_name), 'w') as file:
        json.dump(wharf_dict, file, indent=2)
    return wharf_dict
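# A toy check of the coefficient computed above: for each value of col1, take
# the size of its largest col2 group, sum those maxima, and divide by the row
# count; w == 1.0 exactly when col2 is functionally determined by col1.
# (.groupby(level=0).max() is the modern spelling of the snippet's
# .max(level=0), which newer pandas versions have removed.)
import pandas as pd

toy = pd.DataFrame({'a': [1, 1, 2, 2], 'b': ['x', 'x', 'y', 'z']})
w = (toy.groupby(['a', 'b'])['b'].count()
        .groupby(level=0).max()).sum() / toy.shape[0]
print(w)  # 0.75: 'b' is not fully determined by 'a'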