Example #1
def is_in_path(sha):
    """Returns boolean
    Arguments:
        sha - the sha of the file
    """
    files = get_files()
    for local_file in files:
        if local_file.get('sha') == sha:
            return True
    return False
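For this example, get_files() is presumably returning a list of dicts that each carry a 'sha' entry; a hypothetical call, with a made-up sha value, might look like this:

# hypothetical usage sketch; the sha value below is invented for illustration
if is_in_path('3f786850e387550fdab836ed7e6dc881de23001b'):
    print('a file with this sha already exists')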
Example #2
def main():
    opts, word, args = parse_options()
    filelist = get_files(args[0], opts.recurse)
    work_queue = queue.Queue()
    for i in range(opts.count):
        number = '{0}:'.format(i + 1) if opts.debug else ""
        worker = Worker(work_queue, word, number)
        worker.daemon = True
        worker.start()
    for filename in filelist:
        work_queue.put(filename)
    work_queue.join()
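The Worker class used above is not shown in this example; a minimal sketch of what it plausibly looks like (a thread that pulls filenames off the queue, searches them for word, and marks each item done so work_queue.join() can return) is given below. The class body is an assumption, not the original code.

import threading


class Worker(threading.Thread):
    """Hypothetical worker: drains filenames from the queue and searches for `word`."""

    def __init__(self, work_queue, word, number):
        super().__init__()
        self.work_queue = work_queue
        self.word = word
        self.number = number

    def run(self):
        while True:
            filename = self.work_queue.get()
            try:
                self.process(filename)
            finally:
                self.work_queue.task_done()  # lets work_queue.join() return

    def process(self, filename):
        with open(filename, errors="ignore") as handle:
            for lineno, line in enumerate(handle, 1):
                if self.word in line:
                    print("{0}{1}:{2}: {3}".format(
                        self.number, filename, lineno, line.rstrip()))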
Example #3
def download_links(channel_name, vods_clips, rename, try_muted, file_path,
                   file_name_vods, file_name_clips):
    """Download clips and/or vods listed in the data file"""
    logger = logging.getLogger(__name__)
    abs_file_path = (Path.cwd() / file_path).resolve()
    if vods_clips == "vods" or vods_clips == "both":
        logger.info("starting vods download ...")
        get_files.get_files(file_name_vods,
                            rename,
                            "vods",
                            try_muted,
                            loglevel="INFO")
        logger.info(
            f"vods downloaded at '{abs_file_path / channel_name / 'vods'}'\n")

    if vods_clips == "clips" or vods_clips == "both":
        logger.info("starting clips download ...")
        get_files.get_files(file_name_clips, rename, "clips", loglevel="INFO")
        logger.info(
            f"clips downloaded at '{abs_file_path / channel_name / 'clips'}'\n"
        )
Example #4
def __init__(self, *args):
    self.project = os.getenv('PROJECT', 'jay_files')
    self.workdir = os.getenv('DESIGN_DIR', os.path.dirname(os.path.abspath(sys.argv[0])))
    self.cwp = os.path.dirname(os.path.abspath(os.path.realpath(sys.argv[0])))
    self.filelist = self.project + ".list"
    self.filelist = os.path.join(self.cwp, self.filelist)
    self.file_search = get_files(list_name=self.filelist, path=self.workdir)
    self.database = self.project + ".db"
    self.database = os.path.join(self.cwp, self.database)
    if args:
        self.filetarget = args
    else:
        self.filetarget = ['c', 'h', 'v', 'vh', 'sv', 'pl', 'py', 'pm', 'vhd', 's', 'sh', 'arg']
    print("target file types are: " + " ".join(self.filetarget))
    print("target project is: " + self.project)
    print("target directory: " + self.workdir)
    print("current directory: " + self.cwp)
Example #5
def sex_call():
    """
    Runs sextractor on all members of input list1.

    :param list1: list of fits/FITS files in cwd
    :return: False if subprocess fails, otherwise returns True
    """
    # change to the SExtractor working directory defined in config
    os.chdir(config.sex_directory)
    # command template for the SExtractor call
    script = 'sex -CATALOG_NAME %s %s'
    files = get_files.get_files()
    if not files:
        print('No .fit or .FIT files in /data_in folder.')
        return False

    for i in files:
        cat_name = i.split('.')[0] + '.cat'
        # a non-zero exit status raises CalledProcessError to the caller
        sp.check_call(script % (cat_name, i), shell=True)
    return True
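One caveat in the loop above: i.split('.')[0] truncates filenames that contain extra dots. A small alternative sketch (not from the original code) that strips only the final extension:

import os

def catalog_name(fits_name):
    # Replace only the final extension, e.g. 'ngc.1234.fit' -> 'ngc.1234.cat',
    # whereas fits_name.split('.')[0] + '.cat' would give 'ngc.cat'.
    stem, _ext = os.path.splitext(fits_name)
    return stem + '.cat'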
Example #6
    all_trials = np.delete(all_trials, erase_trials, axis=1)
    print('shape of all_trials =', all_trials.shape)
    return all_trials


if __name__ == "__main__":
    fs = 30e3  # sampling rate in Hz ('global' is redundant at module level)

    #data = np.random.rand(18e4,10)
    #length = len(data)
    #time = np.linspace(-0.5*length/fs,0.5*length/fs,length)
    #time = downsample(time, 10)
    #plot_raw_traces(data,time,18,12,'./psth/channel_13')

    raw_files = get_files('kwd', os.getcwd())  # collect the kwd files
    print('raw files =', raw_files)
    metadata = get_file_metadata(raw_files)  # get their lengths + timestamps

    print('metadata[data_len] =', [meta['data_len'] for meta in metadata])
    print('first =', metadata[0]['data_len'])

    ######### !!!!! channel numbering starts with 0 !!!!!!!!!###############
    channels = np.array([15])  #  np.arange(32) #
    newephys = concatenate_kwd(raw_files, channels)
    print('newephys shape =', newephys.shape)

    stim_data = pd.read_csv(
        'oe_stim_times.csv'
    )  # read the CSV file with stimulus times and orientations
    stim_data.times = np.rint(stim_data.times * fs)
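The commented-out lines earlier reference a downsample helper that is not shown here; assuming it is simple decimation (keep every n-th sample), a minimal sketch would be:

import numpy as np

def downsample(signal, factor):
    """Hypothetical decimation helper: keep every `factor`-th sample.
    (The original helper is not shown; proper decimation would low-pass
    filter first, e.g. with scipy.signal.decimate.)"""
    return np.asarray(signal)[::factor]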
Example #7
parser.add_argument('--sets',
                    choices=('all', 'train', 'valid', 'test'),
                    help='Which sets to report on: all, train, valid, or test')
opts = parser.parse_args()

ROOT_DIR = opts.root_dir
ROUNDNAME = opts.round
Round = int(ROUNDNAME[-1])
maxPoint = opts.maxPoint
penality = opts.penality
trainid = opts.trainid
SET = opts.sets

print('Report of ' + SET + ' set')

train_sets, valid_sets, test_sets = get_files(ROOT_DIR, Round)
if SET == 'all':
    sets = np.concatenate((train_sets, valid_sets, test_sets))
elif SET == 'train':
    sets = train_sets
elif SET == 'valid':
    sets = valid_sets
elif SET == 'test':
    sets = test_sets

RealNum25 = 0
RealNum50 = 0
RealNum100 = 0
ground_truth = 0
Example #8
if update and ((not search_dir) or (not ana_type)):
    raise ValueError("Need directory and ana_type for update")
if (not update) and ((not search_type) or (not target_name)):
    raise ValueError("Need search_type and target_name")

if ana_type == "sv":
    file_type = ("sv", "svh")
    table_name = "classes"
    ana_len = 3
else:
    raise ValueError("analysis type not supported yet")

if update:
    if not os.path.isdir(search_dir):
        raise ValueError("illegal directory: " + search_dir)
    jay_get_files = get_files(list_name=temp_file)
    jay_get_files.search_files(*file_type, path=search_dir)

    db_file = os.path.join(work_dir, "db", db_files[0])
    jay_analysis_files = analysis_files(output_db=db_file)
    jay_analysis_files.update_db.set_verbose()
    jay_analysis_files.update_db(arg_table_name=table_name,
                                 arg_ana_type=ana_type,
                                 arg_file_list=temp_file,
                                 arg_ana_len=ana_len)
else:
    jay_search_item = search_item()
    for db_i in db_files:
        db_file = os.path.join(work_dir, "db", db_i)
        jay_search_item.config(target_type=ana_type, db=db_file,
                               table=table_name, level=1, open_result=True)
        jay_search_item.patch_search(target_name, target_type=ana_type,
                                     search_type=search_type)
        if jay_search_item.hit:
            break
Example #9
def find_foreign_keys(data_source, data_sample):

    foreign_keys = {}

    # get information needed to read files
    dat_path, dat_source, sample_data, files = gf.get_files(
        data_source, data_sample)
    json_path = os.path.join(os.path.normpath(dat_path + os.sep + os.pardir),
                             'results')
    json_name_primary = data_sample + '_primarykeys' + '.json'

    # get the primary keys from the json file created with the primary key detection module
    with open(os.path.join(json_path, json_name_primary)) as file:
        primary_keys = json.load(file)

    # extract the primary keys (files with primary keys)
    keys_files = list(primary_keys.keys())

    # loop through the files that do not contain primary keys
    for f in list(files):

        if f not in keys_files:
            dat_foreign = pd.read_csv(os.path.join(sample_data, f),
                                      na_values=missing_values,
                                      low_memory=False,
                                      encoding='latin1')

            # get the columns
            columns = dat_foreign.columns

            # extract the primary key values from the primary key dictionary
            for key in keys_files:

                key_files_columns = primary_keys[key].keys()

                for key_column in key_files_columns:

                    temp_primary_keys = primary_keys[key][key_column]

                    # loop through the columns and get unique values:
                    for col in columns:

                        temp_foreign_keys = dat_foreign[col].unique()

                        # find the proportion of potential foreign key values that are in the list of primary key values
                        foreign_key_bool = set(temp_foreign_keys).intersection(
                            temp_primary_keys)
                        prop_foreign = len(foreign_key_bool) / len(
                            temp_primary_keys)

                        # if the proportion > 0, we can infer foreign key
                        if prop_foreign > 0:
                            foreign_keys.setdefault(f, {})[col] = prop_foreign

    # create the name for the json file
    json_name_foreign = data_sample + '_foreignkeys' + '.json'

    # write the json file
    with open(os.path.join(json_path, json_name_foreign), 'w') as file:
        json.dump(foreign_keys, file, cls=ne.NumpyEncoder, indent=2)

    # return the foreign keys
    return foreign_keys
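The heart of the loop above is the overlap ratio between a candidate column and a known primary-key column; a tiny synthetic illustration of that calculation (values invented for this sketch) is:

primary_key_values = ['A1', 'A2', 'A3', 'A4']   # values of a detected primary key
candidate_column = ['A1', 'A1', 'A3', 'Z9']     # column being tested as a foreign key

overlap = set(candidate_column).intersection(primary_key_values)
prop_foreign = len(overlap) / len(primary_key_values)
print(prop_foreign)  # 0.5 -> greater than 0, so the column is flagged as a potential foreign key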
Example #10
import open_tropomi as ot
import get_files as gf
import numpy as np
import xarray as xr
import pandas as pd
import sys
import time

# VARIABLES 
year = 2020
# month = 5
# day = 5
week_num = 17

##################
# Load files
start, end, calendar_week = gf.get_files(year=year, calendar_week=week_num)
 
try:
    file_list = open('inventory.txt', 'r')
except OSError:
    print('Did not find a text file containing file names (perhaps the name does not match)')
    sys.exit()

ds_list = []

startiest_time = time.time() 

for test_file in file_list:
    test_file = test_file.strip()
    start_time = time.time()
    ds_list.append(ot.dsread(test_file))
Example #11
    for idx, unique_exp in enumerate(uni_exps):
        if idx > 0:
            spike_times[all_exps == unique_exp] += experiment_lengths[idx]

    print('spikes shape =', spikes[clust_name]['times'][0][0][0][0].shape)
    print('uni times =',
          spikes[clust_name]['times'][0][0][0][0][0, uni_exps_indices])
    print('unique exp nums =', uni_exps_indices)

    return spike_times


if __name__ == "__main__":

    if len(sys.argv) < 2:
        mat_files = get_files('mat', os.getcwd())
    elif len(sys.argv) > 1:
        mat_files = [sys.argv[1]]
    else:
        print('hwwwhat?')

    print(mat_files)

    for file in mat_files:

        spikes = sio.loadmat(file)

        clust_name = sio.whosmat(file)[0][0]
        print('whosmat!!', spikes[clust_name].shape)
        plot_waveforms(spikes, clust_name)
        #spike_trials(spikes,clust_name)
Example #12
def main():
    args = parse_args()
    init_log(args.log)
    with open(args.app) as app_file:
        # parse and validate the requested data application JSON file
        application = Application(app_file)
        logging.info("Input data application parsed: {}".format(args.app))
        # Create output directory for the results
        application_dir = create_app_dir(application)
        # check what data types are allowed for this application
        allowed_data_types = application.allowed_data_types()
        logging.info("Allowed data types: {}".format(
            ' '.join(allowed_data_types)))
        if len(allowed_data_types) > 0:
            # Get all the sample metadata for all requested cohorts
            requested_cohorts = application.cohorts()
            metadata = Metadata(args.data, requested_cohorts)
            logging.info("Metadata collected for requested cohorts: {}".format(
                ' '.join(requested_cohorts)))
            metadata_sample_ids = sorted(metadata.get_sample_ids())
            logging.info("Metadata for sample IDs: {}".format(
                ' '.join(metadata_sample_ids)))
            # Filter the sample metadata based on patient consent
            metadata.filter_consent(args.consent, allowed_data_types)
            logging.warning("Consent not handled yet. FIXME")
            # Find all the file paths for requested file types for each
            # consented sample
            requested_file_types = application.file_types()
            logging.info("Requested file types: {}".format(
                ' '.join(requested_file_types)))
            fastqs, bams, bais, vcfs = get_files(args.data,
                                                 requested_file_types,
                                                 metadata)
            logging.info("VCF files selected:\n{}".format('\n'.join(vcfs)))
            logging.info("BAM files selected:\n{}".format('\n'.join(bams)))
            logging.info("BAI files selected:\n{}".format('\n'.join(bais)))
            logging.info("FASTQ files selected:\n{}".format('\n'.join(fastqs)))
            output_files = []
            if 'Anonymised' in allowed_data_types:
                # generate random IDs for all output samples
                randomised_ids = make_random_ids(args.usedids,
                                                 metadata.sample_ids)
                metadata.anonymise(randomised_ids)
                metadata.write(args.metaout)
                logging.info("Anonymised metadata written to: {}".format(
                    args.metaout))
                new_vcfs = anonymise_files(vcfs, randomised_ids,
                                           application_dir, VCF_filename,
                                           vcf_edit)
                new_bams = anonymise_files(bams, randomised_ids,
                                           application_dir, BAM_filename,
                                           bam_edit)
                # BAIs and FASTQs are just sym-linked to output with randomised name
                new_bais = anonymise_files(bais, randomised_ids,
                                           application_dir, BAI_filename)
                new_fastqs = anonymise_files(fastqs, randomised_ids,
                                             application_dir, FASTQ_filename)
                output_files.extend(new_vcfs + new_bams + new_bais +
                                    new_fastqs)
                logging.info("Output files are anonymised")
            elif 'Re-identifiable' in allowed_data_types:
                new_links = link_files(application_dir,
                                       vcfs + bams + bais + fastqs)
                output_files.extend(new_links)
                logging.info(
                    "Files linked in directory: {}".format(application_dir))
                metadata.write(args.metaout)
                logging.info("Output files are re-identifiable")
            else:
                print_error(
                    "Allowed data is neither anonymised nor re-identifiable")
                exit(ERROR_BAD_ALLOWED_DATA)
            logging.info("Generating MD5 checksums on output files")
            md5_files(args.md5, output_files)
        else:
            logging.warning("No data available for this application")
Example #13
def find_primary_keys(data_source, data_sample):

    # define an empty dict for the keys
    primary_keys = {}

    # get information needed to read files
    dat_path, dat_source, sample_data, files = gf.get_files(
        data_source, data_sample)

    # read in each file and attempt to find any primary keys
    for f in files:

        if os.path.splitext(f)[1] == '.csv':

            # define an empty dictionary for the columns
            cols = {}

            # read in each csv file
            dat = pd.read_csv(os.path.join(sample_data, f),
                              na_values=missing_values,
                              low_memory=False)

            # get the total number of rows
            rows = dat.shape[0]

            # get list of columns
            columns = dat.columns

            # print(f)
            # print(rows)

            # loop through each column and determine the proportion of unique values
            for col in columns:

                # try to convert potential date columns
                # if dat[col].dtype == 'object':
                # try:
                # dat[col] = pd.to_datetime(dat[col])
                # except:
                # ""

                unique_vals = list(dat[col].unique())
                unique_vals_length = len(dat[col].unique())
                proportion_unique = unique_vals_length / rows

                # if the proportion == 1, we can infer a primary key
                # (i.e. if proportion_unique == 1); as an alternative, allow
                # for possible data errors and lower the threshold slightly
                if proportion_unique >= .95:
                    cols[col] = unique_vals
                    primary_keys[f] = cols

                # print(f)
                # print(unique_vals)
                # print(proportion_unique)

    # create the file path to write a json file of the results to
    json_path = os.path.join(os.path.normpath(dat_path + os.sep + os.pardir),
                             'results')

    # create the name for the json file
    json_name = data_sample + '_primarykeys' + '.json'

    # write the json file
    with open(os.path.join(json_path, json_name), 'w') as file:
        json.dump(primary_keys, file, cls=ne.NumpyEncoder, indent=2)

    # return the keys
    return primary_keys
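The primary-key heuristic reduces to the share of unique values per column; a short synthetic illustration of the same check (data invented for this sketch):

import pandas as pd

dat = pd.DataFrame({'id': [1, 2, 3, 4, 5],
                    'city': ['Oslo', 'Oslo', 'Bergen', 'Oslo', 'Bergen']})
rows = dat.shape[0]
for col in dat.columns:
    proportion_unique = len(dat[col].unique()) / rows
    print(col, proportion_unique)  # id -> 1.0 (candidate key), city -> 0.4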
Example #14
def find_relationships(data_source, data_sample):

    wharf_dict = {}

    # get information needed to read files
    dat_path, dat_source, sample_data, files = gf.get_files(
        data_source, data_sample)

    # read in each file and attempt to find any primary keys
    for f in files:

        # read in each csv file
        dat = pd.read_csv(os.path.join(sample_data, f),
                          na_values=missing_values)
        dat.columns = dat.columns.str.replace(' ', '_')

        # import the data
        rows = dat.shape[0]

        # find the cardinality of each column
        for col in dat.columns:

            # find potential keys
            unique_vals = len(dat[col].unique())
            proportion_unique = unique_vals / rows

            # if the column is a key, ignore it
            if proportion_unique == 1 and col not in ignore_columns:
                ignore_columns.append(col)

            # get the column type
            # col_type = dat[col].dtypes

            # ignore columns that don't appear to be boolean integers
            # if str(col_type) in ['float', 'float64', 'int', 'int64'] and unique_vals > 2 and col not in ignore_columns:
            # ignore_columns.append(col)

        # get all column combinations
        column_combinations = permutations(dat.columns, 2)

        # extract column names and get them into list form
        column_combo_lists = [' '.join(i) for i in column_combinations]
        column_combo_lists0 = list(
            OrderedDict.fromkeys([
                i.split()[0] for i in column_combo_lists
                if i.split()[0] not in ignore_columns
            ]))
        column_combo_lists1 = list(
            OrderedDict.fromkeys([
                i.split()[1] for i in column_combo_lists
                if i.split()[1] not in ignore_columns
            ]))

        # sort the lists
        column_combo_lists0.sort()
        column_combo_lists1.sort()

        # create an empty matrix
        # wharf_matrix = np.zeros((len(column_combo_lists0), len(column_combo_lists0)))

        # fill nan values
        # dat.fillna(0)

        # compute the wharf coefficient for each column combination
        # iterate over a copy so removing col1 below does not skip elements
        for col1 in list(column_combo_lists0):

            for col2 in column_combo_lists1:

                if col1 == col2:
                    continue

                if col2 in column_combo_lists0:

                    # compute the wharf coefficient
                    w = (dat.groupby([col1, col2])[col2].count()
                         .groupby(level=0).max()).sum() / dat.shape[0]

                    # add the column combination to the dictionary
                    if w >= .9:
                        wharf_dict[col1 + '|||' + col2] = w

            # remove col1 so the reverse pairing (col2, col1) is not reported again
            column_combo_lists0.remove(col1)

    # create the name for the json file
    json_name = data_sample + '_relationships' + '.json'

    # find the file path for the json file to be written to
    json_path = os.path.join(os.path.normpath(dat_path + os.sep + os.pardir),
                             'results')

    # write the json file
    with open(os.path.join(json_path, json_name), 'w') as file:
        json.dump(wharf_dict, file, indent=2)

    return wharf_dict
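The coefficient computed above is effectively a functional-dependency score: for each value of col1, take the row count of its most frequent col2 partner, sum those counts, and divide by the total number of rows (1.0 means col1 fully determines col2). A small synthetic check of that formula (data invented for this sketch):

import pandas as pd

dat = pd.DataFrame({'country': ['NO', 'NO', 'NO', 'SE', 'SE'],
                    'currency': ['NOK', 'NOK', 'EUR', 'SEK', 'SEK']})
counts = dat.groupby(['country', 'currency'])['currency'].count()
w = counts.groupby(level=0).max().sum() / dat.shape[0]
print(w)  # (2 + 2) / 5 = 0.8 -> below the 0.9 threshold, so not reported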