Example #1
def put(conn: socket, args=None):
    # recv file from client and write to file
    print('receiving file...')
    client_data = json.loads(_bytes_to_string(recv_msg(conn)))

    args['filename'] = os.path.join('server_files', args['filename'])

    data = client_data['data']
    if data is None:
        print("Problem: data received is None")
        return
    print("got the file data!: {} bytes".format(len(data)))

    if not os.path.isdir('./server_files'):
        os.mkdir('./server_files')

    filename = os.path.join('server_files', path_leaf(args['filename']))

    print('iv=', client_data['iv'])

    with open(filename, 'wb+') as f:
        plaintext = args['cipherfunc'](data=data,
                                       key=args['key'],
                                       decrypt=True,
                                       iv=client_data['iv'])
        f.write(plaintext)

    print('received file:', args['filename'])

    if os.path.isfile(filename):
        subprocess.Popen(r'explorer /select,"{}"'.format(args['filename']))
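Every example on this page leans on a path_leaf helper that returns the final component of a path; its definition is not shown here. A minimal sketch, assuming the widely used ntpath-based recipe (note that the ChronQC examples below, e.g. #7, #14 and #18, instead expect a path_leaf that returns a (directory, leaf) pair):

import ntpath

def path_leaf(path):
    # Return the last path component, even if the path ends with a separator.
    head, tail = ntpath.split(path)
    return tail or ntpath.basename(head)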
Example #2
def get_file_paths(filename_extension):
    if 'demo' in sys.argv:
        from settings import DEFAULT_PATHS
        return DEFAULT_PATHS

    while True:
        print 'Please, select a .{} file(s)'.format(filename_extension)
        root = Tkinter.Tk()
        root.withdraw()
        paths = tkFileDialog.askopenfilenames(parent=root,
                                              title='Choose a file(s)')
        for path in paths:
            if not check_filename_extension(
                    path=path, filename_extension=filename_extension):
                print('{} is not .{} file. Choose file(s) again.'.format(
                    path_leaf(path), filename_extension))
                paths = []
                root.update()
                root.destroy()
                break
        if not paths:
            continue
        print 'Your file(s):\n{}\nAll right?'.format('\n'.join(paths))
        while True:
            answer = raw_input('Print yes or no\n')
            if 'yes' == answer.strip().lower() or 'y' == answer.strip().lower(
            ):
                return paths
            elif 'no' == answer.strip().lower() or 'n' == answer.strip().lower(
            ):
                break
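Example #2 also calls a check_filename_extension helper that is not defined on this page; a minimal sketch, assuming it simply compares the file's suffix against the expected extension:

def check_filename_extension(path, filename_extension):
    # True when the path ends with the expected extension (case-insensitive).
    return path.lower().endswith('.' + filename_extension.lower())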
Example #3
    def callback(conn: socket):
        # receive data
        resp = json.loads(_bytes_to_string(recv_msg(conn)))

        if 'file_index' in args and args['file_index'] == True:
            args['filename'] = resp['filename']
            del args['file_index']

        if not os.path.isdir('./client_files'):
            os.mkdir('./client_files')

        filename = os.path.join('client_files', path_leaf(args['filename']))

        if os.path.isdir(filename):
            args['filename'] = os.path.join(args['filename'], resp['filename'])

        # === done preparing filesystem ===

        with open(filename, 'wb+') as f:
            plaintext = args['cipherfunc'](data=resp['data'],
                                           key=args['key'],
                                           decrypt=True,
                                           iv=resp['iv'])
            f.write(plaintext)
            if os.path.isfile(filename):
                subprocess.Popen(r'explorer /select,"{}"'.format(filename))
Example #4
def put(args: dict):

    args['iv'] = secrets.token_bytes(16)

    if 'file_index' in args and args[
            'file_index'] == True:  # if access-by-fileindex, then remove attr (to prevent issues) and get filename
        del args['file_index']
        file_index = int(args['filename'])
        args['filename'] = ls_local(args)[file_index]

    filename = os.path.join('client_files', path_leaf(args['filename']))

    if not os.path.isfile(filename):  # check if file exists
        print('ERROR: File "{}" doesn\'t exist'.format(filename))
        return

    def callback(conn: socket):
        ciphertext = b''
        with open(filename, 'rb') as f:
            data = f.read()
            ciphertext = args['cipherfunc'](data=data,
                                            key=args['key'],
                                            iv=args['iv'])

        return send_msg(
            conn,
            _string_to_bytes(
                json.dumps({
                    'filename': filename,
                    'data': _bytes_to_string(ciphertext),
                    'iv': _bytes_to_string(args['iv']),
                })))

    return send_command(args, callback)
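Examples #1, #3, #4 and #12 shuttle ciphertext and IVs through JSON using _bytes_to_string and _string_to_bytes helpers whose definitions are not shown. A minimal sketch, assuming a latin-1 round-trip (the original project could equally use base64 or hex):

def _string_to_bytes(text: str) -> bytes:
    # latin-1 maps every code point 0-255 to the identical byte value.
    return text.encode('latin-1')

def _bytes_to_string(data: bytes) -> str:
    return data.decode('latin-1')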
Example #5
 def moveTrackToGenreFolder(trackPath, folderName):
     pathFolderName = "{0}/{1}".format(MusicManager.__getRootFolder(),
                                       folderName)
     if not os.path.exists(pathFolderName):
         os.makedirs(pathFolderName)
     pathFolderNameWithTrackName = "{0}/{1}".format(
         pathFolderName, utils.path_leaf(trackPath))
     print("Copy {0}\t\tTO\t\t{1}".format(trackPath,
                                          pathFolderNameWithTrackName))
     shutil.copy(trackPath, pathFolderNameWithTrackName)
Example #6
def download_pdf(pdf_urls, dest_path, gdrive):
	if gdrive:
		for url in pdf_urls:
			gdrive_id = utils.get_gdrive_id(url)
			file_name = gdrive_id + ".pdf"
			utils.download_gdrive(gdrive_id, dest_path + file_name)
	else:
		for url in pdf_urls:
			download_script = "wget " + url
			move_script = "mv " + utils.path_leaf(url) + " " + dest_path
			os.system(download_script)
			os.system(move_script)
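For comparison, the non-Drive branch of Example #6 can avoid shelling out to wget and mv; a sketch using only the standard library, assuming the project's utils.path_leaf is importable:

import os
import urllib.request

import utils  # project helper module assumed to provide path_leaf


def download_pdf_stdlib(pdf_urls, dest_path):
    for url in pdf_urls:
        # Save each PDF directly under dest_path, named after the URL's leaf.
        target = os.path.join(dest_path, utils.path_leaf(url))
        urllib.request.urlretrieve(url, target)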
Example #7
def main(args):
    global config_data
    now = time.strftime("%c")

    ## parse input argument
    #check_argument()

    config_path = op.abspath(args.config_file)
    output_directory, output_prefix = utils.path_leaf(config_path)

    # read the config file
    file = utils.custparser()
    file.read(args.config_file)
    config_data = file.as_dict()
    logfile = op.join(output_directory, 'chronqc_crongen.log')
    #print(logfile)
    logging.basicConfig(filename=op.join(output_directory,
                                         'chronqc_crongen.log'),
                        level=logging.DEBUG)
    #print(output_directory)
    logging.info('STARTED crongen on %s' % now)

    try:
        ## set output directory and directory to be displayed in email
        to_directory = config_data["iomanip"]["destination"]
        display_directory = ""
        if ("display_destination" in config_data["iomanip"].keys()) and (
                config_data["iomanip"]["display_destination"] != ""):
            display_directory = config_data["iomanip"]["display_destination"]
        else:
            display_directory = to_directory
        ## make directory for this month
        curr_date = time.strftime("%d_%b_%Y")
        to_directory = os.path.join(to_directory, curr_date)
        display_directory = os.path.join(display_directory, curr_date)
        logging.info('ABS_PATH: %s DISPLAY_PATH: %s' %
                     (to_directory, display_directory))
        link_dict = call_plots(to_directory)

        ## email users
        compose_mail(link_dict, display_directory)
    except Exception:
        logging.error(traceback.format_exc())
        alert_admin(traceback.format_exc())
        logging.info(
            'Error encountered while creating ChronQC plots: please see {0} for details.'
            .format(logfile))
        #print('Error encountered while creating ChronQC plots: please see {0} for details.'.format(logfile))
        sys.exit(1)
    logging.info(
        'Completed creating ChronQC plots: please see {0} for details.'.format(
            logfile))
Example #8
def pdf2json(file_paths, json_out, squash=True, titles=None):
    data = {'data': [], 'version': '1.1'}

    for i, file_path in enumerate(file_paths):
        _, paragraphs = pdf_reader.read_pdf(file_path, squash)
        title = titles[i] if titles else utils.path_leaf(file_path)
        doc = {
            'paragraphs': [],
            'title': title,
            'department': '',
            'chapter': ''
        }
        for p in paragraphs:
            doc['paragraphs'].append({'context': p, 'qas': []})
        data['data'].append(doc)

    with open(json_out, 'w') as f:
        json.dump(data, f)
Example #9
def thread_target(eula_file, predict_method):

    print("============ eula_file : ", eula_file, " ============")
    """
    clause_list = get_content(eula_file)
    clause_dic = convert_to_clauses(clause_list)
    """
    _, _, clause_dic, _ = get_content(eula_file)

    clause_list = list(clause_dic.values())
    clauses_key = list(clause_dic.keys())

    Y = predict_method(clause_list)

    labels, probabilities = [], []

    for y in Y:
        a = max(y)
        probabilities.append(a)
        labels.append(y.index(a))

    file_name = path_leaf(path=eula_file)
    file_name, extension = os.path.splitext(file_name)

    file_name = file_name + "_" + extension.replace(".", "")

    csv_file = file_name + ".csv"

    if os.path.isfile(csv_file):
        i = 1
        while os.path.isfile(file_name + '.' + str(i) + ".csv"):
            i += 1
        csv_file = file_name + '.' + str(i) + '.csv'

    print("============ csv_file : ", csv_file, " ============")
    print()

    #pd.DataFrame(zip(clauses_key, clause_list, labels, probabilities)).to_csv(csv_file, header= ["clauses_id" ,"clauses", "labels", "probabilities"])
    pd.DataFrame(zip(clause_list, labels, probabilities)).to_csv(
        csv_file, header=["clauses", "labels", "probabilities"])
Example #10
def create_horizontal_cut(path, data):
    print(path)
    data['nc_file'] = nc4.Dataset(path, mode='r')

    u_theta, v_theta, w_theta = get_theta_wind_matrix(path=path)

    if u_theta is None or v_theta is None or w_theta is None:
        return None

    data['file_name'] = path_leaf(path)
    data['hour'] = get_hour_from_nc_file(data=data)
    max_level = u_theta.shape[0] - 1
    if 'start_height' not in data.keys() or 'end_height' not in data.keys():
        data['start_height'], data['end_height'] = get_heights(
            max_level=max_level)

    for height in range(data.get('start_height'), data.get('end_height')):
        data['height'] = height
        plot_horizontal_cut(u=u_theta[height],
                            v=v_theta[height],
                            w=w_theta[height],
                            data=data)
Example #11
def read_pdf(file_path, squash=True, verbose=True):
	if verbose:
		print("Parsing", file_path)
	file_name = utils.path_leaf(file_path)
	if file_name[-4:] != ".pdf":
		raise TypeError("Expecting input of pdf file")

	paragraphs = []
	title = file_name[:-4]

	pdf_file_in = open(file_path, 'rb')
	rsrcmgr = PDFResourceManager()
	retstr = io.StringIO()
	laparams = LAParams()
	device = TextConverter(rsrcmgr, retstr, laparams=laparams)
	interpreter = PDFPageInterpreter(rsrcmgr, device)

	page_no = 0
	for pageNumber, page in enumerate(PDFPage.get_pages(pdf_file_in)):
		if pageNumber == page_no:
			if verbose and pageNumber % 10 == 0:
				print("Read page", str(pageNumber))
			interpreter.process_page(page)
			data = retstr.getvalue()
			data = ''.join(x for x in data if x in string.printable)
			if squash:
				data = data.replace("\t", " ").replace("\n", " ")	
			else:
				data = data.replace("\t", " </tab> ").replace("\n", " </newline> ")
			data = re.sub("\s\s+" , " ", data)                       
			paragraphs.append(data)
			retstr.truncate(0)
			retstr.seek(0)
		page_no += 1

	pdf_file_in.close()

	return title, paragraphs
Example #12
def get(conn: socket, args=None):
    # send the file to client
    if args['file_index']:
        args['filename'] = os.listdir('server_files')[int(args['filename'])]

    iv = secrets.token_bytes(16)
    print('iv=', iv)

    filename = os.path.join('server_files', path_leaf(args['filename']))
    with open(filename, 'rb') as f:
        plaintext = f.read()
        ciphertext = args['cipherfunc'](data=plaintext, key=args['key'], iv=iv)

    print("finished reading file \"{}\", {}B".format(filename,
                                                     len(ciphertext)))

    return send_msg(
        conn,
        _string_to_bytes(
            json.dumps({
                'filename': filename,
                'data': _bytes_to_string(ciphertext),
                'iv': _bytes_to_string(iv),
            })))
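Examples #1, #3, #4 and #12 also depend on send_msg and recv_msg, which are not defined on this page. A hypothetical length-prefixed framing that matches the way they are called (the real project may frame its messages differently):

import struct

def _recv_exact(conn, n: int) -> bytes:
    # Keep reading until exactly n bytes have arrived.
    buf = b''
    while len(buf) < n:
        chunk = conn.recv(n - len(buf))
        if not chunk:
            raise ConnectionError('socket closed mid-message')
        buf += chunk
    return buf

def send_msg(conn, payload: bytes):
    # 4-byte big-endian length header followed by the payload.
    conn.sendall(struct.pack('>I', len(payload)) + payload)

def recv_msg(conn) -> bytes:
    (length,) = struct.unpack('>I', _recv_exact(conn, 4))
    return _recv_exact(conn, length)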
Example #13
if AUTO_CREATE_DIRS:
    pathlib.Path(path_head(SAVE_PREDS_PATH)).mkdir(parents=True, exist_ok=True)
    pathlib.Path(path_head(SAVE_RESULTS_PATH)).mkdir(parents=True,
                                                     exist_ok=True)

##------------------------------------------------------------------##
## Load (and save) data
##------------------------------------------------------------------##

# Load labels
labels = np.load(LABELS_PATH)

# Detect all the precomputed time point data frames
precomputed_df_paths = glob.glob(join(PRECOMPUTED_DIR_PATH,
                                      "time_point_*.csv"))
get_tp = lambda p: int(path_leaf(p).split("_")[-1].split(".")[0])
# Add time point info and sort by it
precomputed_df_paths = [(get_tp(p), p) for p in precomputed_df_paths]
precomputed_df_paths.sort(key=lambda x: int(x[0]))

if DEV_MODE:
    precomputed_df_paths = precomputed_df_paths[:5]

# Load the precomputed data frames
time_point_dfs = [(tp, pd.read_csv(path)) for tp, path in precomputed_df_paths]

##------------------------------------------------------------------##
## Running CV on all time points for a single participant
##------------------------------------------------------------------##

# Number of trials
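Example #13 pairs path_leaf with a path_head helper when creating output directories; a minimal sketch, assuming it is simply the other half of the ntpath split shown earlier:

import ntpath

def path_head(path):
    # Return everything before the last path component.
    head, _tail = ntpath.split(path)
    return head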
Example #14
def main(args):
    """
    (args) -> html
    takes number of arguments and produces interactive chronqc html report
    """
    db = op.abspath(args.db)
    panel = args.panel
    templates_dir = op.abspath(op.join(op.dirname(__file__), 'templates'))

    # output dir and file
    # Get output directory 1. user defined 2. db dir 3. multiqc_stats dir
    # Output file name
    day = date.today().strftime("%d_%b_%Y")
    if args.prefix is not None:
        prefix = '{0}.{1}.{2}.{3}'.format(args.prefix, panel, 'chronqc', day)
    else:
        prefix = '{0}.{1}.{2}'.format(panel, 'chronqc', day)

    # Get output file
    if args.output is not None:
        output_directory = op.abspath(args.output)
    else:
        output_directory, output_prefix = utils.path_leaf(db)

    output_directory = op.join(output_directory, "chronqc_output")
    if not op.exists(output_directory):
        os.makedirs(output_directory)
    elif op.exists(output_directory) and not args.force:
        # logger.fatal("Output directory %s already exists", output_directory)
        print(
            "FATAL: Output directory {0} already exists, use -f to overwrite".
            format(output_directory))
        sys.exit(1)
    elif op.exists(output_directory) and args.force:
        pass
    # html report
    out_file = op.join(output_directory, "%s.html" % prefix)

    # create logger
    log_file = op.join(output_directory, 'chronqc.log')

    logging.basicConfig(filename=log_file,
                        format='%(asctime)s - %(name)s - %(levelname)s - \
                        %(message)s')
    logger = logging.getLogger('chronqc')
    logger.setLevel(logging.DEBUG)

    # create console handler and set level to debug
    ch = logging.StreamHandler()
    ch.setLevel(logging.CRITICAL)

    # create formatter
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s -'
                                  '%(message)s')

    # add formatter to ch
    ch.setFormatter(formatter)

    # add ch to logger
    logger.addHandler(ch)
    logger.info("Started chronqc {0}".format(day))
    # read plot config
    f = op.abspath(args.json)
    try:
        config = json.load(io.open(f, 'r', encoding='utf-8-sig'), strict=False)
        logger.info("Got required parameters for chronqc")
    except ValueError:
        e = sys.exc_info()[1]
        logger.critical("FATAL: Error in JSON file {0}:{1}".format(
            e, op.abspath(args.json)))
        sys.exit(1)

    # enddate = date.today() + relativedelta(months=+1)
    # enddate = enddate.strftime('%Y-%m-%d')
    # Create dictionary of data to be filled in the html file
    datetime = date.today()
    vals = {
        'htmltemplates': '',
        'calendartemplates': '',
        'javascripttemplate': '',
        'sidebartemplates': '',
        'j': '$j',
        'panel': panel,
        'startdate': '$startdate',
        'enddate': '$enddate',
        'datetime': datetime,
        'pdfname': '$pdfname',
        'table': '$table',
        'headers': '$headers',
        'rows': '$rows',
        'row': '$row',
        'cols': '$cols',
        'col': '$col',
        'text': '$text'
    }
    i = 1
    chart_ids = []
    group_ids = {}
    for chart in config:
        chart_id = 'g' + str(i)
        chart_ids.append(chart_id)
        table = chart.get('table_name', 'chronqc_stats_data')
        i = i + 1
        vals['htmltemplates'] = vals[
            'htmltemplates'] + '$' + chart_id + '_html' + '\n'
        vals['calendartemplates'] = vals[
            'calendartemplates'] + '$' + chart_id + '_calendar' + '\n'
        vals['javascripttemplate'] = vals[
            'javascripttemplate'] + '$' + chart_id + '_js' + '\n'
        group_side = '<p class="nav-item2"> {0}</p>\n'.format(
            table.replace('_', ' ').title())
        #vals['sidebartemplates'] = vals['sidebartemplates'] + '$' + chart_id + '_sidebar' + '\n'
        if table not in group_ids:
            group_ids[table] = ['$' + chart_id + '_sidebar']
            vals['sidebartemplates'] = vals[
                'sidebartemplates'] + group_side + '$' + chart_id + '_sidebar' + '\n'
        else:
            vals['sidebartemplates'] = vals[
                'sidebartemplates'] + '$' + chart_id + '_sidebar' + '\n'

    # SUBSTITUTION 1: create a template based on number of plots to be plotted
    tmpl = string.Template(
        open(op.join(templates_dir, "base_template.html")).read())
    tmpl = tmpl.substitute(**vals)
    logger.info("Finished creating base template based on number of plots")
    print('Started ChronQC')
    # SUBSTITUTION 2: for all plots to be plotted do data processing
    #   and substitute values in html, calander and js templates
    i = 1
    for chart in config:
        chart_id = 'g' + str(i)
        i = i + 1
        table = chart.get('table_name', 'chronqc_stats_data')
        column_name = chart["chart_properties"]["y_value"]
        include_samples = chart.get('include_samples', 'all')
        exclude_samples = chart.get('exclude_samples', '')
        per_sample = chart["chart_properties"].get('per_sample', 'False')
        categories = chart["chart_properties"].get('categories', '')
        category_str = ''
        ylabel2 = ''
        df_chart_cumsum = ''
        logger.info("Plotting {0}".format(chart_id))
        # Fetch data from the sqlite database
        df = fetch_stats_data(db,
                              table,
                              panel,
                              categories=categories,
                              ColumnName=column_name)
        logger.info("For {0}: got total {1} records".format(chart_id, len(df)))
        if len(df) == 0:
            logger.critical(
                "FATAL: For {0} {1}: no records found in {2}".format(
                    chart_id, column_name, table))
            sys.exit(1)
        # keep only desired samples
        try:
            df = get_samples_data(df,
                                  include_samples,
                                  exclude_samples,
                                  per_sample=per_sample)
        except KeyError:
            e = sys.exc_info()[1]
            logger.critical("FATAL: no {0} column found in {1}".format(
                e, table))
            sys.exit(1)
        except Exception:
            e = sys.exc_info()[1]
            logger.critical("FATAL: please check {0} column in {1}".format(
                e, table))
            sys.exit(1)
        if len(df) == 0:
            logger.critical(
                "FATAL: For {0} {1}: no records found for {2}".format(
                    chart_id, column_name, include_samples))
            sys.exit(1)
        logger.info("For {0}: kept {1} records after filtering".format(
            chart_id, len(df)))
        # dates for display
        startdate_year = df.loc[len(df) - 1,
                                'Date'].date() + relativedelta(months=-12)
        start_df = df.loc[0, 'Date'].date()
        if startdate_year > start_df:
            startdate = startdate_year.strftime('%Y-%m-%d')
        else:
            start_df = start_df + relativedelta(months=-1)
            startdate = start_df.strftime('%Y-%m-%d')
        enddate = df.loc[len(df) - 1, 'Date'] + relativedelta(months=+1)
        vals['startdate'] = startdate
        vals['enddate'] = enddate
        # process y formatting
        y = process_y(column_name)
        # generate data in format for html
        if chart['chart_type'] == 'time_series_with_percentage_category':
            cat = chart["chart_properties"].get('category', 'PASS')
            t = '% Samples per run with {0} = {1}'.format(y, cat)
            y = '% {0} = {1}'.format(y, cat)
            chart_title = chart["chart_properties"].get('chart_title', t)
            y_label = chart["chart_properties"].get('y_label', y)
            js_tmpl = string.Template(
                open(op.join(templates_dir,
                             "percent_plot_threshold.txt")).read())
            if not column_name in df.columns:
                logger.critical("FATAL: no {0} column found in {1}".format(
                    column_name, table))
                sys.exit(1)
            df_chart = percentage_category(df, column_name, cat)
            logger.info(
                "For {0}: {1} data points will be written to html".format(
                    chart_id, len(df_chart)))
        elif chart[
                'chart_type'] == 'time_series_with_percentage_of_samples_above_threshold':
            threshold = chart["chart_properties"]["threshold"]
            t = '% Samples per run with {0} ≥ {1}'.format(y, threshold)
            y = '% {0} ≥ {1}'.format(y, threshold)
            chart_title = chart["chart_properties"].get('chart_title', t)
            y_label = chart["chart_properties"].get('y_label', y)
            js_tmpl = string.Template(
                open(op.join(templates_dir,
                             "percent_plot_threshold.txt")).read())
            if not column_name in df.columns:
                logger.critical("FATAL: no {0} column found in {1}".format(
                    column_name, table))
                sys.exit(1)
            df_chart = percentage_of_samples_above_threshold(
                df, column_name, threshold)
            logger.info(
                "For {0}: {1} data points will be written to html".format(
                    chart_id, len(df_chart)))
        elif chart['chart_type'] == 'time_series_with_mean_and_stdev':
            win = chart["chart_properties"].get('window', '365D')
            info = 'sample' if per_sample == 'True' else 'run'
            try:
                win = int(win)
                winf = "past {0} {1}s".format(win, info)
            except:
                winf = "past {0} {1}s".format(win, info)
            if win == '365D':
                winf = "past 1 year {}s".format(info)
            if per_sample == 'False':
                t = '{0} (Mean per run with {1} rolling mean and ±2 standard deviation)'.format(
                    y, winf)
                y = '{0} (Mean per run)'.format(y)
            else:
                t = '{0} (with {1} rolling mean and ±2 standard deviation)'.format(
                    y, winf)
                y = '{0}'.format(y)
            chart_title = chart["chart_properties"].get('chart_title', t)
            y_label = chart["chart_properties"].get('y_label', y)
            js_tmpl = string.Template(
                open(op.join(templates_dir, "mean_and_stdev.txt")).read())
            if not column_name in df.columns:
                logger.critical("FATAL: no {0} column found in {1}".format(
                    column_name, table))
                sys.exit(1)
            df_chart = mean_and_stdev(df,
                                      column_name,
                                      win=win,
                                      per_sample=per_sample)
            logger.info(
                "For {0}: {1} data points will be written to html".format(
                    chart_id, len(df_chart)))
        elif chart['chart_type'] == 'time_series_with_absolute_threshold':
            if per_sample == 'False':
                t = '{0} (Mean per run)'.format(y)
                y = '{0} (Mean per run)'.format(y)
            else:
                t = '{0}'.format(y)
                y = '{0}'.format(y)
            chart_title = chart["chart_properties"].get('chart_title', t)
            y_label = chart["chart_properties"].get('y_label', y)
            js_tmpl = string.Template(
                open(op.join(templates_dir, "absolute_threshold.txt")).read())
            lower_threshold = chart["chart_properties"].get(
                "lower_threshold", np.nan)
            upper_threshold = chart["chart_properties"].get(
                "upper_threshold", np.nan)
            Type = chart["chart_properties"].get("Type", '')
            if not column_name in df.columns:
                logger.critical("FATAL: no {0} column found in {1}".format(
                    column_name, table))
                sys.exit(1)
            df_chart = absolute_threshold(df,
                                          column_name,
                                          lower_threshold=lower_threshold,
                                          upper_threshold=upper_threshold,
                                          Type=Type,
                                          per_sample=per_sample)
            #df_chart.to_clipboard(sep=',')
            logger.info(
                "For {0}: {1} data points will be written to html".format(
                    chart_id, len(df_chart)))
        elif chart['chart_type'] == 'time_series_with_box_whisker_plot':
            t = '{0} Monthly Box-and-Whisker Plot'.format(y)
            y = '{0}'.format(y)
            chart_title = chart["chart_properties"].get('chart_title', t)
            y_label = chart["chart_properties"].get('y_label', y)
            Type = chart["chart_properties"].get("Type", '')
            lower_threshold = chart["chart_properties"].get(
                "lower_threshold", np.nan)
            upper_threshold = chart["chart_properties"].get(
                "upper_threshold", np.nan)
            js_tmpl = string.Template(
                open(op.join(templates_dir, "box_whisker_plot.txt")).read())
            if not column_name in df.columns:
                logger.critical("FATAL: no {0} column found in {1}".format(
                    column_name, table))
                sys.exit(1)
            if Type != '':
                df_chart = box_whisker_plot(df,
                                            column_name,
                                            Type=Type,
                                            lower_threshold=lower_threshold,
                                            upper_threshold=upper_threshold)
            else:
                df_chart = box_whisker_plot(df,
                                            column_name,
                                            lower_threshold=lower_threshold,
                                            upper_threshold=upper_threshold)
            logger.info(
                "For {0}: {1} data points will be written to html".format(
                    chart_id, len(df_chart)))
        elif chart['chart_type'] == 'time_series_with_bar_line_plot':
            if categories == '':
                logger.critical(
                    "FATAL: no categories defined in JSON for time_series_with_bar_line_plot"
                )
                sys.exit(1)
            t = 'Monthly bar and line plot for {0} ({1})'.format(y, categories)
            y = 'Monthly count'
            chart_title = chart["chart_properties"].get('chart_title', t)
            y_label = chart["chart_properties"].get('y_label', y)
            y2 = 'Monthly total'
            ylabel2 = chart["chart_properties"].get('y_label2', y2)
            js_tmpl = string.Template(
                open(op.join(templates_dir, "bar_line_plot.txt")).read())
            if not column_name in df.columns:
                logger.critical("FATAL: no {0} column found in {1}".format(
                    column_name, table))
                sys.exit(1)
            df_chart = bar_line_plot(df, column_name)
            categories = df_chart.columns
            category_str = ''
            x = 0
            while x < len(categories) - 1:
                category_str = category_str + '"{0}", '.format(categories[x])
                x = x + 1
            if x == len(categories) - 1:
                category_str = category_str + ' "{0}"'.format(categories[x])
            df_chart['Data'] = df_chart.values.tolist()
            df_chart = pd.DataFrame(df_chart['Data'])
            logger.info(
                "For {0}: {1} data points will be written to html".format(
                    chart_id, len(df_chart)))
        elif chart['chart_type'] == 'time_series_with_stacked_bar_plot':
            if categories == '':
                logger.critical(
                    "FATAL: no categories defined in JSON for time_series_with_stacked_bar_plot"
                )
                sys.exit(1)
            t = 'Monthly stacked bar plot for {0} ({1})'.format(y, categories)
            y = 'Monthly count'
            chart_title = chart["chart_properties"].get('chart_title', t)
            y_label = chart["chart_properties"].get('y_label', y)
            js_tmpl = string.Template(
                open(op.join(templates_dir, "stacked_bar_plot.txt")).read())
            if not column_name in df.columns:
                logger.critical("FATAL: no {0} column found in {1}".format(
                    column_name, table))
                sys.exit(1)
            df_chart, df_chart_cumsum = stacked_bar_plot(df, column_name)
            categories = df_chart_cumsum.columns
            category_str = ''
            x = 0
            while x < len(categories) - 1:
                category_str = category_str + '"{0}", '.format(categories[x])
                x = x + 1
            if x == len(categories) - 1:
                category_str = category_str + ' "{0}"'.format(categories[x])
            df_chart['Data'] = df_chart.values.tolist()
            df_chart = pd.DataFrame(df_chart['Data'])
            df_chart_cumsum['Data'] = df_chart_cumsum.values.tolist()
            df_chart_cumsum = pd.DataFrame(df_chart_cumsum['Data'])
            logger.info(
                "For {0}: {1} data points will be written to html".format(
                    chart_id, len(df_chart)))
        else:
            logger.critical(
                "For {0}: No suitable chart_type is defined check JSON".format(
                    chart_id))
            sys.exit(1)
        # keep data in dir
        download_title = process_title(chart_title)
        vals = create_dir(vals,
                          df_chart,
                          chart_id,
                          chart_title,
                          y_label,
                          startdate,
                          enddate,
                          categories=category_str,
                          ylabel2=ylabel2,
                          df_chart_cumsum=df_chart_cumsum,
                          per_sample=per_sample,
                          column_name=download_title)
        # html template
        html_tmpl = string.Template(
            open(op.join(templates_dir, "html.txt")).read())
        vals[chart_id + '_html'] = html_tmpl.substitute(
            **vals[chart_id + 'htmltemplates'])
        logger.info(
            "For {0}: Finished creating html template".format(chart_id))
        # calendar template
        #calendar_tmpl = string.Template(open(op.join(templates_dir, "calendar.txt")).read())
        #vals[chart_id + '_calendar'] = calendar_tmpl.substitute(**vals[chart_id + 'htmltemplates'])
        #logger.info("For {0}: Finished creating calendar template".format(chart_id))
        # js template
        vals[chart_id + '_js'] = js_tmpl.substitute(**vals[chart_id +
                                                           'htmltemplates'])
        logger.info("For {0}: Finished creating js template".format(chart_id))
        # side bar with header
        sidebar_tmpl = string.Template(
            open(op.join(templates_dir, "sidebar.txt")).read())
        vals[chart_id + '_sidebar'] = sidebar_tmpl.substitute(
            **vals[chart_id + 'htmltemplates'])
        utils.print_progress(i + 1,
                             len(config) + 2,
                             prefix='Running ChronQC',
                             decimals=1,
                             bar_length=50)
    vals['pdfname'] = "%s.pdf" % prefix
    # substitute vals in main template
    tmpl = string.Template(tmpl).substitute(**vals)
    with io.open(out_file, "w", encoding='utf8') as fh:
        fh.write(tmpl)
    logger.info("Finished creating {0} chronqc plots: {1}".format(
        i - 1, out_file))
    print("Finished creating {0} chronqc plots: {1}".format(i - 1, out_file))
Example #15
    data_dir = sys.argv[1]
    if (not os.path.exists(data_dir) or not os.path.isdir(data_dir)):
        print("path '" + data_dir + "' is not a valid directory!")
        sys.exit(1)

    # get data file list
    data_fnames = glob.glob(os.path.join(data_dir, "*B.csv"))
    if (len(data_fnames) == 0):
        print("no csv files found in", data_dir + ". Exiting now...")
        sys.exit(0)

    # combine
    header_items = []
    data_combined = None
    for i, data_fname in enumerate(data_fnames):

        print("processing", data_fname)
        header_items.append(path_leaf(data_fname))

        data = np.loadtxt(data_fname, delimiter=',', dtype=np.uint64)
        if (data_combined is None):  # lazy-init when we know a reasonable size
            data_combined = np.empty(
                [data.shape[0], len(data_fnames)], dtype=np.uint64)
        data_combined[:data.shape[0], i] = data

    # gen output file
    print("saving output to combined.csv")
    np.savetxt(os.path.join(data_dir, "combined.csv"),
               data_combined,
               delimiter=',',
               header=','.join(header_items))
Example #16
def load_nwb_from_data(dir_path):

    # Get all files and directories present in the path
    files = utils.get_subfiles(dir_path)
    dirs = utils.get_subdirs(dir_path)
    files = files + dirs

    # Open YAML file with keywords, extension and keywords to exclude if existing then dump all data in a dict
    if os.path.isfile(dir_path + "\\" + [subfile for subfile in files if "default" in subfile][0]):
        with open(dir_path + "\\" + [subfile for subfile in files if "default" in subfile][0], 'r') as stream:
            data = yaml.safe_load(stream)
        # Remove the file from the list of files and directories so it isn't found twice
        files.remove([subfile for subfile in files if "default" in subfile][0])
    else:
        data = None

    home_data = dict()
    # Look for another YAML file containing the keywords, extensions and keywords to exclude
    for file in files:
        if "yaml" not in file:
            continue
        # p is a placeholder until we know every yaml file name
        if "subject" not in file and "ophys" not in file and "data" not in file and "p" not in file:
            with open(dir_path + "\\" + file, 'r') as stream:
                home_data = yaml.safe_load(stream)

    # If 2 files are provided, the one given by the user will take the priority
    if data is not None:
        difference = set(list(data.keys())) - set(list(home_data.keys()))
        for i in list(difference):
            home_data[i] = data[i]
    # First we create the nwb file because it will be needed for everything
    converttonwb = home_data.pop("ConvertToNWB")

    filtered_list = []
    for i in converttonwb:
        # If no extension is provided it means we are looking for a directory, so we filter the list of files and
        # directories to only contain directories
        if not converttonwb[i].get("extension"):
            filtered_list = [file for file in files if "." not in file]
        # Filter the file list depending on the extension provided in the YAML file
        else:
            for extension in converttonwb[i].get("extension"):
                filtered_list.extend([file for file in files if extension in file])
            # print("Filter result : " + str(filtered_list) + " by extension : " + str(converttonwb[i].get("extension")))
        # Conditional loop to remove all files or directories not containing the keywords
        # or containing excluded keywords
        counter = 0
        while counter < len(filtered_list):
            delete = False
            for keyword in converttonwb[i].get("keyword"):
                if keyword not in filtered_list[counter]:
                    # print("Keyword not found in : " + str(filtered_list))
                    del filtered_list[counter]
                    # print("New list : " + str(filtered_list))
                    delete = True
            if not delete:
                for keyword_to_exclude in converttonwb[i].get("keyword_to_exclude"):
                    if keyword_to_exclude in filtered_list[counter]:
                        # print("Excluded keyword found in : " + str(filtered_list))
                        del filtered_list[counter]
                        # print("New list : " + str(filtered_list))
                        delete = True
            if not delete:
                counter += 1
        print("Files to pass for " + i + ": " + str(filtered_list))
        # If files were found respecting every element, add the whole path to pass them as arguments
        yaml_path = os.path.join(dir_path, filtered_list[0])

    nwb_file = test_cicada_test_paul.create_nwb_file(yaml_path)

    order_list = []
    if home_data.get("order"):
        order_list = home_data.pop("order")

    while order_list:
        next_class = order_list.pop(0)
        # Get classname then instantiate it
        classname = getattr(test_cicada_test_paul, next_class)
        converter = classname(nwb_file)
        # Initialize a dict to contain the arguments to call convert
        arg_dict = {}
        print("Class name : " + str(next_class))
        # Loop through all arguments of the convert of the corresponding class
        for j in home_data[next_class]:
            filtered_list = []
            # If value is found it means the argument is not a file but a string/int/etc
            if home_data[next_class][j].get("value") and not home_data[next_class][j].get("extension") and \
                    (not home_data[next_class][j].get("keyword") or not home_data[next_class][j].get("keyword_to_exclude")):
                print(home_data[next_class][j].get("value")[0])
                arg_dict[j] = home_data[next_class][j].get("value")[0]
            else:
                # If no extension is provided it means we are looking for a directory, so we filter the list of files and
                # directories to only contain directories
                if not home_data[next_class][j].get("extension"):
                    filtered_list = [file for file in files if "." not in file]
                # Filter the file list depending on the extension provided in the YAML file
                else:
                    for extension in home_data[next_class][j].get("extension"):
                        filtered_list.extend([file for file in files if extension in file])
                    # print("Filter result : " + str(filtered_list) + " by extension : " +
                    # str(home_data[i][j].get("extension")))

                # Conditional loop to remove all files or directories not containing the keywords
                # or containing excluded keywords
                counter = 0
                while counter < len(filtered_list):
                    delete = False
                    for keyword in home_data[next_class][j].get("keyword"):
                        if keyword not in filtered_list[counter]:
                            # print("Keyword not found in : " + str(filtered_list))
                            del filtered_list[counter]
                            # print("New list : " + str(filtered_list))
                            delete = True
                    if not delete:
                        for keyword_to_exclude in home_data[next_class][j].get("keyword_to_exclude"):
                            if keyword_to_exclude in filtered_list[counter]:
                                # print("Excluded keyword found in : " + str(filtered_list))
                                del filtered_list[counter]
                                # print("New list : " + str(filtered_list))
                                delete = True
                    if not delete:
                        counter += 1
                print("Files to pass for " + j + ": " + str(filtered_list))
                # If files were found respecting every element, add the whole path to pass them as arguments
                if filtered_list:
                    arg_dict[j] = os.path.join(dir_path, filtered_list[0])
                    if "mat" in home_data[next_class][j].get("extension") and home_data[next_class][j].get("value"):
                        arg_dict[j] = [arg_dict[j]] + list(home_data[next_class][j].get("value"))

                # If no file found, put the argument at None
                else:
                    arg_dict[j] = None
        # print("Arguments to pass : "******"Class name : " + str(i))
        # Loop through all arguments of the convert of the corresponding class
        for j in home_data[i]:
            filtered_list = []
            # If value is found it means the argument is not a file but a string/int/etc
            if home_data[i][j].get("value") and not home_data[i][j].get("extension") and \
                    (not home_data[i][j].get("keyword") or not home_data[i][j].get("keyword_to_exclude")):
                print(home_data[i][j].get("value")[0])
                arg_dict[j] = home_data[i][j].get("value")[0]
            else:
                # If no extension is provided it means we are looking for a directory, so we filter the list of files and
                # directories to only contain directories
                if not home_data[i][j].get("extension"):
                    filtered_list = [file for file in files if "." not in file]
                # Filter the file list depending on the extension provided in the YAML file
                else:
                    for extension in home_data[i][j].get("extension"):
                        filtered_list.extend([file for file in files if extension in file])
                    # print("Filter result : " + str(filtered_list) + " by extension : " +
                          # str(home_data[i][j].get("extension")))
                # Conditional loop to remove all files or directories not containing the keywords
                # or containing excluded keywords
                counter = 0
                while counter < len(filtered_list):
                    delete = False
                    for keyword in home_data[i][j].get("keyword"):
                        if keyword not in filtered_list[counter]:
                            # print("Keyword not found in : " + str(filtered_list))
                            del filtered_list[counter]
                            # print("New list : " + str(filtered_list))
                            delete = True
                    if not delete:
                        for keyword_to_exclude in home_data[i][j].get("keyword_to_exclude"):
                            if keyword_to_exclude in filtered_list[counter]:
                                # print("Excluded keyword found in : " + str(filtered_list))
                                del filtered_list[counter]
                                # print("New list : " + str(filtered_list))
                                delete = True
                    if not delete:
                        counter += 1
                print("Files to pass for " + j + ": " + str(filtered_list))
                # If files were found respecting every element, add the whole path to pass them as arguments
                if filtered_list:
                    arg_dict[j] = os.path.join(dir_path, filtered_list[0])
                    if "mat" in home_data[i][j].get("extension") and home_data[i][j].get("value"):
                        arg_dict[j] = [arg_dict[j]] + list(home_data[i][j].get("value"))

                # If no file found, put the argument at None
                else:
                    arg_dict[j] = None

        #print("Arguments to pass : "******".nwb"
    with test_cicada_test_paul.NWBHDF5IO(os.path.join(dir_path, nwb_name), 'w') as io:
        io.write(nwb_file)

    print("NWB file created at : " + str(os.path.join(dir_path, nwb_name)))
Example #17
def get_file_name(url, gdrive):
	if gdrive:
		gdrive_id = utils.get_gdrive_id(url)
		file_name = gdrive_id + ".pdf"
		return file_name
	return utils.path_leaf(url)
Example #18
def main(args):
    """
    (args) -> SQLitedb
    takes number of arguments and produces ChronQC SQLite database
    """
    if args.mode == 'update' and not args.db:
        print("can't update database {} without a -db argument".format(args.mode))
    elif args.mode == 'update' and args.prefix:
        print("can't use prefix in update mode so ignoring it")
    elif args.mode == 'create' and not args.output:
        print("provide output directory --output argument for creating db".format(args.mode))

# output dir and file
    # Get output directory 1. user defined 2. db dir 3. multiqc_stats dir
    # Get output file name prefix and out file name
    multiqc_stats = op.abspath(args.mstats)
    if args.db is not None:
        output_directory, output_prefix = utils.path_leaf(args.db)
        out_file = op.abspath(args.db)
    else:
        output_directory = op.abspath(args.output)
        output_directory = op.join(output_directory, "chronqc_db")
        if op.exists(output_directory) and not args.force:
            print("FATAL: Output directory {0} already exists, use -f to overwrite".format(output_directory))
            sys.exit(1)
        elif op.exists(output_directory) and args.force:
            pass
        if not op.exists(output_directory):
            os.makedirs(output_directory)
        output_prefix = '{0}.{1}'.format(args.prefix, 'chronqc.stats.sqlite') if args.prefix is not None else '{0}'.format('chronqc.stats.sqlite')
        out_file = op.join(output_directory, output_prefix)
        output_cols = '{0}.{1}'.format(args.prefix, 'chronqc.stats.cols.txt') if args.prefix is not None else '{0}'.format('chronqc.stats.cols.txt')
        out_cols = op.join(output_directory, output_cols)
        output_json = '{0}.{1}'.format(args.prefix, 'chronqc.default.json') if args.prefix is not None else '{0}'.format('chronqc.default.json')
        out_json = op.join(output_directory, output_json)

# create logger
    log_file = op.join(output_directory, 'chronqc_stats.log')
    logging.basicConfig(filename=log_file,
                        format='%(asctime)s - %(name)s - %(levelname)s \
                        - %(message)s')
    logger = logging.getLogger('chronqc')
    logger.setLevel(logging.DEBUG)

    # create console handler and set level to debug
    ch = logging.StreamHandler()
    ch.setLevel(logging.CRITICAL)

    # create formatter
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s -'
                                  '%(message)s')

    # add formatter to ch
    ch.setFormatter(formatter)

    # add ch to logger
    logger.addHandler(ch)
    logger.info("Started ChronQC {} SQLite db generation".format(out_file))

# Get parameters
    table_name = args.db_table
    table_name = table_name if table_name is not None else 'chronqc_stats_data'
    panel = args.panel
    if args.run_dir_level is not None:
        run_dir_level = int(args.run_dir_level)
    logger.info("Got required parameters for generating ChronQC SQLite db")
    utils.print_progress(1, 4, prefix='Running ChronQC', decimals=1, bar_length=50)
# Get run id and date info
    if args.run_date_info is not None:
        run_date_info = op.abspath(args.run_date_info)
        df_run = pd.read_csv(run_date_info, comment='#', chunksize=1000,
                             low_memory=False, iterator=True)
        df_run = pd.concat(list(df_run), ignore_index=True)
        logger.info("Generated run and date information from {0}".format(run_date_info))
    else:
        multiqc_sources = op.abspath(args.multiqc_sources)
        df_run = pd.read_csv(multiqc_sources, sep='\t', comment='#',
                             chunksize=1000, low_memory=False, iterator=True)
        df_run = pd.concat(list(df_run), ignore_index=True)
        df_run.rename(columns={'Sample Name': 'Sample'}, inplace=True)
        df_run_m = df_run.loc[df_run.Module.str.contains(args.module)]
        # stick to defined module (FASTQC)
        df_run = df_run_m.copy() if len(df_run_m) > 0 else df_run.copy()
        # Get date and run
        df_run['Source_path'] = df_run['Source'].apply(op.abspath)
        df_run['Date'] = df_run['Source_path'].apply(creation_date)
        multiqc_sources_time = time.localtime(op.getmtime(multiqc_sources))
        multiqc_sources_time = time.strftime("%m/%d/%Y", multiqc_sources_time)
        df_run['Date'].fillna(multiqc_sources_time, inplace=True)
        df_run['Run'] = df_run.apply(lambda row: updir(row['Source_path'], run_dir_level), axis=1)
        logger.info("Generated run and date information from {0}".format(multiqc_sources))
    df_run['Date'] = pd.to_datetime(df_run.Date, dayfirst=True)
    df_run.sort_values(by=['Date'], inplace=True)
    df_run.drop_duplicates(subset=['Sample'], inplace=True)
    if len(df_run) == 0:
        logger.critical("FATAL: For run and date information no records found")
        sys.exit(1)
    utils.print_progress(2, 4, prefix='Running ChronQC', decimals=1, bar_length=50)
# Read multiqc_stats
    df = pd.read_csv(multiqc_stats, sep='\t', comment='#', chunksize=1000,
                     low_memory=False, iterator=True)
    df = pd.concat(list(df), ignore_index=True)
    logger.info("Got {0} records from {1} for ChronQC SQLite db generation".format(len(df), multiqc_stats))
    if len(df) == 0:
        logger.critical("FATAL: No records found in {0}".format(multiqc_stats))
        sys.exit(1)
    utils.print_progress(3, 4, prefix='Running ChronQC', decimals=1, bar_length=50)

# Read config and get default parameters
    #sdir = op.dirname(op.abspath('__file__'))
    sdir = op.abspath(op.join(op.dirname(__file__), 'config'))
    config_file = op.join(sdir, 'chronqc.conf')
    Config.read(config_file)
    # [ignore_columns]
    ignore_columns = Config.get('ignore_columns', 'columns').split(',')
    ignore_columns = [s.strip() for s in ignore_columns]
    # [time_series_with_box_whisker_plot]
    box_whisker_plot = Config.get('time_series_with_box_whisker_plot', 'columns').split(',')
    box_whisker_plot = [s.strip() for s in box_whisker_plot]
    # [time_series_with_mean_and_stdev]
    mean_and_stdev = Config.get('time_series_with_mean_and_stdev', 'columns').split(',')
    mean_and_stdev = [s.strip() for s in mean_and_stdev]
    # [time_series_with_absolute_threshold]
    absolute_threshold_c = Config.get('time_series_with_absolute_threshold', 'columns').split(',')
    absolute_threshold_c = [s.strip() for s in absolute_threshold_c]
#    absolute_threshold = Config.get('time_series_with_absolute_threshold', 'threshods').split(',')
#    absolute_threshold = [int(s.strip()) for s in absolute_threshold]
    # [time_series_with_percentage_of_samples_above_threshold]
    percentage_of_samples_above_threshold_c = Config.get('time_series_with_percentage_of_samples_above_threshold', 'columns').split(',')
    percentage_of_samples_above_threshold_c = [s.strip() for s in percentage_of_samples_above_threshold_c]
#    percentage_of_samples_above_threshold = Config.get('time_series_with_percentage_of_samples_above_threshold', 'threshods').split(',')
#    percentage_of_samples_above_threshold = [int(s.strip()) for s in percentage_of_samples_above_threshold]
    # [time_series_with_percentage_category]
    percentage_category_c = Config.get('time_series_with_percentage_category', 'columns').split(',')
    percentage_category_c = [s.strip() for s in percentage_category_c]
#    percentage_category = Config.get('time_series_with_percentage_category', 'categories').split(',')
#    percentage_category = [s.strip() for s in percentage_category]
    logger.info("Finished reading parameters from config file for generating \
                chronqc db and json")

# remove unwanted columns
    cols = [col for col in list(df.columns)]
    cols = ['Sample'] + sorted(cols[1:])
    df = pd.DataFrame(df, columns=cols)

# process df for adding in to chronqc db
    # Add panel
    df['Panel'] = panel
    # Add run and date information
    df = pd.merge(df_run, df, left_on='Sample', right_on='Sample', how='inner')
    if len(df) == 0:
        logger.critical("FATAL: Run ID's do not match the sample information in {0}".format(multiqc_stats))
        sys.exit(1)
    df['Date'] = pd.to_datetime(df.Date, dayfirst=True)
    # remove blank spaces in column names
    df.columns = [x.strip().replace(' ', '_') for x in df.columns]
    logger.info("Kept {0} records after merging run, date and stats for ChronQC SQLite db".format(len(df)))
# convert boolean types This method will not work for object type column
#    booleandf = df.select_dtypes(include=[bool])
#    booleanDictionary = {True: 'TRUE', False: 'FALSE'}
#    for column in booleandf:
#        df[column] = df[column].map(booleanDictionary)        
    df.replace(to_replace=True, value='TRUE', inplace=True)
    df.replace(to_replace=False, value='FALSE', inplace=True)
# write db
    cnx = sqlite3.connect(out_file)
    if args.mode == 'create':
        df.to_sql(table_name, cnx, index=False, if_exists='replace', chunksize = 1000)
        out_cols = open(out_cols, 'w')
        for item in list(df.columns):
            out_cols.write("%s\n" % item)
        out_cols.close()
        # create default JSON file
        # only numeric columns
        df_num = df._get_numeric_data()
        num_cols = list(df_num)
        #############################################################
        default_json = []
        # absolute_threshold
        absolute_threshold_c = [c for c in absolute_threshold_c if c not in ignore_columns]
        absolute_threshold_c = [c for c in absolute_threshold_c if c in num_cols]
        abst = '{{"table_name": "{0}", "chart_type": "time_series_with_absolute_threshold", "chart_properties": {{"y_value": "{1}",  "lower_threshold": 30}}}}'
        for col in absolute_threshold_c:
            default_json.append(json.loads(abst.format(table_name, col)))  
        # percentage_of_samples_above_threshold
        percentage_of_samples_above_threshold_c = [c for c in percentage_of_samples_above_threshold_c if c not in ignore_columns]
        percentage_of_samples_above_threshold_c = [c for c in percentage_of_samples_above_threshold_c if c in num_cols]
        pctth = '{{"table_name": "{0}", "chart_type": "time_series_with_percentage_of_samples_above_threshold", "chart_properties": {{"y_value": "{1}",  "threshold": 30}}}}'
        for col in percentage_of_samples_above_threshold_c:
            default_json.append(json.loads(pctth.format(table_name, col)))  
        # percentage_category
        percentage_category_c = [c for c in percentage_category_c if c not in ignore_columns]
        percentage_category_c = [c for c in percentage_category_c if c in num_cols]
        pctcat = '{{"table_name": "{0}", "chart_type": "time_series_with_percentage_category", "chart_properties": {{"y_value": "{1}",  "category": "TRUE"}}}}'
        for col in percentage_category_c:
            default_json.append(json.loads(pctcat.format(table_name, col))) 
        # mean_and_stdev
        mean_and_stdev = [c for c in mean_and_stdev if c not in ignore_columns]
        mean_and_stdev = [c for c in mean_and_stdev if c in num_cols]
        # when two tools report equivalent metrics, keep only one of them
        if 'QualiMap_percentage_aligned' in mean_and_stdev and 'Bamtools_mapped_reads_pct' in mean_and_stdev:
            mean_and_stdev.remove('Bamtools_mapped_reads_pct')
        if 'FastQC_percent_gc' in mean_and_stdev and 'QualiMap_avg_gc' in mean_and_stdev:
            mean_and_stdev.remove('QualiMap_avg_gc')
        if 'QualiMap_mapped_reads' in mean_and_stdev and 'Samtools_Flagstat_mapped_passed' in mean_and_stdev:
            mean_and_stdev.remove('Samtools_Flagstat_mapped_passed')
        if 'FastQC_total_sequences' in mean_and_stdev and 'QualiMap_total_reads' in mean_and_stdev:
            mean_and_stdev.remove('QualiMap_total_reads')
        mstd = '{{"table_name": "{0}", "chart_type": "time_series_with_mean_and_stdev", "chart_properties": {{"y_value": "{1}"}}}}'
        for col in mean_and_stdev:
            default_json.append(json.loads(mstd.format(table_name, col)))  
        # box_whisker_plot
        box_whisker_plot = [c for c in box_whisker_plot if c not in ignore_columns]
        box_whisker_plot = [c for c in box_whisker_plot if c in num_cols]
        bwp = '{{"table_name": "{0}", "chart_type": "time_series_with_box_whisker_plot", "chart_properties": {{"y_value": "{1}"}}}}'
        for col in box_whisker_plot:
            default_json.append(json.loads(bwp.format(table_name, col)))  
        # remaining numeric columns default to mean_and_stdev charts
        num_cols = [c for c in num_cols if c not in ignore_columns]
        num_cols = [c for c in num_cols if c not in box_whisker_plot]
        num_cols = [c for c in num_cols if c not in mean_and_stdev]
        num_cols = [c for c in num_cols if c not in absolute_threshold_c]
        num_cols = [c for c in num_cols if c not in percentage_of_samples_above_threshold_c]
        num_cols = [c for c in num_cols if c not in percentage_category_c]
        for col in num_cols:
            default_json.append(json.loads(mstd.format(table_name, col)))
        with open(out_json, 'w') as out_json_file:
            json.dump(default_json, out_json_file, sort_keys=False, indent=4,
                      ensure_ascii=False)
        logger.info("Created ChronQC db: {0} with {1} records".format(out_file, len(df)))
        logger.info("Created ChronQC default JSON file: {0}. Customize the JSON as needed before generating ChronQC plots.".format(out_json))
    elif args.mode == 'update':
        df.to_sql(table_name, cnx, index=False, if_exists='append', chunksize=1000)
        logger.info("Updated ChronQC db: {0} with {1} records".format(out_file, len(df)))
    cnx.close()
    utils.print_progress(4, 4, prefix='Running ChronQC', decimals=1, bar_length=50)
    if args.mode == 'create':
        print("Created ChronQC db: {0} with {1} records".format(out_file, len(df)))
        print("Created ChronQC default JSON file: {0}. Customize the JSON as needed before generating ChronQC plots.".format(out_json))
    elif args.mode == 'update':
        print("Updated ChronQC db: {0} with {1} records".format(out_file, len(df)))
Exemple #19
0
import glob
import os
import sys

import matplotlib.pyplot as plt
import numpy as np


def plot_and_save(data, title, output_dir, percentile):
    # histogram the data, clipping the x-axis at the given percentile of the values
    plt.hist(data, range=(0, np.percentile(data, percentile)))

    plt.title(title)
    plt.ylabel('Count')
    plt.xlabel('Ticks')
    plt.xticks(rotation=45)

    # save out
    plt.savefig(os.path.join(output_dir, title), dpi=300, bbox_inches='tight')
    plt.close()


if __name__ == "__main__":

    # get the data directory path from argv
    if len(sys.argv) != 2:
        print("usage: {} <data_dir>".format(sys.argv[0]))
        sys.exit(1)

    data_dir = sys.argv[1]
    if not os.path.isdir(data_dir):
        print("path '" + data_dir + "' is not a valid directory!")
        sys.exit(1)

    path = os.path.join(data_dir, "*B.csv")

    # Plot each buffer twice: once unclipped, once clipped at the 99.5th percentile
    # to spread out the histogram (removes high outliers)
    for fname in glob.glob(path):
        data = np.loadtxt(fname, delimiter=",", dtype=np.uint64)
        buffer_name = path_leaf(fname)
        plot_and_save(data, buffer_name, IMAGES_DIR, 100)
        plot_and_save(data, buffer_name, IMAGES_ADJUSTED_DIR, 99.5)
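
# The path_leaf() helper used above is assumed to come from a local utils module;
# a common minimal implementation (an assumption, not taken from this source) is:
import ntpath


def path_leaf(path):
    # return the final path component, handling trailing slashes and both separator styles
    head, tail = ntpath.split(path)
    return tail or ntpath.basename(head)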
Exemple #20
0
def main(args):

    print('Feature extractor training.')
    print('CONFIGURATION:\t{}'.format(args.config))
    with open(args.config) as json_config_file:
        config = utils.AttrDict(json.load(json_config_file))

    # Set up output directory
    experiment_name = generate_experiment_name(config)
    model_dir = os.path.join(os.path.expanduser(config.output.output_dir), experiment_name)
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    print('Model directory: {}'.format(model_dir))

    config_filename = utils.path_leaf(args.config)
    copyfile(args.config, os.path.join(model_dir, config_filename))

    # CUDA for PyTorch
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda:0" if use_cuda else "cpu")
    # device = torch.device("cpu")

    # Get dataloaders
    train_loader = dataloaders.get_traindataloaders(config.train_dataset,
                                                    config)
    evaluators_list = dataloaders.get_evaluators(config.evaluation_datasets,
                                                 config)

    # Set up training model
    print('Building training model')
    if config.model.checkpoint:
        checkpoint_path = config.model.checkpoint_path
    else:
        checkpoint_path = None
    model = models.load_model(config.model.model_arch,
                              device,
                              checkpoint_path=checkpoint_path,
                              embedding_size=config.model.embedding_size,
                              imgnet_pretrained=config.model.pretrained_imagenet)

    optimizer = optim.SGD(model.parameters(), lr=config.hyperparameters.learning_rate, momentum=0.9, nesterov=True, weight_decay=2e-4)

    # scheduler = lr_scheduler.StepLR(optimizer, 5, gamma=0.1)
    # scheduler = lr_scheduler.ExponentialLR(optimizer, config.hyperparameters.learning_rate_decay_factor)
    scheduler = lr_scheduler.CosineAnnealingLR(optimizer, T_max=config.hyperparameters.n_epochs, eta_min=1e-6)

    plotter = utils.VisdomPlotter(config.visdom.server, env_name=experiment_name, port=config.visdom.port)

    miner = miners.FunctionSemihardTripletSelector(config.hyperparameters.margin, plotter)

    loss = nn.TripletMarginLoss(config.hyperparameters.margin, swap=config.hyperparameters.triplet_swap)

    my_trainer = trainer.Triplet_Trainer(model,
                                         miner,
                                         loss,
                                         optimizer,
                                         scheduler,
                                         device,
                                         plotter,
                                         config.hyperparameters.margin,
                                         config.model.embedding_size,
                                         evaluation.pair_evaluate,
                                         batch_size=config.hyperparameters.batch_size)

    # Loop over epochs
    epoch = 0
    print('Training Launched.')
    while epoch < config.hyperparameters.n_epochs:

        # Validation
        for evaluator in evaluators_list:
            print('\nEvaluation on {}'.format(evaluator.test_name))
            evaluator.evaluate(model,
                               device,
                               plotter=plotter,
                               epoch=epoch)

        # Training
        print('\nTrain Epoch {}'.format(epoch))
        my_trainer.Train_Epoch(train_loader, epoch)

        # Save model
        if not (epoch + 1) % config.output.save_interval:
            model_file_path = os.path.join(model_dir, 'model_{}.pth'.format(epoch))
            print('\nSave model at {}'.format(model_file_path))

            torch.save({'epoch': epoch,
                        'model_state_dict': utils.state_dict_to_cpu(model.state_dict()),
                        'optimizer_state_dict': optimizer.state_dict(),
                        'scheduler_state_dict': scheduler.state_dict(),
                        'embedding_size': config.model.embedding_size,
                        }, model_file_path)

        epoch += 1

    # Final save.
    model_file_path = os.path.join(model_dir, 'model_{}.pth'.format(epoch))
    print('\nSave model at {}'.format(model_file_path))
    torch.save({'epoch': epoch,
                'model_state_dict': utils.state_dict_to_cpu(model.state_dict()),
                'optimizer_state_dict': optimizer.state_dict(),
                'scheduler_state_dict': scheduler.state_dict(),
                'embedding_size': config.model.embedding_size,
                }, model_file_path)
    print('Finished.')

    return model
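
# Hedged companion sketch (not part of the original example): restoring one of the
# checkpoints written by torch.save() above. The keys match the dict saved in main();
# model, optimizer and scheduler are assumed to be constructed the same way as there.
import torch


def load_checkpoint(model, optimizer, scheduler, checkpoint_path, device):
    # load the saved dict and restore each component's state in place
    checkpoint = torch.load(checkpoint_path, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
    # resume from the epoch after the one stored in the checkpoint
    return checkpoint['epoch'] + 1, checkpoint['embedding_size']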