Example 1
def do_panel_output(args):

   _validator = get_validator(files('schema_data.inputs').joinpath('panel.json'))
   _schema = _validator.schema

   wb = Workbook()
   default_names = wb.sheetnames
   wb.add_named_style(highlight)

   _oname = args.panel_output

   # Start with the Metadata. Write the header and the value names

   ws1 = wb.create_sheet(_schema['properties']['parameters']['title'])
   _write_parameters(ws1,_schema['properties']['parameters'])
   _fix_width(ws1)


   # Now let's make the Panel. Write the header only.
   ws2 = wb.create_sheet(_schema['properties']['markers']['title'])
   _write_repeating(ws2,_schema['properties']['markers'])
   _fix_width(ws2)

   # Clean up the workbook by deleting the default sheet(s)
   for _sheet_name in default_names:
      del wb[_sheet_name]
   wb.save(filename=_oname)
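
# Note: _write_parameters, _write_repeating, _fix_width, and the 'highlight'
# named style are module helpers that are not shown in these examples. A
# minimal sketch of what _fix_width plausibly does (an assumption, not the
# package's actual implementation): widen each column to fit its longest cell.
from openpyxl.utils import get_column_letter

def _fix_width(ws, min_width=8, padding=2):
   # widen every column of the worksheet to fit its longest cell value
   for _idx, _column in enumerate(ws.columns, start=1):
      _longest = max((len(str(_c.value)) for _c in _column if _c.value is not None), default=0)
      ws.column_dimensions[get_column_letter(_idx)].width = max(min_width, _longest + padding)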
Example 2
def do_report_output(output_path):
   _validator = get_validator(files('schema_data.inputs').joinpath('report_definition.json'))
   _schema = _validator.schema
   wb = Workbook()
   default_names = wb.sheetnames
   wb.add_named_style(highlight)


   # Start with the Metadata. Write the header and the value names

   ws0 = wb.create_sheet(_schema['properties']['parameters']['title'])
   _write_parameters(ws0,_schema['properties']['parameters'])
   _fix_width(ws0)

   ws1 = wb.create_sheet(_schema['properties']['population_percentages']['title'])
   _write_repeating(ws1,_schema['properties']['population_percentages'])
   _fix_width(ws1)

   ws2 = wb.create_sheet(_schema['properties']['population_densities']['title'])
   _write_repeating(ws2,_schema['properties']['population_densities'])
   _fix_width(ws2)

   # Clean up the workbook by deleting the default sheet(s)
   for _sheet_name in default_names:
      del wb[_sheet_name]
   wb.save(filename=output_path)
Example 3
def do_project_folder_output(output_file):
   # For now let's keep this InForm-only
   _validator = get_validator(files('schema_data.inputs.platforms.InForm').joinpath('project.json'))
   _schema = _validator.schema

   wb = Workbook()
   default_names = wb.sheetnames
   wb.add_named_style(highlight)

   # Start with the Metadata. Write the header and the value names

   ws1 = wb.create_sheet(_schema['properties']['parameters']['title'])
   _write_parameters(ws1,_schema['properties']['parameters'])
   _fix_width(ws1)


   # Now let's make the Samples sheet. Write the header only.
   ws2 = wb.create_sheet(_schema['properties']['samples']['title'])
   _write_repeating(ws2,_schema['properties']['samples'])
   _fix_width(ws2)

   # Clean up the workbook by deleting the default sheet(s)
   for _sheet_name in default_names:
      del wb[_sheet_name]
   wb.save(filename=output_file)
Example 4
def do_analysis_output(output_file):
   _validator1 = get_validator(files('schema_data.inputs').joinpath('panel.json'))
   _validator2 = get_validator(files('schema_data.inputs.platforms.InForm').joinpath('analysis.json'))
   _schema1 = _validator1.schema
   _schema2 = _validator2.schema

   wb = Workbook()
   default_names = wb.sheetnames
   wb.add_named_style(highlight)


   # Start with the Metadata. Write the header and the value names

   ws0 = wb.create_sheet(_schema2['properties']['parameters']['title'])
   _write_parameters(ws0,[_schema1['properties']['parameters'],_schema2['properties']['parameters']])
   _fix_width(ws0)

   ws1 = wb.create_sheet(_schema1['properties']['markers']['title'])
   _write_repeating(ws1,_schema1['properties']['markers'])
   _fix_width(ws1)

   ws2 = wb.create_sheet(_schema2['properties']['inform_exports']['title'])
   _write_repeating(ws2,_schema2['properties']['inform_exports'])
   _fix_width(ws2)

   ws3 = wb.create_sheet(_schema2['properties']['mutually_exclusive_phenotypes']['title'])
   _write_repeating(ws3,_schema2['properties']['mutually_exclusive_phenotypes'])
   _fix_width(ws3)

   ws4 = wb.create_sheet(_schema2['properties']['binary_phenotypes']['title'])
   _write_repeating(ws4,_schema2['properties']['binary_phenotypes'])
   _fix_width(ws4)

   ws5 = wb.create_sheet(_schema2['properties']['regions']['title'])
   _write_repeating(ws5,_schema2['properties']['regions'])
   _fix_width(ws5)

   # Clean up the workbook by deleting the default sheet(s)
   for _sheet_name in default_names:
      del wb[_sheet_name]
   wb.save(filename=output_file)
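
# A usage sketch for these template writers (hypothetical wiring; the real CLI
# defines more arguments than shown):
import argparse

parser = argparse.ArgumentParser(description="write blank Excel input templates")
parser.add_argument('--panel_output', help="path to write the panel template")
args = parser.parse_args(['--panel_output', 'panel_template.xlsx'])

do_panel_output(args)                        # reads args.panel_output
do_report_output('report_template.xlsx')     # the others take an output path directly
do_analysis_output('analysis_template.xlsx')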
Example 5
def excel_to_json(
        excel_template_path,
        json_schema_path,
        sheet_names,
        parameter_sheet="Parameters",
        ignore_extra_parameters=True):
    """
    Read the analysis data from a filled-in template file
    into a json object compatible with its respective json-schema.
    Return the json object (or None), whether it is valid, and any errors.
    """
    _validator = get_validator(json_schema_path)
    wb = load_workbook(excel_template_path)

    # Create the object we will save the data in
    output = {}

    # Let's do the Parameters first
    _parameter_key, _parameters, parameters_success, parameters_errors = _read_parameters(
        parameter_sheet,
        wb,
        _validator.schema,
        ignore_extra_parameters=ignore_extra_parameters)
    output[_parameter_key] = _parameters

    total_success = parameters_success
    total_errors = list(parameters_errors)
    for sheet in sheet_names:
        # Let's do the Repeating fields next
        _repeating_key, _data, repeat_success, repeat_errors = _read_repeating(
            sheet, wb, _validator.schema)
        total_success = total_success and repeat_success
        total_errors += repeat_errors
        output[_repeating_key] = _data

    # Validate the fully assembled object against the schema
    pass_validation = True
    try:
        _validator.validate(instance=output)
    except Exception:  # a jsonschema ValidationError in practice
        pass_validation = False

    analysis_success = total_success and pass_validation
    if not analysis_success:
        output = None
    return output, analysis_success, total_errors
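
# A usage sketch for excel_to_json (hypothetical paths; the sheet names are
# illustrative and must match the tab names in the filled-in template):
from importlib_resources import files

analysis_json, ok, errors = excel_to_json(
    'analysis_template_filled.xlsx',
    files('schema_data.inputs.platforms.InForm').joinpath('analysis.json'),
    sheet_names=['Exports', 'Mutually Exclusive Phenotypes', 'Binary Phenotypes', 'Regions'])
if not ok:
    print(errors)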
Example 6
def create_valid_schema_format_test(filename, schemas_dir):
    # Pre: Take a schema filename
    # Post: Return a test that passes when the schema file is properly formatted json-schema
    result = False
    try:
        if os.path.exists(filename):
            with open(filename, 'rt') as f:
                json.loads(f.read())  # confirm the file parses as json
        validator = get_validator(filename, schemas_dir)
        result = True
    except Exception:
        result = False
    def do_test(self):
        self.assertTrue(result)
    return do_test
Example 7
def create_validated_example_test(example_filename, schema_filename, schemas_dir):
    # Pre: Take the example filename, the schema filename, and the schemas_dir
    # Post: Return a test that passes when the example is a valid instance of the schema
    result = False
    try:
        if os.path.exists(example_filename) and os.path.exists(schema_filename):
            with open(schema_filename, 'rt') as f:
                json.loads(f.read())  # confirm the schema itself parses as json
            validator = get_validator(schema_filename, schemas_dir)
            with open(example_filename, 'rt') as f:
                data = json.loads(f.read())
            result = True
            try:
                validator.validate(data)
            except Exception:
                result = False
    except Exception:
        result = False
    def do_test(self):
        self.assertTrue(result)
    return do_test
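
# These factories follow the standard dynamic-test pattern: build a closure per
# schema file and attach it to a unittest.TestCase subclass. A minimal sketch
# with hypothetical paths:
import os
import unittest

class TestSchemaFormats(unittest.TestCase):
    pass

for _schema_name in ['panel.json', 'report_definition.json']:
    setattr(TestSchemaFormats,
            'test_format_' + _schema_name.replace('.json', ''),
            create_valid_schema_format_test(
                os.path.join('schema_data', 'inputs', _schema_name),
                'schema_data'))

if __name__ == '__main__':
    unittest.main()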
Example 8
a. Make sure the project directory exists
b. Make sure there are only folders among the visible files
c. Make sure the visible folders are named exactly as the samples in the sample manifest
d. Make sure within each sample folder that all expected Exports exist
e. Make sure there aren't more folders buried deeper within these, only files
f. Make sure all files start with the name of the sample whose folder they are in, followed by an underscore (see the sketch below)
g. Make sure there are no image frames lacking cell seg data, since this is how we find what ROIs we have
h. Make sure that the required images for each annotation strategy are present

"""
import os, re, time, stat, hashlib, logging
from importlib_resources import files
from pythologist_schemas import get_validator

# Let's preload validators for the relevant schemas as globals
project_schema_validator = get_validator(
    files('schema_data.inputs.platforms.InForm').joinpath('project.json'))
analysis_schema_validator = get_validator(
    files('schema_data.inputs.platforms.InForm').joinpath('analysis.json'))
files_schema_validator = get_validator(
    files('schema_data.inputs.platforms.InForm').joinpath('files.json'))
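
# As an illustration of check (f) from the docstring above, a minimal sketch
# (a hypothetical helper, not the package's actual implementation):
def _check_sample_file_prefixes(project_directory, sample_name):
    """Return the files in a sample folder not named '<sample_name>_*'."""
    issues = []
    _sample_dir = os.path.join(project_directory, sample_name)
    for _fname in os.listdir(_sample_dir):
        if _fname.startswith('.'):
            continue  # the checks only apply to visible files
        if not _fname.startswith(sample_name + '_'):
            issues.append(_fname)
    return issues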


def injest_project(project_json, analysis_json, project_directory):
    """
    Read a path pointing to multiple InForm sample folders

    Args:
        project_json (dict): The json object as a valid project schema
        analysis_json (dict): The json object as a valid analysis schema
    Returns:
        samples (list): A list of json objects for the samples in the project
Example 9
def main(args):
    "We need to take the platform and return an appropriate input template"
    if args.verbose:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.WARNING)
    if args.cache_directory:
        if not os.path.exists(args.cache_directory):
            os.makedirs(args.cache_directory)
        if not os.path.isdir(args.cache_directory):
            raise ValueError("cache directory not a directory")
    logger = logging.getLogger("start run")
    run_id = str(uuid.uuid4())
    logger.info("run_id " + run_id)

    with open(args.input_json, 'rt') as f:
        inputs = json.loads(f.read())

    # Let's start by checking our inputs
    logger.info("check project json format")
    get_validator(files('schema_data.inputs.platforms.InForm').joinpath('project.json')).\
        validate(inputs['project'])
    logger.info("check analysis json format")
    get_validator(files('schema_data.inputs.platforms.InForm').joinpath('analysis.json')).\
        validate(inputs['analysis'])
    logger.info("check report json format")
    get_validator(files('schema_data.inputs').joinpath('report.json')).\
        validate(inputs['report'])
    logger.info("check panel json format")
    get_validator(files('schema_data.inputs').joinpath('panel.json')).\
        validate(inputs['panel'])
    _validator = get_validator(
        files('schema_data.inputs.platforms.InForm').joinpath('files.json'))
    for sample_input_json in inputs['sample_files']:
        logger.info("check sample files json format " +
                    str(sample_input_json['sample_name']))
        _validator.validate(sample_input_json)

    # Now let's step through sample-by-sample, executing the pipeline
    output = {
        'run_id': run_id,
        'time': str(datetime.now()),
        'project_name': inputs['project']['parameters']['project_name'],
        'report_name': inputs['report']['parameters']['report_name'],
        'report_version': inputs['report']['parameters']['report_version'],
        'analysis_name': inputs['analysis']['parameters']['analysis_name'],
        'analysis_version': inputs['analysis']['parameters']['analysis_version'],
        'panel_name': inputs['panel']['parameters']['panel_name'],
        'panel_version': inputs['panel']['parameters']['panel_version'],
        'sample_outputs': [
            execute_sample(x,
                           inputs,
                           run_id,
                           verbose=args.verbose,
                           cache_directory=args.cache_directory)
            for x in inputs['sample_files']
        ]
    }
    logger.info("Finished reading creating output. Validate output format.")
    _validator = get_validator(
        files('schema_data').joinpath('report_output.json'))
    _validator.validate(output)
    logger.info("Validated output schema against schema")
    #logger.info(str(output['sample_outputs']))
    if args.output_json:
        with open(args.output_json, 'wt') as of:
            of.write(json.dumps(output, allow_nan=False))
    return
Example 10
def main(args):
    "Read a validated report output json and extract it into an Excel workbook"
    if args.verbose:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.WARNING)
    logger = logging.getLogger("report extraction")
    with open(args.report_json, 'rt') as f:
        report = json.loads(f.read())
    logger.info("check report json format")
    get_validator(files('schema_data').joinpath('report_output.json')).\
        validate(report)
    logger.info("report json validated")

    # Let's start formulating the dataframes

    # cache these in lists here, then concatenate into dataframes at the end
    sheets = OrderedDict({
        'smp_cnt_cumulative_lf': [],
        'smp_cnt_aggregate_lf': [],
        'smp_pct_cumulative_lf': [],
        'smp_pct_aggregate_lf': [],
        'img_cnt_lf': [],
        'img_pct_lf': []
    })
    info = {}
    for sample in report['sample_outputs']:
        sample_name = sample['sample_name']
        _df = pd.DataFrame(sample['sample_reports']['sample_cumulative_count_densities'])
        _df['sample_name'] = sample_name
        sheets['smp_cnt_cumulative_lf'].append(_df)
        info['smp_cnt_cumulative_lf'] = {
            'index': False,
            'description': 'sample-level count density measurements treating all ROIs as a single large image in long table format.'
        }

        _df = pd.DataFrame(sample['sample_reports']['sample_aggregate_count_densities'])
        _df['sample_name'] = sample_name
        sheets['smp_cnt_aggregate_lf'].append(_df)
        info['smp_cnt_aggregate_lf'] = {
            'index': False,
            'description': 'sample-level count density measurements averaging the measures from ROIs in long table format.'
        }

        _df = pd.DataFrame(sample['sample_reports']['sample_cumulative_count_percentages'])
        _df['sample_name'] = sample_name
        sheets['smp_pct_cumulative_lf'].append(_df)
        info['smp_pct_cumulative_lf'] = {
            'index': False,
            'description': 'sample-level percentage measurements treating all ROIs as a single large image in long table format.'
        }

        _df = pd.DataFrame(sample['sample_reports']['sample_aggregate_count_percentages'])
        _df['sample_name'] = sample_name
        sheets['smp_pct_aggregate_lf'].append(_df)
        info['smp_pct_aggregate_lf'] = {
            'index': False,
            'description': 'sample-level percentage measurements averaging the measures from ROIs in long table format.'
        }

        # Now get the images
        for image in sample['images']:
            image_name = image['image_name']
            _df = pd.DataFrame(image['image_reports']['image_count_densities'])
            _df['sample_name'] = sample_name
            _df['image_name'] = image_name
            sheets['img_cnt_lf'].append(_df)
            info['img_cnt_lf'] = {
                'index': False,
                'description': 'image-level count density measurements in long table format.'
            }

            _df = pd.DataFrame(image['image_reports']['image_count_percentages'])
            _df['sample_name'] = sample_name
            _df['image_name'] = image_name
            sheets['img_pct_lf'].append(_df)
            info['img_pct_lf'] = {
                'index': False,
                'description': 'image-level percentage measurements in long table format.'
            }

    # concatenate each sheet's list of frames into a single dataframe
    for sheet_name in sheets:
        sheets[sheet_name] = pd.concat(sheets[sheet_name])


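    # The next two blocks convert the image-level long tables into matrices:
    # first assign each image a per-sample index (1..n, ordered by image name),
    # then pivot so rows are (region, population, sample) and columns are the
    # image indices. _prepend is a module helper (not shown) that presumably
    # inserts the new sheet at the front of the OrderedDict.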
    _nums = sheets['img_pct_lf'][['sample_name','image_name']].drop_duplicates().sort_values('image_name').reset_index(drop=True).\
        groupby('sample_name').apply(lambda x: pd.Series(dict(zip(
            x['image_name'],
            [y+1 for y in range(0,len(x['image_name']))]
        )))).reset_index().rename(columns={'level_1':'image_name',0:'idx'})
    _df = _nums.merge(sheets['img_pct_lf'],on=['sample_name','image_name']).set_index(['region_name','population_name','sample_name'])[['idx','percent']].\
        pivot(columns='idx')
    _df.columns = [str(x) for x in _df.columns.droplevel(0)]
    sheets = _prepend(sheets, 'img_pct_mat', _df)
    info['img_pct_mat'] = {
        'index': True,
        'description': 'image-level percentage measurement in matrix format.'
    }

    _nums = sheets['img_cnt_lf'][['sample_name','image_name']].drop_duplicates().sort_values('image_name').reset_index(drop=True).\
        groupby('sample_name').apply(lambda x: pd.Series(dict(zip(
            x['image_name'],
            [y+1 for y in range(0,len(x['image_name']))]
        )))).reset_index().rename(columns={'level_1':'image_name',0:'idx'})
    _df = _nums.merge(sheets['img_cnt_lf'],on=['sample_name','image_name']).set_index(['region_name','population_name','sample_name'])[['idx','density_mm2']].\
        pivot(columns='idx')
    _df.columns = [str(x) for x in _df.columns.droplevel(0)]
    sheets = _prepend(sheets, 'img_cnt_mat', _df)
    info['img_cnt_mat'] = {
        'index': True,
        'description':
        'image-level count density measurement in matrix format.'
    }

    # Add some classic style reports
    _df = sheets['smp_pct_cumulative_lf'].\
        set_index(['region_name','sample_name','image_count'])[['cumulative_percent','population_name']].\
        pivot(columns='population_name')
    sheets = _prepend(sheets, 'smp_pct_cumulative_mat', _df)
    info['smp_pct_cumulative_mat'] = {
        'index':
        True,
        'description':
        'sample-level percentage measurements treating all ROIs as a single large image in matrix format.'
    }

    # Add some classic style reports
    _df = sheets['smp_pct_aggregate_lf'].rename(columns={'aggregate_mean_percent':'mean_percent','aggregate_stderr_percent':'stderr_percent'}).\
        set_index(['region_name','sample_name','image_count'])[['mean_percent','stderr_percent','population_name']].\
        pivot(columns='population_name')
    _df = _df.swaplevel(axis=1).sort_index(axis=1)
    sheets = _prepend(sheets, 'smp_pct_aggregate_mat', _df)
    info['smp_pct_aggregate_mat'] = {
        'index':
        True,
        'description':
        'sample-level percentage measurements averaging the measures from ROIs in matrix format.'
    }

    # Add some classic style reports
    _df = sheets['smp_cnt_cumulative_lf'].\
        set_index(['region_name','sample_name','image_count'])[['cumulative_density_mm2','population_name']].\
        pivot(columns='population_name')
    sheets = _prepend(sheets, 'smp_cnt_cumulative_mat', _df)
    info['smp_cnt_cumulative_mat'] = {
        'index':
        True,
        'description':
        'sample-level count density measurements treating all ROIs as a single large image in matrix format.'
    }

    # Add some classic style reports
    _df = sheets['smp_cnt_aggregate_lf'].rename(columns={'aggregate_mean_density_mm2':'mean_density_mm2','aggregate_stderr_density_mm2':'stderr_density_mm2'}).\
        set_index(['region_name','sample_name','image_count'])[['mean_density_mm2','stderr_density_mm2','population_name']].\
        pivot(columns='population_name')
    _df = _df.swaplevel(axis=1).sort_index(axis=1)
    sheets = _prepend(sheets, 'smp_cnt_aggregate_mat', _df)
    info['smp_cnt_aggregate_mat'] = {
        'index':
        True,
        'description':
        'sample-level count density measurements averaging the measures from ROIs in matrix format.'
    }

    writer = pd.ExcelWriter(args.output_excel, engine='xlsxwriter')

    for sheetname, df in sheets.items():  # loop through `dict` of dataframes
        df.to_excel(writer,
                    sheet_name=sheetname,
                    float_format="%.2f",
                    index=info[sheetname]['index'])  # send df to writer
        worksheet = writer.sheets[sheetname]  # pull worksheet object
        for idx, clen in enumerate([20]*(df.index.nlevels if info[sheetname]['index'] else 0)+\
                                   [len(x)+1 if isinstance(x,str) else max([len(y) for y in x])+1 for x in df.columns]):  # loop through all columns
            max_len = max(8, clen * 1.1)
            worksheet.set_column(idx, idx, max_len)  # set column width

    writer.close()  # close() also saves; ExcelWriter.save() was removed in pandas 2.0

    return
Example 11
import json
from importlib_resources import files
from pythologist_schemas import get_validator

report_definition_schema_validator = get_validator(
    files('schema_data.inputs').joinpath('report_definition.json'))
report_schema_validator = get_validator(
    files('schema_data.inputs').joinpath('report.json'))


def convert_report_definition_to_report(report_definition_json):
    """
    Take a report_definition and return a report in valid json format
    """
    report_definition_schema_validator.validate(report_definition_json)

    # Deep copy of the definition to modify; parameters carry over directly
    output = json.loads(json.dumps(report_definition_json))
    # Do the regions
    for i, measure in enumerate(output['region_selection']):
        # should always have at least one region to combine, so no worries about NoneType
        output['region_selection'][i]['regions_to_combine'] = [
            x.strip() for x in measure['regions_to_combine'].split(',')
        ]

    # Start with the mutually exclusive phenotypes
    for i, measure in enumerate(output['population_densities']):
        # should always have at least one mutually exclusive phenotype, so no worries about NoneType
        output['population_densities'][i]['mutually_exclusive_phenotypes'] = [
            x.strip()