Code Example #1
File: analysis_file.py  Project: NikaAb/FaIR
def get_analysis_info(json_paths, sample_set_id):
    '''Return the information from the analysis files of the given sample
    set, restricted to the fields named by the provided json paths.
    '''
    analysis_file = get_analysis_from_sample_set(sample_set_id)
    results = []

    for analysis in analysis_file:
        filename = defs.DIR_RESULTS + analysis.analysis_file
        results.append(vidjil_utils.extract_fields_from_json(json_paths, None, filename))
    return results
Code Example #2
File: analysis_file.py  Project: vidjil/vidjil
def get_analysis_info(json_paths, sample_set_id):
    """Return the information in the analysis files for the given patient
    under the provided json paths.
    """
    analysis_file = get_analysis_from_sample_set(sample_set_id)
    results = []

    for analysis in analysis_file:
        filename = defs.DIR_RESULTS + analysis.analysis_file
        results.append(vidjil_utils.extract_fields_from_json(json_paths, None, filename))
    return results
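Both examples delegate the actual field extraction to `vidjil_utils.extract_fields_from_json`, which is not shown on this page. As a rough illustration of what such a helper could look like, the sketch below resolves paths such as `/clones[0]/name` or `reads/distribution/0.1` against a parsed JSON file. The path syntax, the role of the `pos` argument, and the error handling are all assumptions inferred from the call sites in these examples, not the actual vidjil implementation.

import json
import re

def extract_fields_from_json(json_paths, pos, filename, max_bytes=None):
    # Hypothetical sketch, not the vidjil implementation.
    # json_paths maps an output key to a path such as '/clones[0]/name';
    # pos, when not None, selects one entry from any list value (as with
    # the per-sample lists in a fused file).
    with open(filename) as handle:
        data = json.loads(handle.read(max_bytes) if max_bytes else handle.read())

    results = {}
    for key, path in json_paths.items():
        node = data
        try:
            for part in path.strip('/').split('/'):
                # split 'clones[0]' into the key 'clones' and the index 0
                m = re.match(r'([^[]+)(?:\[(\d+)\])?$', part)
                node = node[m.group(1)]
                if m.group(2) is not None:
                    node = node[int(m.group(2))]
            if pos is not None and isinstance(node, list):
                node = node[pos]
            results[key] = node
        except (KeyError, IndexError, TypeError):
            pass  # a missing field is simply left out of the result
    return results

With a helper of this shape, `get_analysis_info` reduces to mapping it over every analysis file attached to the sample set, which is exactly what both examples above do.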
Code Example #3
def stats():
    # Relies on names from the enclosing web2py controller: re, time, defs,
    # vidjil_utils, db, log, request, custom(), STATS_READLINES, STATS_MAXBYTES.
    start = time.time()

    d = custom()

    stats_regex = [
        # found 771265 40-windows in 2620561 segments (85.4%) inside 3068713 sequences # before 1f501e13 (-> 2015.05)
        r'in (?P<seg>\d+) segments \((?P<seg_ratio>.*?)\) inside (?P<reads>\d+) sequences',

        # found 10750 50-windows in 13139 reads (99.9% of 13153 reads)
        r'windows in (?P<seg>\d+) reads \((?P<seg_ratio>.*?) of (?P<reads>\d+) reads\)',

        # segmentation causes
        r'log.* SEG_[+].*?-> (?P<SEG_plus>.*?)\n',
        r'log.* SEG_[-].*?-> (?P<SEG_minus>.*?)\n',
    ]

    # stats by locus
    for locus in defs.LOCUS:
        locus_regex = locus.replace('+', '[+]')
        locus_group = locus.replace('+', 'p')
        stats_regex += [
            r'log.* %(locus)s.*?->\s*?(?P<%(locus_g)s_reads>\d+)\s+(?P<%(locus_g)s_av_len>[0-9.]+)\s+(?P<%(locus_g)s_clones>\d+)\s+(?P<%(locus_g)s_av_reads>[0-9.]+)\s*\n'
            % {
                'locus': locus_regex,
                'locus_g': locus_group
            }
        ]

    json_paths = {
        'result_file': {
            'main_clone': '/clones[0]/name',
            'main_clone_reads': '/clones[0]/reads[0]'
        },
        'fused_file': {
            'reads distribution [>= 10%]': 'reads/distribution/0.1',
            'reads distribution [>= 1% < 10%]': 'reads/distribution/0.01',
            'reads distribution [>= .01% < 1%]': 'reads/distribution/0.001',
            'reads distribution [>= .001% < .01%]':
            'reads/distribution/0.0001',
            'reads distribution [>= .0001% < .001%]':
            'reads/distribution/0.00001',
            'producer': 'samples/producer'
        }
    }

    keys_patient = ['info']
    keys_file = ['sampling_date', 'size_file']

    keys = []
    keys += keys_file
    keys += keys_patient

    regex = []
    for sr in stats_regex:
        r = re.compile(sr)
        regex += [r]
        keys += r.groupindex.keys()

    # list() keeps this working on Python 3, where dict views cannot be added
    keys += sorted(list(json_paths['result_file']) +
                   list(json_paths['fused_file']))

    for row in d['query']:
        found = {}
        results_f = row.results_file.data_file
        row_result = vidjil_utils.search_first_regex_in_file(
            regex, defs.DIR_RESULTS + results_f, STATS_READLINES)
        try:
            row_result_json = vidjil_utils.extract_fields_from_json(
                json_paths['result_file'], None, defs.DIR_RESULTS + results_f,
                STATS_MAXBYTES)
        except Exception:
            # unreadable or truncated result file: no extra fields for this row
            row_result_json = {}

        # most recent fused file for this sample set and config (web2py DAL:
        # orderby=~id sorts descending, limitby=(0, 1) keeps only the first row)
        fused_file = db((db.fused_file.sample_set_id == row.sample_set.id) & (
            db.fused_file.config_id == row.results_file.config_id)).select(
                orderby=~db.fused_file.id, limitby=(0, 1))
        if len(fused_file) > 0 and fused_file[0].sequence_file_list is not None:
            # sample ids are stored as a '_'-separated string; the position of
            # this sample selects its values in the per-sample fused JSON lists
            sequence_file_list = fused_file[0].sequence_file_list.split('_')
            try:
                pos_in_list = sequence_file_list.index(
                    str(row.sequence_file.id))
                row_fused = vidjil_utils.extract_fields_from_json(
                    json_paths['fused_file'], pos_in_list,
                    defs.DIR_RESULTS + fused_file[0].fused_file,
                    STATS_MAXBYTES)
            except ValueError:
                row_fused = {}
        else:
            row_fused = {}
        results_list = [row_result, row_result_json, row_fused]
        for key in keys:
            for map_result in results_list:
                if key in map_result:
                    row[key] = map_result[key]
                    found[key] = True
            if key not in found:
                if key in keys_patient:
                    row[key] = row.patient[key]
                    found[key] = True
                elif key in keys_file:
                    row[key] = row.sequence_file[key]
                    found[key] = True
                else:
                    row[key] = ''

    # Re-process some data
    keys += ['IGH_av_clones']
    for row in d['query']:
        row['IGH_av_clones'] = ''
        if 'IGH_av_reads' in row:
            try:
                # the average clone size is the inverse of the average number
                # of reads per clone
                row['IGH_av_clones'] = '%.4f' % (1.0 /
                                                 float(row['IGH_av_reads']))
                # 'found' still holds the flags of the last row processed above,
                # so this registers the new column for the filtering step below
                found['IGH_av_clones'] = True
            except (ValueError, ZeroDivisionError):
                pass

    # Keep only non-empty columns
    d['stats'] = []
    for key in keys:
        if key in found:
            d['stats'] += [key]

    log.debug("patient/stats (%.3fs) %s" %
              (time.time() - start, request.vars["filter"]))
    return d
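`vidjil_utils.search_first_regex_in_file` is likewise external to this snippet. Judging from how `stats()` uses its result (a mapping queried with `key in map_result`), a minimal version could scan at most `limit` lines of the log file and record the first value captured for each named group. The sketch below is that assumption made concrete, not the actual vidjil code.

def search_first_regex_in_file(regex_list, filename, limit):
    # Hypothetical sketch: regex_list holds compiled patterns with named
    # groups; the result maps each group name to its first captured value.
    found = {}
    with open(filename) as handle:
        for line_number, line in enumerate(handle):
            if line_number >= limit:
                break
            for r in regex_list:
                match = r.search(line)
                if match is None:
                    continue
                for group, value in match.groupdict().items():
                    if value is not None and group not in found:
                        found[group] = value
    return found

This also explains why `stats()` can treat `row_result`, `row_result_json` and `row_fused` uniformly: each is a plain mapping from column name to value.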