def get_analysis_info(json_paths, sample_set_id):
    """Return the information in the analysis files for the given sample set,
    extracted at the provided JSON paths.
    """
    analysis_files = get_analysis_from_sample_set(sample_set_id)
    results = []
    for analysis in analysis_files:
        filename = defs.DIR_RESULTS + analysis.analysis_file
        results.append(vidjil_utils.extract_fields_from_json(json_paths, None, filename))
    return results
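# A minimal usage sketch (the id and path below are illustrative, not from real
# data): pull the name of the main clone from every analysis file attached to
# sample set 42, with the same path syntax as json_paths['result_file'] in
# stats() below.
#
#     infos = get_analysis_info({'main_clone': '/clones[0]/name'}, 42)
#     for fields in infos:
#         print(fields)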
def stats():
    start = time.time()
    d = custom()

    stats_regex = [
        # found 771265 40-windows in 2620561 segments (85.4%) inside 3068713 sequences
        # before 1f501e13 (-> 2015.05)
        r'in (?P<seg>\d+) segments \((?P<seg_ratio>.*?)\) inside (?P<reads>\d+) sequences',

        # found 10750 50-windows in 13139 reads (99.9% of 13153 reads)
        r'windows in (?P<seg>\d+) reads \((?P<seg_ratio>.*?) of (?P<reads>\d+) reads\)',

        # segmentation causes
        r'log.* SEG_[+].*?-> (?P<SEG_plus>.*?).n',
        r'log.* SEG_[-].*?-> (?P<SEG_minus>.*?).n',
    ]

    # stats by locus: reads, average read length, clones, average reads per clone
    for locus in defs.LOCUS:
        locus_regex = locus.replace('+', '[+]')
        locus_group = locus.replace('+', 'p')  # '+' is not allowed in regex group names
        stats_regex += [
            r'log.* %(locus)s.*?->\s*?(?P<%(locus_g)s_reads>\d+)\s+(?P<%(locus_g)s_av_len>[0-9.]+)\s+(?P<%(locus_g)s_clones>\d+)\s+(?P<%(locus_g)s_av_reads>[0-9.]+)\s*.n'
            % {'locus': locus_regex, 'locus_g': locus_group}
        ]

    json_paths = {
        'result_file': {
            'main_clone': '/clones[0]/name',
            'main_clone_reads': '/clones[0]/reads[0]'
        },
        'fused_file': {
            'reads distribution [>= 10%]': 'reads/distribution/0.1',
            'reads distribution [>= 1% < 10%]': 'reads/distribution/0.01',
            'reads distribution [>= .01% < 1%]': 'reads/distribution/0.001',
            'reads distribution [>= .001% < .01%]': 'reads/distribution/0.0001',
            'reads distribution [>= .0001% < .001%]': 'reads/distribution/0.00001',
            'producer': 'samples/producer'
        }
    }

    keys_patient = ['info']
    keys_file = ['sampling_date', 'size_file']

    keys = []
    keys += keys_file
    keys += keys_patient

    regex = []
    for sr in stats_regex:
        r = re.compile(sr)
        regex += [r]
        keys += r.groupindex.keys()

    keys += sorted(list(json_paths['result_file'].keys()) + list(json_paths['fused_file'].keys()))

    for row in d['query']:
        found = {}

        # values parsed from the result file, from the log (regex) and from the JSON
        results_f = row.results_file.data_file
        row_result = vidjil_utils.search_first_regex_in_file(regex, defs.DIR_RESULTS + results_f, STATS_READLINES)
        try:
            row_result_json = vidjil_utils.extract_fields_from_json(json_paths['result_file'], None, defs.DIR_RESULTS + results_f, STATS_MAXBYTES)
        except Exception:
            row_result_json = {}

        # values extracted from the most recent fused file for this sample set and config
        fused_file = db((db.fused_file.sample_set_id == row.sample_set.id)
                        & (db.fused_file.config_id == row.results_file.config_id)).select(
                            orderby=~db.fused_file.id, limitby=(0, 1))
        if len(fused_file) > 0 and fused_file[0].sequence_file_list is not None:
            sequence_file_list = fused_file[0].sequence_file_list.split('_')
            try:
                # the position of the sample in the fused file selects its fields in the JSON
                pos_in_list = sequence_file_list.index(str(row.sequence_file.id))
                row_fused = vidjil_utils.extract_fields_from_json(json_paths['fused_file'], pos_in_list, defs.DIR_RESULTS + fused_file[0].fused_file, STATS_MAXBYTES)
            except ValueError:
                row_fused = {}
        else:
            row_fused = {}

        # fill each column from the first source defining it, then from the
        # database, and finally with an empty string
        results_list = [row_result, row_result_json, row_fused]
        for key in keys:
            for map_result in results_list:
                if key in map_result:
                    row[key] = map_result[key]
                    found[key] = True

            if key not in found:
                if key in keys_patient:
                    row[key] = row.patient[key]
                    found[key] = True
                elif key in keys_file:
                    row[key] = row.sequence_file[key]
                    found[key] = True
                else:
                    row[key] = ''

    # Re-process some data
    # NB: 'found' still holds the keys gathered for the last row processed above
    keys += ['IGH_av_clones']
    for row in d['query']:
        row['IGH_av_clones'] = ''
        if 'IGH_av_reads' in row:
            try:
                # inverse of the average number of reads per clone
                row['IGH_av_clones'] = '%.4f' % (1.0 / float(row['IGH_av_reads']))
                found['IGH_av_clones'] = True
            except Exception:
                pass

    # Keep only non-empty columns
    d['stats'] = []
    for key in keys:
        if key in found:
            d['stats'] += [key]

    log.debug("patient/stats (%.3fs) %s" % (time.time() - start, request.vars["filter"]))
    return d
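# A self-check sketch for the log-parsing patterns above. The helper below is
# ours (it is not wired into the app); it only demonstrates that the first two
# stats_regex entries capture the example log lines quoted in stats().
def _stats_regex_selfcheck():
    import re

    # pre-1f501e13 log format
    old_line = 'found 771265 40-windows in 2620561 segments (85.4%) inside 3068713 sequences'
    m = re.search(r'in (?P<seg>\d+) segments \((?P<seg_ratio>.*?)\) inside (?P<reads>\d+) sequences', old_line)
    assert m.groupdict() == {'seg': '2620561', 'seg_ratio': '85.4%', 'reads': '3068713'}

    # current log format
    new_line = 'found 10750 50-windows in 13139 reads (99.9% of 13153 reads)'
    m = re.search(r'windows in (?P<seg>\d+) reads \((?P<seg_ratio>.*?) of (?P<reads>\d+) reads\)', new_line)
    assert m.groupdict() == {'seg': '13139', 'seg_ratio': '99.9%', 'reads': '13153'}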