def transcript_stats(value, args):
    """Print a breakdown of the rows in ``transcript_variation.txt.gz``.

    Reports the total line count of the file plus the counts (and their
    share of the total) obtained from ``count_lines`` with ``grep='ENST'``
    and ``grep='LRG_'``; whatever remains is reported as "Other".

    Args:
        value: Switch for the report; nothing is done when it is falsy.
        args: Object whose ``location`` attribute is the string prefix the
            file name is appended to. It is concatenated verbatim, so it
            presumably has to end with a path separator -- TODO confirm.

    Returns:
        None. The statistics are written to stdout.
    """
    if not value:
        return

    print('Generating transcripts statistics')

    filename = args.location + 'transcript_variation.txt.gz'

    # Named ``total`` so the builtin ``all`` is not shadowed.
    total = count_lines(filename)
    enst = count_lines(filename, grep='ENST')
    lrg = count_lines(filename, grep='LRG_')
    other = total - enst - lrg

    def percent(count):
        # The label says "percent", so scale the ratio by 100. An empty
        # file reports 0.0 instead of raising ZeroDivisionError.
        return 100.0 * count / total if total else 0.0

    print('All transcripts: %s ' % total)
    print('ENST transcripts: %s (%s percent)' % (enst, percent(enst)))
    print('LRG transcripts: %s (%s percent)' % (lrg, percent(lrg)))
    print('Other transcripts: %s (%s percent)' % (other, percent(other)))
def load_transcript_strands(loc):
    """Build a lookup from column 14 to the integer in column 6 of ``transcript.txt.gz``.

    Each line of the file is split on tabs. Judging by the function name,
    column 14 is presumably the transcript identifier and column 6 its
    strand -- TODO confirm against the table schema.

    Args:
        loc: String prefix that ``'transcript.txt.gz'`` is appended to.

    Returns:
        dict mapping the column-14 string to ``int`` of column 6.
    """
    path = loc + 'transcript.txt.gz'
    strands = {}

    with fast_gzip_read(path) as handle:
        # The line count is only used to size the progress bar.
        rows = tqdm(handle, total=count_lines(path))
        for row in rows:
            fields = row.split('\t')
            strand = int(fields[6])
            strands[fields[14]] = strand

    return strands
def load_chromosome_and_region_names(loc):
    """Build a lookup from integer region id to region name from ``seq_region.txt.gz``.

    Args:
        loc: String prefix that ``'seq_region.txt.gz'`` is appended to.

    Returns:
        dict mapping ``int`` of the first tab-separated column to the
        second column of every line in the file.
    """
    path = loc + 'seq_region.txt.gz'

    # File columns: seq_region_id, name, coord_system_fk
    names_by_id = {}

    with fast_gzip_read(path) as handle:
        # The line count is only used to size the progress bar.
        rows = tqdm(handle, total=count_lines(path))
        for row in rows:
            fields = row.split('\t')
            name = fields[1]
            names_by_id[int(fields[0])] = name

    return names_by_id
def load_variation_sources(loc):
    """Build a lookup from integer source id to source name from ``source.txt.gz``.

    Args:
        loc: String prefix that ``'source.txt.gz'`` is appended to.

    Returns:
        dict mapping ``int`` of the first tab-separated column to the
        second column of every line in the file.
    """
    path = loc + 'source.txt.gz'

    # File columns: 'source_id', 'name', 'version', 'description', 'url',
    # 'type', 'somatic_status', 'data_types'
    names_by_id = {}

    with fast_gzip_read(path) as handle:
        # The line count is only used to size the progress bar.
        rows = tqdm(handle, total=count_lines(path))
        for row in rows:
            fields = row.split('\t')
            name = fields[1]
            names_by_id[int(fields[0])] = name

    # Explicit collection pass once loading has finished.
    gc.collect()
    return names_by_id
def count_spidex():
    """Return ``count_lines`` applied to the file at ``SPIDEX_LOCATION``."""
    # NOTE(review): the import is function-local in the original as well,
    # presumably to avoid an import cycle or import-time cost -- confirm
    # before hoisting it to module level.
    from multiprocess import count_lines as line_counter

    spidex_line_count = line_counter(SPIDEX_LOCATION)
    return spidex_line_count
def count_all(tissues_list, path=DEFAULT_PATH, suffix=DEFAULT_SUFFIX):
    """Sum ``count_lines`` over every file yielded by ``expression_file_paths``.

    Args:
        tissues_list: Passed through to ``expression_file_paths``.
        path: Passed through to ``expression_file_paths``.
        suffix: Passed through to ``expression_file_paths``.

    Returns:
        The sum of the per-file line counts (0 when no files are yielded).
    """
    total = 0
    # The loop variable has its own name so it cannot be confused with the
    # ``path`` argument handed to ``expression_file_paths``.
    for file_path in expression_file_paths(tissues_list, path, suffix):
        total += count_lines(file_path)
    return total