def reports():
    """Plot size distributions per variant type and record type counts.

    Reads the names ``dfd``, ``rpath`` and ``report_dict`` from the enclosing
    (module) scope; none of them is created here, so they must exist before
    this function is called.  Stores the per-``var_type`` count table, as
    HTML, under ``report_dict['type_counts']``.
    """
    # Begin report generation
    groups = dfd.groupby('var_type')
    from plot import plot_dists
    for name, group in groups:
        plot_dists(group.sstop - group.sstart, name, rpath)
    # NOTE(review): the report is generated before 'type_counts' is added
    # below -- confirm this ordering is intended.
    generate_report(report_dict)
    type_count = dfd.groupby('var_type').agg(
        lambda x: x.shape[0]).loc[:, ['chr']]
    # ``.ix`` has been removed from pandas; ``.iloc`` is the positional
    # equivalent for selecting the first (only) column.
    var_percent = type_count.iloc[:, 0] / float(dfd.shape[0]) * 100
    type_count['var_percent'] = var_percent
    print(type_count)
    report_dict['type_counts'] = type_count.to_html()
def main():
    """Filter per-study variant tables, collapse duplicates and build a report.

    The config file named by ``sys.argv[1]`` supplies the input directory
    (``input.make_ref``), the size limit, the study include/exclude lists,
    the minimum study count and the report directory.  The function:

    1. loads ``<make_ref>tab/*.txt`` and size-filters each study in a
       worker pool (``filter_by_size``);
    2. drops rows placed on contigs and rows with a duplicated index;
    3. collapses variants sharing ``chr``/``var_type``/``sstart``/``sstop``
       and assigns each unique variant a ``uID``;
    4. maps every original row to its ``uID`` (``generate_unique_mapping``)
       and filters by study support (``groupby_study_numba``);
    5. writes ``filtered_no_dupes.txt`` and ``study_filtered_all.txt`` under
       ``make_ref`` and generates the report.

    Exceptions raised inside pool workers are re-raised here instead of
    being silently dropped.
    """
    config = ConfigParser.RawConfigParser()
    config.read(sys.argv[1])
    gpath = config.get('input', 'make_ref')
    size_limit = config.getfloat('params', 'max_size')
    files = glob.glob(gpath + "tab/*.txt")
    studies_include = config.get('params', 'studies_include')
    studies_exclude = config.get('params', 'studies_exclude').split(",")
    vartype_f = config.get('params', 'var_type')
    # An empty (or missing) include list means "accept every study".
    studies_include = studies_include.split(",") if studies_include else []

    suffix = ".txt"

    def study_of(path):
        # Remove the literal ".txt" suffix.  str.rstrip(".txt") would strip
        # any trailing '.', 't' or 'x' characters and mangle some names.
        fname = path.split("/")[-1]
        return fname[:-len(suffix)] if fname.endswith(suffix) else fname

    start = timeit.default_timer()
    pool = mp.Pool(8)
    # NOTE(review): only the first 20 globbed files are processed; this
    # looks like a debugging limit -- confirm before production runs.
    files = files[0:20]
    studies = [study_of(path) for path in files]
    pending = []
    for path in files:
        study = study_of(path)
        if study in studies_exclude:
            continue
        if studies_include and study not in studies_include:
            continue
        reader = pd.read_csv(path, sep="\t", index_col=0,
                             dtype={'chr': 'S5'})
        pending.append(pool.apply_async(filter_by_size, [reader, study],
                                        {'max_size': size_limit}))
    # Remove duplicated elements
    # ##### Step takes around 7 minutes ###################
    pool.close()
    pool.join()
    # Collect in submission order: keeps the concatenation order (and thus
    # which duplicate is kept later) deterministic, and re-raises any
    # exception that occurred in a worker.
    filtered = [result.get() for result in pending]
    df = pd.concat(filtered)
    print(vartype_f)
    stop = timeit.default_timer()
    print('Time to load in files and parse: {0!s}'.format(stop - start))
    p_studies = set(df.study)
    non_passed = [study for study in studies if study not in p_studies]
    print(('Studies that had no variants that did '
           'not pass size filtering:{0}').format("\t".join(non_passed)))
    # ############# HACK for now until we find out what is going on
    # Get rid of the contigs for now
    df = df.loc[df.contig.isnull(), :]
    # The GRc37 to 38 multiple mapping isn't resolved; need to discuss how
    # to deal with this.  For now keep only the first row per index value.
    df = df.loc[np.logical_not(df.index.duplicated()), :]
    # :TODO if sstart and sstop are the same, no matter if it was
    # originally annotated as inner_start or inner_stop it will be
    # collapsed.  For now, ignore fuzzy boundaries.
    dfd = df.drop_duplicates(['chr', 'var_type', 'sstart', 'sstop'],
                             inplace=False)
    new_unique_index = np.arange(dfd.shape[0])
    dfd.loc[:, 'uID'] = new_unique_index
    print('new index created')

    # This step takes forever
    start = timeit.default_timer()
    groups = df.groupby('chr')
    pool = mp.Pool(8)
    # A numba-based variant (generate_unique_mapping_numba) was sketched
    # here previously but is not used.
    pending = [
        pool.apply_async(generate_unique_mapping,
                         args=(dfd.loc[dfd.chr == name, :], group))
        for name, group in groups
    ]
    pool.close()
    pool.join()
    ns = pd.concat([result.get() for result in pending])
    # Refresh the stop time; otherwise the value from the loading stage
    # would be reused and the reported duration would be meaningless.
    stop = timeit.default_timer()
    print('Time to generate mapping: {0!s}'.format(stop - start))
    df['uID'] = ns

    report_dict = {}
    nstudies = config.getint('params', 'nstudies')
    start = timeit.default_timer()
    output = np.zeros(dfd.uID.shape[0], dtype=bool)
    std_filter = groupby_study_numba(df.uID.values, df.study, output,
                                     nstudies=nstudies)
    print(np.sum(std_filter))
    # Assumes std_filter is a boolean mask over the rows of dfd (``output``
    # is allocated as such above) -- TODO confirm against
    # groupby_study_numba.
    dfd = dfd.loc[std_filter, :]
    df = df.loc[df.uID.isin(dfd.uID), :]
    dfd.to_csv(gpath + 'filtered_no_dupes.txt', sep="\t")
    df.to_csv(gpath + 'study_filtered_all.txt', sep="\t")
    stop = timeit.default_timer()
    print('Time to run: {0!s}'.format(stop - start))

    groups = dfd.groupby('var_type')
    from plot import plot_dists
    # NOTE(review): report_dict is still empty at this point; the populated
    # report is generated at the end of the function.  Confirm whether this
    # first call is needed.
    generate_report(report_dict)
    rpath = config.get('output', 'report_dir')
    for name, group in groups:
        plot_dists(group.sstop - group.sstart, name, rpath)
    type_count = dfd.groupby('var_type').agg(
        lambda x: x.shape[0]).loc[:, ['chr']]
    var_percent = type_count.iloc[:, 0] / float(dfd.shape[0]) * 100
    # Series.round returns a new Series, so the rounded values must be
    # assigned for the rounding to take effect.
    type_count['var_percent'] = var_percent.round(2)
    report_dict['var_type_pivot'] = type_count.to_html()
    report_dict['studies'] = []
    report_dict['var_types'] = [name for name, _ in groups]
    generate_report(report_dict)
def main():
    """Filter per-study variant tables, collapse duplicates and build a report.

    The config file named by ``sys.argv[1]`` supplies the input directory
    (``input.make_ref``), the size limit, the study include/exclude lists,
    the minimum study count and the report directory.  The function:

    1. loads ``<make_ref>tab/*.txt`` and size-filters each study in a
       worker pool (``filter_by_size``);
    2. drops rows placed on contigs and rows with a duplicated index;
    3. collapses variants sharing ``chr``/``var_type``/``sstart``/``sstop``
       and assigns each unique variant a ``uID``;
    4. maps every original row to its ``uID`` (``generate_unique_mapping``)
       and filters by study support (``groupby_study_numba``);
    5. writes ``filtered_no_dupes.txt`` and ``study_filtered_all.txt`` under
       ``make_ref`` and generates the report.

    Exceptions raised inside pool workers are re-raised here instead of
    being silently dropped.
    """
    config = ConfigParser.RawConfigParser()
    config.read(sys.argv[1])
    gpath = config.get('input', 'make_ref')
    size_limit = config.getfloat('params', 'max_size')
    files = glob.glob(gpath + "tab/*.txt")
    studies_include = config.get('params', 'studies_include')
    studies_exclude = config.get('params', 'studies_exclude').split(",")
    vartype_f = config.get('params', 'var_type')
    # An empty (or missing) include list means "accept every study".
    studies_include = studies_include.split(",") if studies_include else []

    suffix = ".txt"

    def study_of(path):
        # Remove the literal ".txt" suffix.  str.rstrip(".txt") would strip
        # any trailing '.', 't' or 'x' characters and mangle some names.
        fname = path.split("/")[-1]
        return fname[:-len(suffix)] if fname.endswith(suffix) else fname

    start = timeit.default_timer()
    pool = mp.Pool(8)
    # NOTE(review): only the first 20 globbed files are processed; this
    # looks like a debugging limit -- confirm before production runs.
    files = files[0:20]
    studies = [study_of(path) for path in files]
    pending = []
    for path in files:
        study = study_of(path)
        if study in studies_exclude:
            continue
        if studies_include and study not in studies_include:
            continue
        reader = pd.read_csv(path, sep="\t", index_col=0,
                             dtype={'chr': 'S5'})
        pending.append(pool.apply_async(filter_by_size, [reader, study],
                                        {'max_size': size_limit}))
    # Remove duplicated elements
    # ##### Step takes around 7 minutes ###################
    pool.close()
    pool.join()
    # Collect in submission order: keeps the concatenation order (and thus
    # which duplicate is kept later) deterministic, and re-raises any
    # exception that occurred in a worker.
    filtered = [result.get() for result in pending]
    df = pd.concat(filtered)
    print(vartype_f)
    stop = timeit.default_timer()
    print('Time to load in files and parse: {0!s}'.format(stop - start))
    p_studies = set(df.study)
    non_passed = [study for study in studies if study not in p_studies]
    print(('Studies that had no variants that did '
           'not pass size filtering:{0}').format("\t".join(non_passed)))
    # ############# HACK for now until we find out what is going on
    # Get rid of the contigs for now
    df = df.loc[df.contig.isnull(), :]
    # The GRc37 to 38 multiple mapping isn't resolved; need to discuss how
    # to deal with this.  For now keep only the first row per index value.
    df = df.loc[np.logical_not(df.index.duplicated()), :]
    # :TODO if sstart and sstop are the same, no matter if it was
    # originally annotated as inner_start or inner_stop it will be
    # collapsed.  For now, ignore fuzzy boundaries.
    dfd = df.drop_duplicates(['chr', 'var_type', 'sstart', 'sstop'],
                             inplace=False)
    new_unique_index = np.arange(dfd.shape[0])
    dfd.loc[:, 'uID'] = new_unique_index
    print('new index created')

    # This step takes forever
    start = timeit.default_timer()
    groups = df.groupby('chr')
    pool = mp.Pool(8)
    # A numba-based variant (generate_unique_mapping_numba) was sketched
    # here previously but is not used.
    pending = [
        pool.apply_async(generate_unique_mapping,
                         args=(dfd.loc[dfd.chr == name, :], group))
        for name, group in groups
    ]
    pool.close()
    pool.join()
    ns = pd.concat([result.get() for result in pending])
    # Refresh the stop time; otherwise the value from the loading stage
    # would be reused and the reported duration would be meaningless.
    stop = timeit.default_timer()
    print('Time to generate mapping: {0!s}'.format(stop - start))
    df['uID'] = ns

    report_dict = {}
    nstudies = config.getint('params', 'nstudies')
    start = timeit.default_timer()
    output = np.zeros(dfd.uID.shape[0], dtype=bool)
    std_filter = groupby_study_numba(df.uID.values, df.study, output,
                                     nstudies=nstudies)
    print(np.sum(std_filter))
    # Assumes std_filter is a boolean mask over the rows of dfd (``output``
    # is allocated as such above) -- TODO confirm against
    # groupby_study_numba.
    dfd = dfd.loc[std_filter, :]
    df = df.loc[df.uID.isin(dfd.uID), :]
    dfd.to_csv(gpath + 'filtered_no_dupes.txt', sep="\t")
    df.to_csv(gpath + 'study_filtered_all.txt', sep="\t")
    stop = timeit.default_timer()
    print('Time to run: {0!s}'.format(stop - start))

    groups = dfd.groupby('var_type')
    from plot import plot_dists
    # NOTE(review): report_dict is still empty at this point; the populated
    # report is generated at the end of the function.  Confirm whether this
    # first call is needed.
    generate_report(report_dict)
    rpath = config.get('output', 'report_dir')
    for name, group in groups:
        plot_dists(group.sstop - group.sstart, name, rpath)
    type_count = dfd.groupby('var_type').agg(
        lambda x: x.shape[0]).loc[:, ['chr']]
    var_percent = type_count.iloc[:, 0] / float(dfd.shape[0]) * 100
    # Series.round returns a new Series, so the rounded values must be
    # assigned for the rounding to take effect.
    type_count['var_percent'] = var_percent.round(2)
    report_dict['var_type_pivot'] = type_count.to_html()
    report_dict['studies'] = []
    report_dict['var_types'] = [name for name, _ in groups]
    generate_report(report_dict)