def main(job_no, coord_name, start, end, species, release, dir):
    start_time = time()
    if not os.path.exists(dir):
        os.makedirs(dir)
    LOGGER.log_file_path = dir + "/" + str(os.path.basename(__file__)) + '_' + job_no + ".log"
    LOGGER.log_args()
    LOGGER.log_message(get_file_hexdigest(__file__), label="Hex digest of script.".ljust(17))
    LOGGER.log_message('Name = ' + ensembldb3.__name__ + ', version = ' + ensembldb3.__version__,
                       label="Imported module".ljust(30))
    account = HostAccount(*os.environ['ENSEMBL_ACCOUNT'].split())
    ig_count, sequence_length = 0, 0
    genome = Genome(species, release=release, account=account, pool_recycle=3600)
    gene_count = 0
    gene_intervals = list()
    genes = genome.get_features(coord_name=coord_name, start=start, end=end, feature_types='gene')
    for gene in genes:
        if gene.location.coord_name != coord_name:
            break
        gene_count += 1
        gene_intervals.append((gene.location.start, gene.location.end))
    gene_intervals = sorted(gene_intervals, key=lambda x: x[1])
    intergenic = interval_complement(gene_intervals)
    intergenic_sequence = ""
    for ig_interval in intergenic:
        ig_count += 1
        sequence_length += ig_interval[1] - ig_interval[0]
        region = genome.get_region(coord_name=coord_name, start=ig_interval[0], end=ig_interval[1])
        intergenic_sequence = intergenic_sequence + 'XXXXXXXXXX' + str(region.seq)
    LOGGER.log_message(str(ig_count), label='Number of intergenic intervals processed'.ljust(30))
    LOGGER.log_message(str(sequence_length), label='Sequence length'.ljust(30))
    outfile_name = dir + '/intergenic_sequence_' + species + job_no + '.pklz'
    with gzip.open(outfile_name, 'wb') as outfile:
        pickle.dump(intergenic_sequence, outfile)
    outfile = open(outfile_name, 'r')
    LOGGER.output_file(outfile.name)
    outfile.close()
    duration = time() - start_time
    LOGGER.log_message("%.2f" % (duration / 60.), label="run duration (minutes)".ljust(30))
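# interval_complement is defined elsewhere in this repo; the sketch below is a
# guess at its contract, for readers of the loop above (the body is an
# assumption, not the repo's actual implementation): given the sorted gene
# intervals, return the gaps between them, i.e. the intergenic runs.
def interval_complement(intervals):
    """Return the gaps between (start, end) intervals, merging overlaps."""
    complement = []
    prev_end = None
    for start, end in sorted(intervals):
        if prev_end is not None and start > prev_end:
            complement.append((prev_end, start))
        prev_end = end if prev_end is None else max(prev_end, end)
    return complement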
def main(job_no, coord_name, start, end, species, release, folder):
    start_time = time()
    if not os.path.exists(folder):
        os.makedirs(folder)
    LOGGER.log_file_path = folder + "/" + str(os.path.basename(__file__)) + '_' + job_no + ".log"
    LOGGER.log_args()
    LOGGER.log_message(get_file_hexdigest(__file__), label="Hex digest of script.".ljust(17))
    LOGGER.log_message('Name = ' + ensembldb3.__name__ + ', version = ' + ensembldb3.__version__,
                       label="Imported module".ljust(30))
    account = HostAccount(*os.environ['ENSEMBL_ACCOUNT'].split())
    dupl_introns, intron_count, sequence_length = 0, 0, 0
    intron_list = list()  # the original assigned intron_list twice and created unused lists
    genome = Genome(species, release=release, account=account, pool_recycle=3600)
    genes = genome.get_features(coord_name=coord_name, start=start, end=end, feature_types='gene')
    intron_sequence = 'X'
    for gene in genes:
        if gene.canonical_transcript.introns is None:
            continue
        for intron in gene.canonical_transcript.introns:
            if intron in intron_list:
                dupl_introns += 1
                continue
            intron_list.append(intron)
            intron_count += 1
            sequence_length += len(intron)
            intron_sequence = intron_sequence + 'XXXXXXXXXX' + str(intron.seq)
    outfile_name = folder + '/intronic_sequence' + species + job_no + '.pklz'
    with gzip.open(outfile_name, 'wb') as outfile:
        pickle.dump(intron_sequence, outfile)
    outfile = open(outfile_name, 'r')
    LOGGER.output_file(outfile.name)
    outfile.close()
    LOGGER.log_message(str(dupl_introns), label='Number of duplicate introns rejected'.ljust(30))
    LOGGER.log_message(str(intron_count), label='Number of introns processed'.ljust(30))
    LOGGER.log_message(str(sequence_length), label='Total intron length'.ljust(30))
    duration = time() - start_time
    LOGGER.log_message("%.2f" % (duration / 60.), label="run duration (minutes)".ljust(30))
def main(job_no, infile_root, suffixes, dir):
    start_time = time()
    if not os.path.exists(dir):
        os.makedirs(dir)
    LOGGER.log_file_path = dir + "/" + str(os.path.basename(__file__)) + '_' + job_no + ".log"
    LOGGER.log_args()
    LOGGER.log_message(get_file_hexdigest(__file__), label="Hex digest of script.".ljust(17))
    try:
        LOGGER.log_message(str(os.environ['CONDA_DEFAULT_ENV']), label="Conda environment.".ljust(17))
    except KeyError:
        pass
    LOGGER.log_message('Name = ' + np.__name__ + ', version = ' + np.__version__,
                       label="Imported module".ljust(30))
    LOGGER.log_message('Name = ' + pd.__name__ + ', version = ' + pd.__version__,
                       label="Imported module".ljust(30))
    file_suffixes = suffixes.split(',')
    total_dict = defaultdict(int)
    counts = list()
    for c in file_suffixes:
        filename = dir + '/' + infile_root + c + '.pklz'
        infile = open(filename, 'r')
        LOGGER.input_file(infile.name)
        infile.close()
        with gzip.open(filename, 'rb') as infile:
            chrdict = pickle.load(infile)
        counts.append([c, sum(chrdict.values())])
        for k in chrdict.keys():
            total_dict[k] += chrdict[k]
    counts = pd.DataFrame.from_records(counts)
    outfile_name = dir + '/' + 'merged_context_data_' + job_no + '.pklz'
    with gzip.open(outfile_name, 'wb') as outfile:
        pickle.dump(total_dict, outfile)
    outfile = open(outfile_name, 'r')
    LOGGER.output_file(outfile.name)
    outfile.close()
    fname = dir + '/' + 'merge_counts_' + job_no + '.csv'
    counts.to_csv(fname)
    outfile = open(fname, 'r')
    LOGGER.output_file(outfile.name)
    outfile.close()
    duration = time() - start_time
    LOGGER.log_message("%.2f" % (duration / 60.), label="Run duration (minutes)".ljust(50))
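# The accumulation loop above is plain dictionary addition; the same idea in
# miniature, with toy per-chromosome context counts:
from collections import defaultdict

chrom_dicts = [{'AAA': 2, 'AAC': 1}, {'AAA': 3, 'GGG': 5}]
total = defaultdict(int)
for d in chrom_dicts:
    for k, v in d.items():
        total[k] += v
assert total == {'AAA': 5, 'AAC': 1, 'GGG': 5}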
def test_read_from_written():
    """create files with different line endings dynamically"""
    text = "abcdeENDedfguENDyhbnd"
    with TemporaryDirectory(dir=TEST_ROOTDIR) as dirname:
        for ex, lf in (
            ("f06597f8a983dfc93744192b505a8af9", "\n"),
            ("39db5cc2f7749f02e0c712a3ece12ffc", "\r\n"),
        ):
            p = Path(dirname) / "test.txt"
            data = text.replace("END", lf)
            p.write_bytes(data.encode("utf-8"))
            expect = get_text_hexdigest(data)
            assert expect == ex, (expect, ex)
            got = get_file_hexdigest(p)
            assert got == expect, f"FAILED: {repr(lf)}, {(ex, got)}"
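# Why the test pins a separate digest per line ending: the raw bytes differ,
# so the hash differs. A plain-hashlib illustration (this assumes the
# hexdigest helpers are MD5-based, which the 32-character digests suggest):
import hashlib

unix = "abcde\nedfgu\nyhbnd".encode("utf-8")
dos = "abcde\r\nedfgu\r\nyhbnd".encode("utf-8")
assert hashlib.md5(unix).hexdigest() != hashlib.md5(dos).hexdigest()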
def main(job_no, folder, fname, dir):
    start_time = time()
    if not os.path.exists(dir):
        os.makedirs(dir)
    LOGGER.log_file_path = dir + "/" + str(os.path.basename(__file__)) + job_no + ".log"  # change
    LOGGER.log_args()
    LOGGER.log_message(get_file_hexdigest(__file__), label="Hex digest of script.".ljust(17))
    LOGGER.log_message('Name = ' + np.__name__ + ', version = ' + np.__version__, label="Imported module ")
    LOGGER.log_message('Name = ' + pd.__name__ + ', version = ' + pd.__version__, label="Imported module ")
    csv_filename = '/male_noncarrier.rmap'
    male = pd.read_csv(folder + csv_filename, sep='\t')
    male = male.sort_values(['chr', 'pos'])
    csv_filename = '/female_noncarrier.rmap'
    female = pd.read_csv(folder + csv_filename, sep='\t')
    female = female.sort_values(['chr', 'pos'])
    for chrom in range(1, 23):
        chrom = str(chrom)
        m = male[male['chr'] == 'chr' + chrom]
        f = female[female['chr'] == 'chr' + chrom]
        csv_name = dir + '/' + fname + chrom + '.csv'
        infile = open(csv_name, 'r')
        LOGGER.input_file(infile.name)
        infile.close()
        table = pd.read_csv(csv_name, sep=',', index_col=0)
        # Note: we are actually matching on hg37 coordinates used by deCODE.
        xtable = table.merge(m, 'left', ['chr', 'pos'], sort=True,
                             suffixes=['_sexav', '_male'], indicator='indicm', validate='1:1')
        xtable.rename(columns={"stdrate_sexav": "stdrate", "seqbin_sexav": "seqbin"}, inplace=True)
        xtable = xtable.merge(f, 'left', ['chr', 'pos'], sort=True,
                              suffixes=['_sexav', '_female'], indicator='indicf', validate='1:1')
        assert np.all(xtable['indicm'] == 'both'), 'Merge error with male.'
        assert np.all(xtable['indicf'] == 'both'), 'Merge error with female.'
        csv_filename = 'Recombination_data/recomb_table_all_sexes_ch' + chrom + '.csv'
        xtable.to_csv(csv_filename, sep=',')
        outfile = open(csv_filename, 'r')
        LOGGER.output_file(outfile.name)
        outfile.close()
        print(xtable.head())
    duration = time() - start_time
    LOGGER.log_message("%.2f" % (duration / 60.), label="run duration (minutes)".ljust(30))
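# A minimal illustration of the merge checks used above: `indicator` adds a
# column recording each row's provenance, and `validate='1:1'` raises
# MergeError if the join keys repeat on either side.
import pandas as pd

left = pd.DataFrame({'chr': ['chr1', 'chr1'], 'pos': [100, 200], 'x': [1, 2]})
right = pd.DataFrame({'chr': ['chr1', 'chr1'], 'pos': [100, 200], 'y': [3, 4]})
merged = left.merge(right, 'left', ['chr', 'pos'], indicator='indic', validate='1:1')
assert (merged['indic'] == 'both').all()  # every left row found a partner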
def main_core(job_no, filename=None, n_jobs=5, context_size=9, dir='data'):
    # global sequence
    if not os.path.exists(dir):
        os.makedirs(dir)
    LOGGER.log_file_path = dir + "/" + str(os.path.basename(__file__)) + '_' + job_no + ".log"
    start_time = time()
    LOGGER.log_args()
    LOGGER.log_message(get_file_hexdigest(__file__), label="Hex digest of script.".ljust(30))
    LOGGER.log_message('Name = ' + re.__name__ + ', version = ' + re.__version__,
                       label="Imported module".ljust(30))
    infile = open(filename, 'r')
    LOGGER.input_file(infile.name)
    infile.close()
    with gzip.open(filename, 'rb') as infile:
        sequence = pickle.load(infile)  # i.e. intron sequence
    context_size = int(context_size)
    contexts_generator = itertools.product('ACGT', repeat=context_size)
    contexts = tuple(''.join(context) for context in contexts_generator)
    concounts = list()
    for context in contexts:
        concount = count_single_context(context, sequence)
        concounts.append(concount)
    # concounts = Parallel(n_jobs=n_jobs)(delayed(count_single_context)(context) for context in contexts)
    context_dict = dict(zip(contexts, concounts))
    outfile_name = dir + '/context_dict_' + job_no + '.pklz'
    with gzip.open(outfile_name, 'wb') as outfile:
        pickle.dump(context_dict, outfile)
    outfile = open(outfile_name, 'r')
    LOGGER.output_file(outfile.name)
    outfile.close()
    LOGGER.log_message(str(len(context_dict.keys())), label="Number of dictionary keys".ljust(30))
    LOGGER.log_message(str(sum(context_dict.values())), label="Count of contexts".ljust(30))
    duration = time() - start_time
    LOGGER.log_message("%.2f" % (duration / 60.), label="run duration (minutes)".ljust(30))
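# count_single_context is defined elsewhere in this repo; a plausible sketch
# (an assumption, not the repo's code) is an overlap-aware k-mer count, which
# would explain the `re` import: a lookahead regex finds overlapping hits
# that str.count would miss.
import re

def count_single_context(context, sequence):
    """Count occurrences of `context` in `sequence`, including overlaps."""
    return len(re.findall('(?=' + context + ')', sequence))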
def main(job_no, coord_name, start, end, species, release, var_set_id, dir):
    start_time = time()
    if not os.path.exists(dir):
        os.makedirs(dir)
    LOGGER.log_file_path = dir + "/" + str(os.path.basename(__file__)) + '_' + job_no + ".log"
    LOGGER.log_args()
    LOGGER.log_message(get_file_hexdigest(__file__), label="Hex digest of script.".ljust(17))
    LOGGER.log_message('Name = ' + ensembldb3.__name__ + ', version = ' + ensembldb3.__version__,
                       label="Imported module".ljust(30))
    LOGGER.log_message('Name = ' + sqlalchemy.__name__ + ', version = ' + sqlalchemy.__version__,
                       label="Imported module".ljust(30))
    account = HostAccount(*os.environ['ENSEMBL_ACCOUNT'].split())
    genome = Genome(species, release=release, account=account, pool_recycle=3600)
    confirm_variation_set(genome, var_set_id)
    var_locations = get_variant_details(genome, coord_name, start, end)
    LOGGER.log_message(str(len(var_locations)), label='Length of var_locations list'.ljust(30))
    outfile_name = dir + '/intergenic_variants_' + species + '_' + job_no + '.pklz'
    with gzip.open(outfile_name, 'wb') as outfile:
        pickle.dump(var_locations, outfile)
    outfile = open(outfile_name, 'r')
    LOGGER.output_file(outfile.name)
    outfile.close()
    duration = time() - start_time
    LOGGER.log_message("%.2f" % (duration / 60.), label="run duration (minutes)".ljust(30))
def main(job_no, sex, chroms, rank, dir):
    start_time = time()
    if not os.path.exists(dir):
        os.makedirs(dir)
    LOGGER.log_file_path = dir + "/" + str(os.path.basename(__file__)) + job_no + ".log"  # change
    LOGGER.log_args()
    LOGGER.log_message(get_file_hexdigest(__file__), label="Hex digest of script.".ljust(17))
    try:
        LOGGER.log_message(str(os.environ['CONDA_DEFAULT_ENV']), label="Conda environment.".ljust(17))
    except KeyError:
        pass
    LOGGER.log_message('Name = ' + np.__name__ + ', version = ' + np.__version__, label="Imported module ")
    LOGGER.log_message('Name = ' + pd.__name__ + ', version = ' + pd.__version__, label="Imported module ")
    LOGGER.log_message('Name = ' + scipy.__name__ + ', version = ' + scipy.__version__, label="Imported module ")
    LOGGER.log_message('Name = ' + statsmodels.__name__ + ', version = ' + statsmodels.__version__, label="Imported module ")
    LOGGER.log_message('Name = ' + sklearn.__name__ + ', version = ' + sklearn.__version__, label="Imported module ")
    pd.set_option('display.max_columns', None)
    if chroms is None:
        chroms = np.arange(1, 23).astype(str).tolist()
    else:
        chroms = chroms.split(',')
    if rank:
        LOGGER.log_message("%1d" % rank, label="Rank of model to select (best=0).".ljust(30))
    for chrom in chroms:
        csv_name = dir + '/recomb_table_all_sexes_ch' + chrom + '.csv'
        infile = open(csv_name, 'r')
        LOGGER.input_file(infile.name)
        infile.close()
        data_table = pd.read_csv(csv_name, sep=',', index_col=0)
        data_table = recombination.correct_missing_data(data_table, 'LOCF', sex)
        std_col = 'stdrate_' + sex
        std_rates = data_table[std_col].values
        variants_profiled = data_table.iloc[:, np.arange(5, 17)]
        variant_counts = variants_profiled.sum(axis=1)
        var_rates = variant_counts / 10000
        print('\n\nChromosome number = ' + chrom)
        print('Avge. mutation rate = ', np.mean(var_rates))
        xvals = std_rates.reshape(-1, 1)
        lmodel = LinearRegression()
        lmodel.fit(xvals, var_rates)
        residuals = var_rates - lmodel.predict(xvals)
        sys.stdout.flush()
        print('Slope, intercept, mean of residuals = ', '%.8f' % lmodel.coef_[0],
              '%.8f' % lmodel.intercept_, '%.12f' % np.mean(residuals))
        orders = recombination.evaluate_ARMA_models(residuals, 10, 4)
        best_order = orders[rank]
        best_mdl = smt.ARMA(residuals, order=best_order).fit(method='mle', trend='nc', disp=0)
        print(best_mdl.summary())
        outfile_name = dir + '/ARMA_model_ch' + chrom + '_' + job_no + '.pklz'
        recombination.save_model_details(best_mdl, outfile_name)
        outfile = open(outfile_name, 'r')
        LOGGER.output_file(outfile.name)
        outfile.close()
    duration = time() - start_time
    LOGGER.log_message("%.2f" % (duration / 60.), label="run duration (minutes)".ljust(30))
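# recombination.evaluate_ARMA_models is this repo's helper; the sketch below
# is a guess at its shape (the body and ranking criterion are assumptions):
# fit ARMA(p, q) over a grid and return the orders ranked by AIC, best first,
# so that orders[rank] above selects the rank-th best model.
import numpy as np
import statsmodels.tsa.api as smt

def evaluate_ARMA_models(residuals, max_p, max_q):
    scored = []
    for p in range(max_p + 1):
        for q in range(max_q + 1):
            if p == 0 and q == 0:
                continue
            try:
                mdl = smt.ARMA(residuals, order=(p, q)).fit(method='mle', trend='nc', disp=0)
                scored.append((mdl.aic, (p, q)))
            except (ValueError, np.linalg.LinAlgError):
                continue  # some orders fail to converge; skip them
    return [order for _, order in sorted(scored)]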
def main_core(job_no, species, varfile_name=None, intronfile_name=None, release=89, n_jobs=5, dir='data'):
    global genome
    if not os.path.exists(dir):
        os.makedirs(dir)
    LOGGER.log_file_path = dir + "/" + str(os.path.basename(__file__)) + '_' + job_no + ".log"
    start_time = time()
    LOGGER.log_args()
    LOGGER.log_message(get_file_hexdigest(__file__), label="Hex digest of script.".ljust(25))
    LOGGER.log_message('Name = ' + numpy.__name__ + ', version = ' + numpy.__version__,
                       label="Imported module".ljust(25))
    LOGGER.log_message('Name = ' + cogent3.__name__ + ', version = ' + cogent3.__version__,
                       label="Imported module".ljust(25))
    LOGGER.log_message('Name = ' + ensembldb3.__name__ + ', version = ' + ensembldb3.__version__,
                       label="Imported module".ljust(25))
    account = HostAccount(*os.environ['ENSEMBL_ACCOUNT'].split())
    genome = Genome(species, release=release, account=account, pool_recycle=3600)
    human_seq_region_dict = {
        '1': 131550, '2': 131545, '3': 131551, '4': 131552, '5': 131542,
        '6': 131555, '7': 131559, '8': 131560, '9': 131540, '10': 131544,
        '11': 131556, '12': 131546, '13': 131541, '14': 131547, '15': 131558,
        '16': 131549, '17': 131554, '18': 131548, '19': 131537, '20': 131538,
        '21': 131543, '22': 131557, 'X': 131539, 'Y': 131553}
    chimp_seq_region_dict = {
        '21': 212405, '7': 212407, '15': 212409, '16': 212395, '1': 212403,
        '17': 212411, '18': 212410, '19': 212394, '20': 212404, '22': 212390,
        '3': 212392, '4': 212393, '5': 212391, '6': 212388, '8': 212397,
        '9': 212396, '10': 212387, '11': 212389, '12': 212402, '13': 212408,
        '14': 212401, 'Y': 212406, 'X': 212399}
    if species == 'human':
        coord_dict = dict([(v, k) for k, v in human_seq_region_dict.items()])
        tag = 'human'
    elif species == 'chimp':
        coord_dict = dict([(v, k) for k, v in chimp_seq_region_dict.items()])
        tag = 'spec_'
    else:
        assert False, 'Unknown species: ' + species
    if varfile_name is None:
        varfile_name = dir + '/var_locations_' + tag + job_no + '.pklz'
    infile = open(varfile_name, 'r')
    LOGGER.input_file(infile.name)
    infile.close()
    with gzip.open(varfile_name, 'rb') as infile:
        var_details = pickle.load(infile)
    LOGGER.log_message(str(len(var_details)), label="Number of variants read".ljust(25))
    if intronfile_name is None:
        intronfile_name = dir + '/all_locations_' + tag + job_no + '.pklz'
    infile = open(intronfile_name, 'r')
    LOGGER.input_file(infile.name)
    infile.close()
    with gzip.open(intronfile_name, 'rb') as infile:
        intron_locs = pickle.load(infile)
    LOGGER.log_message(str(len(intron_locs)), label="Number of introns read".ljust(25))
    # (The original loaded intron_locs from the same file twice; once is enough.)
    var_details, var_locs_reversed = check_variant_strand(var_details, intron_locs)
    # var_details fields are: (variant name, seq region id, location, ancestral_allele, derived_allele)
    item_list = Parallel(n_jobs=n_jobs)(delayed(get_contexts)(var, coord_dict) for var in var_details)
    var_count_dict = Counter(item_list)
    del var_count_dict[None]
    outfile_name = dir + '/var_dict_' + tag + job_no + '.pklz'
    with gzip.open(outfile_name, 'wb') as outfile:
        pickle.dump(var_count_dict, outfile)
    outfile = open(outfile_name, 'r')
    LOGGER.output_file(outfile.name)
    outfile.close()
    duration = time() - start_time
    LOGGER.log_message("%.2f" % (duration / 60.), label="run duration (minutes)".ljust(25))
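# get_contexts is this repo's worker function; the sketch below is only a
# guess at what it computes (field order follows the comment in main_core;
# the flank width and return format are assumptions). It relies on the
# module-global `genome` so that joblib workers reuse the connection.
def get_contexts(var, coord_dict):
    name, seq_region_id, location, ancestral, derived = var
    coord_name = coord_dict.get(seq_region_id)
    if coord_name is None:
        return None
    # one flanking base either side of the variant site
    region = genome.get_region(coord_name=coord_name, start=location - 1, end=location + 2)
    context = str(region.seq)
    if len(context) != 3 or context[1] != ancestral:
        return None
    return context + '->' + derived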
def main(job_no, coord_name, start, end, species, release, var_set_id, filter, dir):
    start_time = time()
    if not os.path.exists(dir):
        os.makedirs(dir)
    LOGGER.log_file_path = dir + "/" + str(os.path.basename(__file__)) + '_' + job_no + ".log"
    LOGGER.log_args()
    LOGGER.log_message(get_file_hexdigest(__file__), label="Hex digest of script.".ljust(17))
    LOGGER.log_message('Name = ' + ensembldb3.__name__ + ', version = ' + ensembldb3.__version__,
                       label="Imported module".ljust(30))
    LOGGER.log_message('Name = ' + sqlalchemy.__name__ + ', version = ' + sqlalchemy.__version__,
                       label="Imported module".ljust(30))
    account = HostAccount(*os.environ['ENSEMBL_ACCOUNT'].split())
    var_locations_list, location_list = list(), list()
    dupl_introns, intron_count, bad_var_count, sequence_length = 0, 0, 0, 0
    intron_list = list()  # the original assigned intron_list twice and created unused lists
    genome = Genome(species, release=release, account=account, pool_recycle=3600)
    confirm_variation_set(genome, var_set_id)
    genes = genome.get_features(coord_name=coord_name, start=start, end=end, feature_types='gene')
    for gene in genes:
        if gene.canonical_transcript.introns is None:
            continue
        for intron in gene.canonical_transcript.introns:
            if intron in intron_list:
                dupl_introns += 1
                continue
            intron_list.append(intron)
            intron_length = len(intron)
            intron_count += 1
            sequence_length += intron_length
            loc = intron.location
            location_list.append((str(loc.coord_name), loc.start, loc.end, loc.strand))  # loc.coord_name is a db3util object
            var_locations, bad_var_num = get_variant_details(genome, species, intron, filter)
            var_locations_list = var_locations_list + var_locations
            bad_var_count += bad_var_num
    LOGGER.log_message(str(dupl_introns), label='Number of duplicate introns rejected'.ljust(30))
    LOGGER.log_message(str(intron_count), label='Number of introns processed'.ljust(30))
    if species == 'human':
        LOGGER.log_message(str(bad_var_count), label='Number of rejected variants'.ljust(30))
    LOGGER.log_message(str(sequence_length), label='Sequence length'.ljust(30))
    LOGGER.log_message(str(len(var_locations_list)), label='Length of var_locations list'.ljust(30))
    LOGGER.log_message(str(len(var_locations_list) / sequence_length), label='Average SNV rate'.ljust(30))
    outfile_name = dir + '/var_locations_' + species + job_no + '.pklz'
    with gzip.open(outfile_name, 'wb') as outfile:
        pickle.dump(var_locations_list, outfile)
    outfile = open(outfile_name, 'r')
    LOGGER.output_file(outfile.name)
    outfile.close()
    outfile_name = dir + '/all_locations_' + species + job_no + '.pklz'
    with gzip.open(outfile_name, 'wb') as outfile:
        pickle.dump(location_list, outfile)
    outfile = open(outfile_name, 'r')
    LOGGER.output_file(outfile.name)
    outfile.close()
    duration = time() - start_time
    LOGGER.log_message("%.2f" % (duration / 60.), label="run duration (minutes)".ljust(30))
def main(job_no, chroms, draws, sex, suffixes, folder):
    start_time = time()
    if not os.path.exists(folder):
        os.makedirs(folder)
    LOGGER.log_file_path = folder + "/" + str(os.path.basename(__file__)) + job_no + ".log"  # change
    LOGGER.log_args()
    LOGGER.log_message(get_file_hexdigest(__file__), label="Hex digest of script.".ljust(17))
    try:
        LOGGER.log_message(str(os.environ['CONDA_DEFAULT_ENV']), label="Conda environment.".ljust(17))
    except KeyError:
        pass
    LOGGER.log_message('Name = ' + np.__name__ + ', version = ' + np.__version__, label="Imported module ")
    LOGGER.log_message('Name = ' + pd.__name__ + ', version = ' + pd.__version__, label="Imported module ")
    LOGGER.log_message('Name = ' + theano.__name__ + ', version = ' + theano.__version__, label="Imported module ")
    LOGGER.log_message('Name = ' + pymc3.__name__ + ', version = ' + pymc3.__version__, label="Imported module ")
    # Mutation rates per chromosome are calculated from data in Jonsson et al., 'Parental influence on human
    # germline de novo mutations in 1,548 trios from Iceland' ('Estimate mutation rates from Jonsson data.ipynb').
    mrates = [1.1045541764661985e-08, 1.2481509352581898e-08, 1.254443516411994e-08, 1.2609734521720365e-08,
              1.216379148788216e-08, 1.2228991967962778e-08, 1.2298304077726808e-08, 1.3325693328599174e-08,
              1.0711369887343474e-08, 1.238059175011868e-08, 1.2241940318060874e-08, 1.2117457093135447e-08,
              1.0174746106096945e-08, 1.0146311894484388e-08, 1.0516600482736078e-08, 1.2597261162425896e-08,
              1.1681529656302903e-08, 1.1855256275211491e-08, 1.214570124735936e-08, 1.1756514975959873e-08,
              8.965863348091259e-09, 9.024242643357694e-09]
    result_rows = list()
    columns = ['snvdens', 'p', 'q', 'alpha', 'alpha25', 'alpha975', 'beta', 'beta25', 'beta975',
               'slopem', 'slopem25', 'slopem75', 'pval', 'r2', 'variance', 'variance25', 'variance975',
               'mutprop', 'mutprop25', 'mutprop975', 'mutperco', 'mutperco25', 'mutperco975']
    if chroms is None:
        chroms = np.arange(1, 23).astype(str).tolist()
    else:
        chroms = chroms.split(',')
    for chromplace, chrom in enumerate(chroms):
        print(chromplace, 'Chromosome ', chrom)
    if suffixes is None:
        suffixes = len(chroms) * [""]
    else:
        suffixes = suffixes.split(',')
    results = list()
    for i, chrom in enumerate(chroms):
        csv_filename = folder + '/recomb_table_all_sexes_ch' + chrom + '.csv'
        infile = open(csv_filename, 'r')
        LOGGER.input_file(infile.name)
        infile.close()
        data_table = pd.read_csv(csv_filename, sep=',', index_col=0)
        data_table = recombination.correct_missing_data(data_table, 'LOCF', sex)
        variants_profiled = data_table.iloc[:, np.arange(5, 17)]
        variant_counts = variants_profiled.sum(axis=1)
        var_rates = variant_counts / 10000
        std_col = 'stdrate_' + sex
        std_rates = data_table[std_col].values
        print('Avge. & var. of mutation rate ', np.mean(var_rates), np.var(var_rates))
        suffix = suffixes[i]
        file_name = folder + '/ARMA_model_ch' + str(chrom) + suffix + '.pklz'
        infile = open(file_name, 'r')
        LOGGER.input_file(infile.name)
        infile.close()
        with gzip.open(file_name, 'rb') as infile:
            model_details = pickle.load(infile)
        trace = recombination.run_MCMC_ARMApq(std_rates, var_rates, draws, model_details)
        p = model_details['order'][0]
        q = model_details['order'][1]
        neg_slope = sum(t['beta'] <= 0 for t in trace)
        print('Chrom', chrom, 'variates with slope <=0 ', neg_slope)
        print('Chrom', chrom, 'probability slope <=0 = ', neg_slope / draws)
        ss_tot = np.var(var_rates)
        # as an array, so the arithmetic below broadcasts (the original used a plain list)
        ss_res_vars = np.array([np.var(var_rates - t['alpha'] - t['beta'] * std_rates) for t in trace])
        r2_vars = 1 - ss_res_vars / ss_tot
        variance_variates = ss_tot - ss_res_vars
        vmean = np.mean(variance_variates)
        vlow = np.percentile(variance_variates, 2.5)
        vhigh = np.percentile(variance_variates, 97.5)
        chr_results = pd.DataFrame(variance_variates, columns=['vars'])
        chr_results['chr'] = chrom
        results.append(chr_results)
        snv_dens = np.mean(var_rates)
        intercept_mean = np.mean([t['alpha'] for t in trace])
        intercept_CI_low = np.percentile([t['alpha'] for t in trace], 2.5)
        intercept_CI_high = np.percentile([t['alpha'] for t in trace], 97.5)
        rfunc = lambda x: (snv_dens - x) / snv_dens
        mfunc = lambda x: x * mutation_rate / (snv_dens * 0.0116)
        print('Proportion muts due to recomb = ', rfunc(intercept_mean),
              'CIs = ', rfunc(intercept_CI_high), rfunc(intercept_CI_low))
        recomb_rate = np.mean(std_rates) * 0.0116 / (100 * 1e4)
        mutation_rate = mrates[i]
        mutsper = (rfunc(intercept_mean) * mutation_rate) / recomb_rate
        print('Mutations per CO = ', mutsper)
        sys.stdout.flush()
        s = summary(trace, varnames=['alpha', 'beta'])
        result_row = [np.mean(var_rates), p, q,
                      s.loc['alpha', 'mean'], s.loc['alpha', 'hpd_2.5'], s.loc['alpha', 'hpd_97.5'],
                      s.loc['beta', 'mean'], s.loc['beta', 'hpd_2.5'], s.loc['beta', 'hpd_97.5'],
                      mfunc(s.loc['beta', 'mean']), mfunc(s.loc['beta', 'hpd_2.5']), mfunc(s.loc['beta', 'hpd_97.5']),
                      neg_slope / draws, np.mean(r2_vars), vmean, vlow, vhigh,
                      rfunc(intercept_mean), rfunc(intercept_CI_low), rfunc(intercept_CI_high),
                      mutsper,
                      (rfunc(intercept_CI_high) * mutation_rate) / recomb_rate,
                      (rfunc(intercept_CI_low) * mutation_rate) / recomb_rate]
        result_rows.append(result_row)
    results_table = pd.DataFrame(result_rows, columns=columns)
    outfile_name = folder + '/ARMApq_results_' + job_no + '.csv'
    results_table.to_csv(outfile_name)
    outfile = open(outfile_name, 'r')
    LOGGER.output_file(outfile.name)
    outfile.close()
    results = pd.concat(results)
    outfile_name = folder + '/ARMApq_variates_' + sex + '_' + job_no + '.pklz'
    with gzip.open(outfile_name, 'wb') as outfile:
        pickle.dump(results, outfile)
    outfile = open(outfile_name, 'r')
    LOGGER.output_file(outfile.name)
    outfile.close()
    duration = time() - start_time
    LOGGER.log_message("%.2f" % (duration / 60.), label="run duration (minutes)".ljust(30))
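# recombination.run_MCMC_ARMApq is this repo's wrapper; a minimal pymc3
# sketch of the core regression it is assumed to sample (iid noise shown
# here in place of the full ARMA(p, q) error structure):
import pymc3 as pm

def run_linear_mcmc(x, y, draws):
    """Sample alpha (intercept) and beta (slope) for y = alpha + beta * x."""
    with pm.Model():
        alpha = pm.Normal('alpha', mu=0., sd=10.)
        beta = pm.Normal('beta', mu=0., sd=10.)
        sigma = pm.HalfCauchy('sigma', beta=1.)
        pm.Normal('obs', mu=alpha + beta * x, sd=sigma, observed=y)
        return pm.sample(draws, progressbar=False)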
def main(job_no, suffix, dir):
    start_time = time()
    if not os.path.exists(dir):
        os.makedirs(dir)
    LOGGER.log_file_path = dir + "/" + str(os.path.basename(__file__)) + '_' + job_no + ".log"
    LOGGER.log_args()
    LOGGER.log_message(get_file_hexdigest(__file__), label="Hex digest of script.".ljust(17))
    try:
        LOGGER.log_message(str(os.environ['CONDA_DEFAULT_ENV']), label="Conda environment.".ljust(17))
    except KeyError:
        pass
    LOGGER.log_message('Name = ' + np.__name__ + ', version = ' + np.__version__,
                       label="Imported module".ljust(30))
    LOGGER.log_message('Name = ' + pd.__name__ + ', version = ' + pd.__version__,
                       label="Imported module".ljust(30))
    LOGGER.log_message('Name = ' + statsmodels.__name__ + ', version = ' + statsmodels.__version__,
                       label="Imported module".ljust(30))
    result = pd.DataFrame(columns=['kmer', 'variance', 'Marginalise over central base?'])
    # Find variance due to CpG.
    filename = dir + '/var_counts_1' + suffix + '.pklz'
    infile = open(filename, 'r')
    LOGGER.input_file(infile.name)
    infile.close()
    with gzip.open(filename, 'rb') as infile:
        var_counts = pickle.load(infile)
    filename = dir + '/context_counts_1' + suffix + '.pklz'
    infile = open(filename, 'r')
    LOGGER.input_file(infile.name)
    infile.close()
    with gzip.open(filename, 'rb') as infile:
        context_counts = pickle.load(infile)
    cpg_contexts = (context_counts.loc['CG', 'C'] + context_counts.loc['TG', 'C'] +
                    context_counts.loc['AG', 'C'] + context_counts.loc['GG', 'C'] +
                    context_counts.loc['CC', 'G'] + context_counts.loc['CT', 'G'] +
                    context_counts.loc['CA', 'G'] + context_counts.loc['CG', 'G'])
    CpG_ratio = cpg_contexts / context_counts.values.sum()
    non_cpg_contexts = context_counts.values.sum() - cpg_contexts
    print('Total CpG sites       : ', cpg_contexts)
    print('Total intronic sites  : ', context_counts.values.sum())
    print('Proportion CpG sites  : ', CpG_ratio)
    var_counts['C'] = var_counts['C->T'] + var_counts['C->A'] + var_counts['C->G']
    var_counts['G'] = var_counts['G->T'] + var_counts['G->A'] + var_counts['G->C']
    CpG_vars = (var_counts.loc['CG', 'C'] + var_counts.loc['TG', 'C'] +
                var_counts.loc['AG', 'C'] + var_counts.loc['GG', 'C'] +
                var_counts.loc['CC', 'G'] + var_counts.loc['CT', 'G'] +
                var_counts.loc['CA', 'G'] + var_counts.loc['CG', 'G'])
    print('Total CpG variants    : ', CpG_vars)
    non_CpG_vars = var_counts.values.sum() - CpG_vars
    m1 = CpG_vars / cpg_contexts
    m0 = non_CpG_vars / non_cpg_contexts
    m_ave = var_counts.values.sum() / context_counts.values.sum()
    print('SNV density at CpG sites  : ', m1)
    print('SNV density at other sites: ', m0)
    print('Average SNV density       : ', m_ave)
    t1 = CpG_ratio * (m1 - m_ave) ** 2
    t2 = (1 - CpG_ratio) * (m0 - m_ave) ** 2
    print('Variance due to CpG sites : ', t1 + t2)
    LOGGER.log_message("%.2e" % (t1 + t2), label="Variance due to CpG".ljust(50))
    # Deal with the 1-mer case.
    var_counts['C'] = var_counts['C->T'] + var_counts['C->A'] + var_counts['C->G']
    var_counts['T'] = var_counts['T->C'] + var_counts['T->A'] + var_counts['T->G']
    var_counts['A'] = var_counts['A->T'] + var_counts['A->C'] + var_counts['A->G']
    var_counts['G'] = var_counts['G->T'] + var_counts['G->A'] + var_counts['G->C']
    variant_counts = var_counts.sum(axis=0)
    variant_counts = variant_counts[variant_counts.index.isin(['C', 'T', 'A', 'G'])]
    con_counts = context_counts.sum(axis=0)
    mut_rates = variant_counts / con_counts
    w = DescrStatsW(mut_rates, weights=con_counts, ddof=0)
    row = pd.Series(np.array([1, w.var, 'no']), index=result.columns, name=0)
    result = result.append(row)
    row = pd.Series(np.array([1, 0.0, 'yes']), index=result.columns, name=1)
    result = result.append(row)
    i = 2
    for kmer_variable in [1, 2, 3]:
        filename = dir + '/var_counts_' + str(kmer_variable) + suffix + '.pklz'
        infile = open(filename, 'r')
        LOGGER.input_file(infile.name)
        infile.close()
        with gzip.open(filename, 'rb') as infile:
            var_counts = pickle.load(infile)
        filename = dir + '/context_counts_' + str(kmer_variable) + suffix + '.pklz'
        infile = open(filename, 'r')
        LOGGER.input_file(infile.name)
        infile.close()
        with gzip.open(filename, 'rb') as infile:
            context_counts = pickle.load(infile)
        # Reformat context counts by repeating columns to match the snv_densities dataframe.
        extended_context_counts, cols = probpoly_bayes.reformat_context_counts(context_counts, var_counts)
        extended_context_counts.set_index(context_counts.index, inplace=True)
        snv_densities = var_counts / extended_context_counts
        # Calculate variance across mutation types, marginalising over the central base.
        context_ratios = context_counts.div(context_counts.sum(axis=1), axis=0)
        extended_context_ratios, cols = probpoly_bayes.reformat_context_counts(context_ratios, snv_densities)
        extended_context_ratios.set_index(context_ratios.index, inplace=True)
        con_weighted = (snv_densities * extended_context_ratios).sum(axis=1)
        u = DescrStatsW(con_weighted, weights=context_counts.sum(axis=1), ddof=0)
        print('Marginalised variance due to ' + str(2 * kmer_variable + 1) + '-mers = ', u.var)
        row = pd.Series(np.array([2 * kmer_variable + 1, u.var, 'yes']), index=result.columns, name=i)
        result = result.append(row)
        i += 1
        # Calculate variance conditioned on kmer, not marginalising over the central base.
        # Firstly we reorganise the SNV densities table so that rows correspond to kmers
        # (including the central base) and columns correspond to the derived base.
        contexts_generator = product('ACGT', repeat=2 * kmer_variable + 1)
        contexts = tuple(''.join(context) for context in contexts_generator)
        kmer_densities = np.zeros((len(contexts), 4))
        kmer_densities = pd.DataFrame(kmer_densities, index=contexts, columns=['C', 'T', 'A', 'G'])
        for context in snv_densities.index:
            for mut in snv_densities.columns:
                ref = mut[0]
                derived = mut[3]
                kmer = context[0:kmer_variable] + ref + context[kmer_variable:2 * kmer_variable]
                kmer_densities.loc[kmer, derived] = snv_densities.loc[context, mut]
        # We also reorganise context counts into counts of kmers.
        kmer_counts = pd.Series(np.zeros(len(contexts)), index=contexts)
        for kmer in kmer_counts.index:
            context = kmer[0:kmer_variable] + kmer[kmer_variable + 1:2 * kmer_variable + 1]
            ref = kmer[kmer_variable]
            kmer_counts[kmer] = context_counts.loc[context, ref]
        # Calculate the weighted variance over the full kmer.
        v = DescrStatsW(kmer_densities.sum(axis=1), weights=kmer_counts, ddof=0)
        print('Unmarginalised variance due to ' + str(2 * kmer_variable + 1) + '-mers = ', v.var)
        row = pd.Series(np.array([2 * kmer_variable + 1, v.var, 'no']), index=result.columns, name=i)
        result = result.append(row)
        i += 1
    print(result)
    filename = dir + "/aggregated_results" + job_no + ".csv"
    result.to_csv(filename, sep=',')
    outfile = open(filename, 'r')
    LOGGER.output_file(outfile.name)
    outfile.close()
    duration = time() - start_time
    LOGGER.log_message("%.2f" % (duration / 60.), label="Run duration (minutes)".ljust(50))
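# The CpG block above is a two-class variance decomposition: with class
# weight w and per-class densities m1, m0, the between-class variance is
# w*(m1 - m_ave)**2 + (1 - w)*(m0 - m_ave)**2. A tiny check with made-up
# counts (illustration only, not real data):
cpg_sites, other_sites = 1000, 9000
cpg_vars, other_vars = 80, 270
w1 = cpg_sites / (cpg_sites + other_sites)                    # 0.1
m1 = cpg_vars / cpg_sites                                     # 0.08
m0 = other_vars / other_sites                                 # 0.03
m_ave = (cpg_vars + other_vars) / (cpg_sites + other_sites)   # 0.035
between = w1 * (m1 - m_ave) ** 2 + (1 - w1) * (m0 - m_ave) ** 2
print(between)  # 0.000225, equal to w1*(1 - w1)*(m1 - m0)**2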
def main(job_no, var_filename, context_filename, draws, prior, nocpg, dir):
    start_time = time()
    if not os.path.exists(dir):
        os.makedirs(dir)
    LOGGER.log_file_path = dir + "/" + str(os.path.basename(__file__)) + '_' + job_no + ".log"
    LOGGER.log_args()
    LOGGER.log_message(get_file_hexdigest(__file__), label="Hex digest of script.".ljust(17))
    try:
        LOGGER.log_message(str(os.environ['CONDA_DEFAULT_ENV']), label="Conda environment.".ljust(17))
    except KeyError:
        pass
    LOGGER.log_message('Name = ' + np.__name__ + ', version = ' + np.__version__,
                       label="Imported module".ljust(30))
    LOGGER.log_message('Name = ' + pd.__name__ + ', version = ' + pd.__version__,
                       label="Imported module".ljust(30))
    draws = int(draws)
    infile = open(var_filename, 'r')
    LOGGER.input_file(infile.name)
    infile.close()
    with gzip.open(var_filename, 'rb') as infile:
        var_data = pickle.load(infile)
    infile = open(context_filename, 'r')
    LOGGER.input_file(infile.name)
    infile.close()
    with gzip.open(context_filename, 'rb') as infile:
        context_data = pickle.load(infile)
    # for kmer_variable, split in zip([4], [1024]):
    for kmer_variable, split in zip([1, 2, 3], [1, 4, 64]):
        contexts = probpoly_bayes.unpack_all_contexts_to_dataframe(kmer_variable, context_data)
        duration = time() - start_time
        print('Unpacked all_contexts for ', kmer_variable, '-mers at', "%.2f" % (duration / 60.), 'minutes.')
        sys.stdout.flush()
        variants = probpoly_bayes.unpack_var_contexts_to_dataframe(kmer_variable, var_data)
        duration = time() - start_time
        print('Unpacked var_contexts for ', kmer_variable, '-mers at', "%.2f" % (duration / 60.), 'minutes.')
        sys.stdout.flush()
        outfile_name = dir + '/var_counts_' + str(kmer_variable) + job_no + '.pklz'
        with gzip.open(outfile_name, 'wb') as outfile:
            pickle.dump(variants, outfile)
        outfile = open(outfile_name, 'r')
        LOGGER.output_file(outfile.name)
        outfile.close()
        outfile_name = dir + '/context_counts_' + str(kmer_variable) + job_no + '.pklz'
        with gzip.open(outfile_name, 'wb') as outfile:
            pickle.dump(contexts, outfile)
        outfile = open(outfile_name, 'r')
        LOGGER.output_file(outfile.name)
        outfile.close()
        ncols = 12
        contexts, columns = probpoly_bayes.reformat_context_counts(contexts, variants)
        w_var_samples = probpoly_bayes.calculate_variances(variants, contexts, split, prior, draws, ncols, columns)
        outfile_name = 'data/bayes_var_samples_' + job_no + '_k=' + str(kmer_variable) + '.pklz'
        with gzip.open(outfile_name, 'wb') as outfile:
            pickle.dump(w_var_samples, outfile)
        outfile = open(outfile_name, 'r')
        LOGGER.output_file(outfile.name)
        outfile.close()
        del w_var_samples
    duration = time() - start_time
    LOGGER.log_message("%.2f" % (duration / 60.), label="Run duration (minutes)".ljust(50))
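# probpoly_bayes.calculate_variances is this repo's helper; the sketch below
# is a simplified guess at the kind of posterior draw involved (a Beta
# posterior per context, instead of the helper's treatment of the 12 mutation
# classes and chunked splitting): sample each context's SNV density, then
# take the context-weighted variance of the draws.
import numpy as np

def sample_weighted_variance(variant_counts, context_counts, prior, draws, seed=None):
    """Posterior draws of the across-context variance of SNV density."""
    rng = np.random.default_rng(seed)
    weights = context_counts / context_counts.sum()
    samples = np.empty(draws)
    for j in range(draws):
        dens = rng.beta(variant_counts + prior, context_counts - variant_counts + prior)
        mean = np.sum(weights * dens)
        samples[j] = np.sum(weights * (dens - mean) ** 2)
    return samples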
def main(job_no, chrom, sex, species, release, dir):
    start_time = time()
    if not os.path.exists(dir):
        os.makedirs(dir)
    LOGGER.log_file_path = dir + "/" + str(os.path.basename(__file__)) + job_no + ".log"
    LOGGER.log_args()
    LOGGER.log_message(get_file_hexdigest(__file__), label="Hex digest of script.".ljust(17))
    LOGGER.log_message('Name = ' + ensembldb3.__name__ + ', version = ' + ensembldb3.__version__,
                       label="Imported module".ljust(30))
    LOGGER.log_message('Name = ' + sqlalchemy.__name__ + ', version = ' + sqlalchemy.__version__,
                       label="Imported module".ljust(30))
    human_seq_region_dict = {
        '1': 131550, '2': 131545, '3': 131551, '4': 131552, '5': 131542,
        '6': 131555, '7': 131559, '8': 131560, '9': 131540, '10': 131544,
        '11': 131556, '12': 131546, '13': 131541, '14': 131547, '15': 131558,
        '16': 131549, '17': 131554, '18': 131548, '19': 131537, '20': 131538,
        '21': 131543, '22': 131557, 'X': 131539, 'Y': 131553}
    account = HostAccount(*os.environ['ENSEMBL_ACCOUNT'].split())
    genome = Genome(species, release=release, account=account, pool_recycle=3600)
    variation_table = genome.VarDb.get_table('variation')
    variation_feature_table = genome.VarDb.get_table('variation_feature')
    var_table = variation_table.join(
        variation_feature_table,
        variation_feature_table.c.variation_id == variation_table.c.variation_id)
    seq_region_id = human_seq_region_dict[chrom]
    file_name = sex + '_noncarrier-hg38.csv'
    infile = open(file_name, 'r')
    LOGGER.input_file(infile.name)
    infile.close()
    recombination_df = pd.read_csv(file_name, usecols=[0, 1, 2, 3, 4])
    recomb_df = recombination_df.loc[lambda df: df.chr == 'chr' + chrom, :]
    recomb_df = recomb_df.reset_index(drop=True)
    mut_profiles = [i[0] + '->' + i[1] for i in permutations(['C', 'T', 'A', 'G'], 2)]
    counts = np.zeros((recomb_df.shape[0], 21))
    counts = pd.DataFrame(counts, columns=mut_profiles + ['C', 'T', 'A', 'G', 'SW', 'WS', 'SS', 'WW', 'CpG'])
    for index, row in recomb_df.iterrows():
        midpoint = row.loc['pos38']
        region = genome.get_region(coord_name=chrom, start=midpoint - 5000, end=midpoint + 5000,
                                   ensembl_coord=True)
        region = str(region.seq)
        whereclause1 = and_(
            var_table.c.variation_feature_seq_region_id == seq_region_id,
            var_table.c.variation_feature_class_attrib_id == 2,
            var_table.c.variation_feature_evidence_attribs.contains('370'),
            var_table.c.variation_feature_variation_name.contains('rs'),
            var_table.c.variation_feature_somatic == 0,
            var_table.c.variation_feature_alignment_quality == decimal.Decimal(1),
            var_table.c.variation_feature_minor_allele_freq.isnot(None),
            var_table.c.variation_feature_seq_region_start > midpoint - 5000,
            var_table.c.variation_feature_seq_region_start < midpoint + 5000)
        var_table_ed = var_table.select(whereclause1, use_labels=True)
        for snp in var_table_ed.execute():
            if snp['variation_ancestral_allele'] is None:
                continue
            ancestral_allele = snp['variation_ancestral_allele']
            alleles = snp['variation_feature_allele_string']
            if fnmatch(alleles, ancestral_allele + '/?'):
                derived_allele = alleles[2]
            elif fnmatch(alleles, '?/' + ancestral_allele):
                derived_allele = alleles[0]
            else:
                continue
            mtype = ancestral_allele + '->' + derived_allele
            counts.loc[index, mtype] += 1
            rel_loc = snp['variation_feature_seq_region_start'] - midpoint + 5000
            if (region[rel_loc + 1] == 'G' and ancestral_allele == 'C' and derived_allele == 'T') or \
                    (region[rel_loc - 1] == 'C' and ancestral_allele == 'G' and derived_allele == 'A'):
                counts.loc[index, 'CpG'] += 1
            if ancestral_allele + derived_allele in ['CT', 'CA', 'GT', 'GA']:
                counts.loc[index, 'SW'] += 1
            if ancestral_allele + derived_allele in ['TC', 'AC', 'TG', 'AG']:
                counts.loc[index, 'WS'] += 1
            if ancestral_allele + derived_allele in ['CG', 'GC']:
                counts.loc[index, 'SS'] += 1
            if ancestral_allele + derived_allele in ['TA', 'AT']:
                counts.loc[index, 'WW'] += 1
        base_counts = Counter(region)
        for base in ['C', 'T', 'A', 'G']:
            counts.loc[index, base] = base_counts[base]
    results = pd.concat([recomb_df, counts], axis=1)
    csv_filename = 'recomb_table_SW_' + sex + '_ch' + chrom + '.csv'
    results.to_csv(csv_filename)
    outfile = open(csv_filename, 'r')
    LOGGER.output_file(outfile.name)
    outfile.close()
    duration = time() - start_time
    LOGGER.log_message("%.2f" % (duration / 60.), label="run duration (minutes)".ljust(30))
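# The four membership tests above encode the strong/weak base classes
# (S = C or G, three hydrogen bonds; W = A or T, two). A compact equivalent,
# offered only as an illustrative alternative to the explicit lists:
STRONG = set('CG')

def sw_class(ancestral, derived):
    """Return 'SW', 'WS', 'SS' or 'WW' for a single-base substitution."""
    a = 'S' if ancestral in STRONG else 'W'
    d = 'S' if derived in STRONG else 'W'
    return a + d

assert sw_class('C', 'T') == 'SW' and sw_class('A', 'G') == 'WS'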
def main(job_no, infile_name, release, dir):
    start_time = time()
    if not os.path.exists(dir):
        os.makedirs(dir)
    LOGGER.log_file_path = dir + "/" + str(os.path.basename(__file__)) + job_no + ".log"
    LOGGER.log_args()
    LOGGER.log_message(get_file_hexdigest(__file__), label="Hex digest of script.".ljust(17))
    try:
        LOGGER.log_message(str(os.environ['CONDA_DEFAULT_ENV']), label="Conda environment.".ljust(17))
    except KeyError:
        pass
    LOGGER.log_message('Name = ' + pd.__name__ + ', version = ' + pd.__version__,
                       label="Imported module".ljust(30))
    LOGGER.log_message('Name = ' + ensembldb3.__name__ + ', version = ' + ensembldb3.__version__,
                       label="Imported module".ljust(30))
    LOGGER.log_message('Name = ' + sqlalchemy.__name__ + ', version = ' + sqlalchemy.__version__,
                       label="Imported module".ljust(30))
    human_seq_region_dict = {
        '1': 131550, '2': 131545, '3': 131551, '4': 131552, '5': 131542,
        '6': 131555, '7': 131559, '8': 131560, '9': 131540, '10': 131544,
        '11': 131556, '12': 131546, '13': 131541, '14': 131547, '15': 131558,
        '16': 131549, '17': 131554, '18': 131548, '19': 131537, '20': 131538,
        '21': 131543, '22': 131557, 'X': 131539, 'Y': 131553}
    account = HostAccount(*os.environ['ENSEMBL_ACCOUNT'].split())
    genome = Genome('human', release=release, account=account, pool_recycle=3600)
    variation_feature_table = genome.VarDb.get_table('variation_feature')
    id_1KG = set([str(x) for x in range(42, 55)])
    var_details = pd.read_csv(infile_name, sep=',', index_col=0)
    infile = open(infile_name, 'r')
    LOGGER.input_file(infile.name)
    infile.close()
    loc_count, match_count, count1KG, derived_mismatch_count = 0, 0, 0, 0
    col_alleles, col_name, col_val_id = list(), list(), list()
    for _, row in var_details.iterrows():
        chrom = row.loc['chr']
        chrom = chrom[3:]
        seq_region_id = human_seq_region_dict[chrom]
        loc38 = row.loc['pos38']
        loc_count += 1
        whereclause1 = and_(
            variation_feature_table.c.seq_region_id == seq_region_id,
            variation_feature_table.c.seq_region_start == loc38,
            variation_feature_table.c.class_attrib_id == 2,
            variation_feature_table.c.variation_name.contains("rs"),
            variation_feature_table.c.somatic == 0,
            variation_feature_table.c.alignment_quality == decimal.Decimal(1),
            variation_feature_table.c.minor_allele_freq.isnot(None))
        query = select([variation_feature_table.c.variation_name,
                        variation_feature_table.c.allele_string,
                        variation_feature_table.c.variation_set_id], whereclause1)
        snps = list(query.execute())
        if len(snps) > 0:
            if len(snps) > 1:
                print('More than one SNP at ', chrom, ':', loc38)
            alleles = snps[0][1]
            name = snps[0][0]
            match_count += 1
            if len(set(snps[0][2]).intersection(id_1KG)) > 0:
                val_id = '1KG'
                count1KG += 1
            else:
                val_id = 'Other'
        else:
            val_id = 'No match'
            name = None
            alleles = None
        col_alleles.append(alleles)
        col_name.append(name)
        col_val_id.append(val_id)
    assert var_details.shape[0] == len(col_val_id), 'Column mismatch.'
    var_details['alleles'] = pd.Series(col_alleles)
    var_details['name'] = pd.Series(col_name)
    var_details['val_id'] = pd.Series(col_val_id)
    LOGGER.log_message(str(loc_count), label='Variants read = ')
    LOGGER.log_message(str(derived_mismatch_count), label='Derived mismatches = ')
    LOGGER.log_message(str(match_count), label='Variants matched = ')
    LOGGER.log_message(str(count1KG), label='1KG Variants = ')
    filename = 'data/dnms_from_PRJEB21300_matched_' + job_no + '.csv'
    var_details.to_csv(filename)
    outfile = open(filename, 'r')
    LOGGER.output_file(outfile.name)
    outfile.close()
    duration = time() - start_time
    LOGGER.log_message("%.2f" % (duration / 60.), label="run duration (minutes)".ljust(30))