def get_sequence_from_location(species, coords):
    """Get sequence from a genomic location in an Ensembl species genome."""
    from cogent.db.ensembl import Genome

    genome = Genome(Species=species, Release='87', account=None)
    chrom, start, end, strand = coords
    region = genome.getRegion(CoordName=str(chrom), Start=start, End=end,
                              Strand=strand)
    return region.Seq
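# Usage sketch for get_sequence_from_location(). The coordinates below are
# illustrative assumptions, and running this requires network access to the
# public Ensembl MySQL server (account=None above).
if __name__ == '__main__':
    seq = get_sequence_from_location('human', ('1', 1000000, 1001000, 1))
    print(len(seq))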
def get_genes_from_location(ref, coords, pad=0):
    """Get genes from a set of genome coordinates.

    pad adds n bases to either side to expand the search area.
    """
    # relies on module-level `release` and `account` globals
    genome = Genome(Species=ref, Release=release, account=account)
    chrom, start, end, strand = coords
    genes = list(genome.getFeatures(CoordName=chrom, Start=start - pad,
                                    End=end + pad, feature_types='gene'))
    return genes
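# A hedged setup sketch for get_genes_from_location(): the function reads
# module-level `release` and `account` names, which this snippet supplies.
# The values and coordinates are illustrative assumptions only.
release = '87'
account = None  # or a cogent.db.ensembl HostAccount for a local mirror
example_genes = get_genes_from_location('human', ('1', 1000000, 1100000, 1),
                                        pad=500)
print(len(example_genes))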
def main(job_no, coord_name, start, end, species, release, dir):
    start_time = time()
    if not os.path.exists(dir):
        os.makedirs(dir)
    LOGGER.log_file_path = dir + "/" + str(
        os.path.basename(__file__)) + '_' + job_no + ".log"
    LOGGER.log_args()
    LOGGER.log_message(get_file_hexdigest(__file__),
                       label="Hex digest of script.".ljust(17))
    LOGGER.log_message('Name = ' + ensembldb3.__name__ + ', version = ' +
                       ensembldb3.__version__,
                       label="Imported module".ljust(30))
    account = HostAccount(*os.environ['ENSEMBL_ACCOUNT'].split())
    ig_count, sequence_length = 0, 0
    genome = Genome(species, release=release, account=account,
                    pool_recycle=3600)
    gene_count = 0
    gene_intervals = list()
    genes = genome.get_features(coord_name=coord_name, start=start, end=end,
                                feature_types='gene')
    for gene in genes:
        if gene.location.coord_name != coord_name:
            break
        gene_count += 1
        gene_intervals.append((gene.location.start, gene.location.end))
    gene_intervals = sorted(gene_intervals, key=lambda x: x[1])
    intergenic = interval_complement(gene_intervals)
    intergenic_sequence = ""
    for ig_interval in intergenic:
        ig_count += 1
        sequence_length += ig_interval[1] - ig_interval[0]
        region = genome.get_region(coord_name=coord_name,
                                   start=ig_interval[0], end=ig_interval[1])
        # separate intervals with an X spacer
        intergenic_sequence = intergenic_sequence + 'XXXXXXXXXX' + str(
            region.seq)
    LOGGER.log_message(str(ig_count),
                       label='Number of intergenic intervals processed'.ljust(30))
    LOGGER.log_message(str(sequence_length),
                       label='Sequence length'.ljust(30))
    outfile_name = dir + '/intergenic_sequence_' + species + job_no + '.pklz'
    with gzip.open(outfile_name, 'wb') as outfile:
        pickle.dump(intergenic_sequence, outfile)
    LOGGER.output_file(outfile_name)
    duration = time() - start_time
    LOGGER.log_message("%.2f" % (duration / 60.),
                       label="run duration (minutes)".ljust(30))
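# main() above relies on an interval_complement() helper that is not shown
# here. This is a minimal sketch, assuming it takes (start, end) pairs and
# returns the gaps between them; the overlap handling and half-open
# convention are guesses, not the original implementation.
def interval_complement(intervals):
    """Return the intervals lying between the given (start, end) pairs."""
    gaps = []
    prev_end = None
    for start, end in sorted(intervals):
        if prev_end is not None and start > prev_end:
            gaps.append((prev_end, start))
        prev_end = end if prev_end is None else max(prev_end, end)
    return gaps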
def main(job_no, coord_name, start, end, species, release, folder):
    start_time = time()
    if not os.path.exists(folder):
        os.makedirs(folder)
    LOGGER.log_file_path = folder + "/" + str(
        os.path.basename(__file__)) + '_' + job_no + ".log"
    LOGGER.log_args()
    LOGGER.log_message(get_file_hexdigest(__file__),
                       label="Hex digest of script.".ljust(17))
    LOGGER.log_message('Name = ' + ensembldb3.__name__ + ', version = ' +
                       ensembldb3.__version__,
                       label="Imported module".ljust(30))
    account = HostAccount(*os.environ['ENSEMBL_ACCOUNT'].split())
    dupl_introns, intron_count, sequence_length = 0, 0, 0
    intron_list = list()
    genome = Genome(species, release=release, account=account,
                    pool_recycle=3600)
    genes = genome.get_features(coord_name=coord_name, start=start, end=end,
                                feature_types='gene')
    intron_sequence = 'X'
    for gene in genes:
        if gene.canonical_transcript.introns is None:
            continue
        for intron in gene.canonical_transcript.introns:
            if intron in intron_list:
                dupl_introns += 1
                continue
            intron_list.append(intron)
            intron_count += 1
            sequence_length += len(intron)
            # separate introns with an X spacer
            intron_sequence = intron_sequence + 'XXXXXXXXXX' + str(intron.seq)
    outfile_name = folder + '/intronic_sequence' + species + job_no + '.pklz'
    with gzip.open(outfile_name, 'wb') as outfile:
        pickle.dump(intron_sequence, outfile)
    LOGGER.output_file(outfile_name)
    LOGGER.log_message(str(dupl_introns),
                       label='Number of duplicate introns rejected'.ljust(30))
    LOGGER.log_message(str(intron_count),
                       label='Number of introns processed'.ljust(30))
    LOGGER.log_message(str(sequence_length),
                       label='Total intron length'.ljust(30))
    duration = time() - start_time
    LOGGER.log_message("%.2f" % (duration / 60.),
                       label="run duration (minutes)".ljust(30))
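# Reading the pickled intron sequence back (a usage sketch; the file name
# mirrors the naming scheme above with assumed folder/species/job values).
import gzip
import pickle

with gzip.open('data/intronic_sequencehuman1.pklz', 'rb') as infile:
    intron_sequence = pickle.load(infile)
print(intron_sequence[:50])  # begins with the 'X' spacer prefix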
def main(input_directory, output, flank_size):
    args = locals()
    if not os.path.exists(output):
        os.makedirs(output)
    logfile_path = os.path.join(output, "mouse_germline.log")
    LOGGER.log_file_path = logfile_path
    LOGGER.log_message(str(args), label="vars")
    account = HostAccount(*os.environ['ENSEMBL_ACCOUNT'].split())
    mouse = Genome('mouse', release=88, account=account, pool_recycle=10000)
    input_path = os.path.abspath(input_directory)
    file_paths = get_files(input_path)
    start_time = time.time()  # time the whole run, not just the last gene
    for fn in file_paths:
        LOGGER.input_file(fn)
        gene_id = os.path.basename(fn).split('.')[0]
        print("Acquiring variants from gene %s" % gene_id)
        gene = mouse.get_gene_by_stableid(stableid=gene_id)
        output_file = os.path.join(output, '%s.txt' % gene_id)
        num = 0
        with open(output_file, mode='w') as out_file:
            LOGGER.output_file(output_file)
            try:
                variants = get_var_info(gene, gene_id, flank_size)
                for var in variants:
                    record = var_records(str(var))
                    out_file.write('\t'.join(record) + '\n')
                    num += 1
            except AssertionError:
                print('for gene %s, the translated exon and CDS are different.'
                      % gene_id)
                os.remove(output_file)
        print("finished getting variants on gene %s" % gene_id)
        LOGGER.log_message("%s" % num, label="Number of SNPs recorded")
    print('Done')
    # determine runtime
    duration = time.time() - start_time
    LOGGER.log_message("%.2f" % (duration / 60.),
                       label="run duration (minutes)")
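# main() above (and the ENU sampler below) call a get_files() helper that is
# not shown. A minimal sketch, assuming it returns the paths of the regular
# files in a directory:
import os


def get_files(directory):
    """Return sorted paths of the regular files directly under `directory`."""
    return [os.path.join(directory, name)
            for name in sorted(os.listdir(directory))
            if os.path.isfile(os.path.join(directory, name))]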
def main(input_dir, output_datafile, flank_size, chroms, coord_range):
    args = locals()
    output_dir = os.path.dirname(output_datafile)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    logfile_path = os.path.join(output_dir, "logs/sample_ENU.log")
    LOGGER.log_file_path = logfile_path
    LOGGER.log_message(str(args), label="vars")
    account = HostAccount(*os.environ['ENSEMBL_ACCOUNT'].split())
    mouse = Genome('mouse', release=88, account=account, pool_recycle=10000)
    input_dir = os.path.abspath(input_dir)
    file_paths = get_files(input_dir)
    start_time = time.time()
    for fn in file_paths:
        with open(fn, mode='r') as input_file:
            LOGGER.input_file(fn)
            with open(output_datafile, mode='w') as output:
                LOGGER.output_file(output_datafile)
                first_line = input_file.readline()  # skip the header line
                num = 0
                for line in input_file:
                    records = line.split('|')
                    print('Variant %s' % records[0])
                    # call get_ENU_data once and reuse the result
                    data = get_ENU_data(records, chroms, coord_range)
                    if not data:
                        continue
                    (var_id, chromosome, coordinate, ref_base, var_base,
                     effect) = data
                    record = get_snp_data(var_id, chromosome, coordinate,
                                          ref_base, var_base, effect, mouse,
                                          int(flank_size))
                    if not record:
                        continue
                    output.write('\t'.join(record) + '\n')
                    num += 1
                print("num written", num)
    # determine runtime
    duration = time.time() - start_time
    LOGGER.log_message("%.2f" % (duration / 60.),
                       label="run duration (minutes)")
def dump_genes(ensembl_account, species, outpath, coord_names, release, limit):
    """Dump a metadata table for genes from one species in the nominated
    release, then exit."""
    ensembl_account = _get_account(ensembl_account)
    if len(species) > 1:
        msg = "dump_genes handles single species only"
        click.secho(msg, fg="red")
        sys.exit(-1)
    missing_species = missing_species_names(species)
    if missing_species:
        msg = [
            "The following species names don't match an Ensembl record. "
            "Check spelling!",
            str(missing_species),
            "\nAvailable species at this server are:",
            str(display_available_dbs(ensembl_account)),
        ]
        click.secho("\n".join(msg), fg="red")
        sys.exit(-1)
    if coord_names:
        chroms = load_coord_names(coord_names)
    else:
        chroms = None
    genome = Genome(species[0], release=release, account=ensembl_account)
    genes = _get_ref_genes(genome, chroms, limit)
    records = []
    for g in genes:
        records.append([g.stableid, g.biotype, g.location, g.description])
    if records:
        table = make_table(
            header=["stableid", "biotype", "location", "description"],
            rows=records,
        )
        table.write(outpath)
        click.secho("Wrote %d genes to %s" % (table.shape[0], outpath),
                    fg="green")
    else:
        click.secho("No genes matching criteria", fg="blue")
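# dump_genes() and one2one() below call a load_coord_names() helper that is
# not shown. A minimal sketch, assuming the input is a plain-text file with
# one coordinate (e.g. chromosome) name per line:
def load_coord_names(path):
    """Return the non-empty, stripped lines of `path` as a list."""
    with open(path) as infile:
        return [line.strip() for line in infile if line.strip()]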
def main(job_no, coord_name, start, end, species, release, var_set_id, dir):
    start_time = time()
    if not os.path.exists(dir):
        os.makedirs(dir)
    LOGGER.log_file_path = dir + "/" + str(
        os.path.basename(__file__)) + '_' + job_no + ".log"
    LOGGER.log_args()
    LOGGER.log_message(get_file_hexdigest(__file__),
                       label="Hex digest of script.".ljust(17))
    LOGGER.log_message('Name = ' + ensembldb3.__name__ + ', version = ' +
                       ensembldb3.__version__,
                       label="Imported module".ljust(30))
    LOGGER.log_message('Name = ' + sqlalchemy.__name__ + ', version = ' +
                       sqlalchemy.__version__,
                       label="Imported module".ljust(30))
    account = HostAccount(*os.environ['ENSEMBL_ACCOUNT'].split())
    genome = Genome(species, release=release, account=account,
                    pool_recycle=3600)
    confirm_variation_set(genome, var_set_id)
    var_locations = get_variant_details(genome, coord_name, start, end)
    LOGGER.log_message(str(len(var_locations)),
                       label='Length of var_locations list'.ljust(30))
    outfile_name = dir + '/intergenic_variants_' + species + '_' + job_no + '.pklz'
    with gzip.open(outfile_name, 'wb') as outfile:
        pickle.dump(var_locations, outfile)
    LOGGER.output_file(outfile_name)
    duration = time() - start_time
    LOGGER.log_message("%.2f" % (duration / 60.),
                       label="run duration (minutes)".ljust(30))
def main_core(job_no, species, varfile_name=None, intronfile_name=None,
              release=89, n_jobs=5, dir='data'):
    global genome
    if not os.path.exists(dir):
        os.makedirs(dir)
    LOGGER.log_file_path = dir + "/" + str(
        os.path.basename(__file__)) + '_' + job_no + ".log"
    start_time = time()
    LOGGER.log_args()
    LOGGER.log_message(get_file_hexdigest(__file__),
                       label="Hex digest of script.".ljust(25))
    LOGGER.log_message('Name = ' + numpy.__name__ + ', version = ' +
                       numpy.__version__, label="Imported module".ljust(25))
    LOGGER.log_message('Name = ' + cogent3.__name__ + ', version = ' +
                       cogent3.__version__, label="Imported module".ljust(25))
    LOGGER.log_message('Name = ' + ensembldb3.__name__ + ', version = ' +
                       ensembldb3.__version__,
                       label="Imported module".ljust(25))
    account = HostAccount(*os.environ['ENSEMBL_ACCOUNT'].split())
    genome = Genome(species, release=release, account=account,
                    pool_recycle=3600)
    # Ensembl seq_region_id values keyed by chromosome name
    human_seq_region_dict = {
        '1': 131550, '2': 131545, '3': 131551, '4': 131552, '5': 131542,
        '6': 131555, '7': 131559, '8': 131560, '9': 131540, '10': 131544,
        '11': 131556, '12': 131546, '13': 131541, '14': 131547, '15': 131558,
        '16': 131549, '17': 131554, '18': 131548, '19': 131537, '20': 131538,
        '21': 131543, '22': 131557, 'X': 131539, 'Y': 131553}
    chimp_seq_region_dict = {
        '21': 212405, '7': 212407, '15': 212409, '16': 212395, '1': 212403,
        '17': 212411, '18': 212410, '19': 212394, '20': 212404, '22': 212390,
        '3': 212392, '4': 212393, '5': 212391, '6': 212388, '8': 212397,
        '9': 212396, '10': 212387, '11': 212389, '12': 212402, '13': 212408,
        '14': 212401, 'Y': 212406, 'X': 212399}
    if species == 'human':
        coord_dict = dict([(v, k) for k, v in human_seq_region_dict.items()])
        tag = 'human'
    elif species == 'chimp':
        coord_dict = dict([(v, k) for k, v in chimp_seq_region_dict.items()])
        tag = 'spec_'
    else:
        assert False, 'Unknown species: ' + species
    if varfile_name is None:
        varfile_name = dir + '/var_locations_' + tag + job_no + '.pklz'
    LOGGER.input_file(varfile_name)
    with gzip.open(varfile_name, 'rb') as infile:
        var_details = pickle.load(infile)
    LOGGER.log_message(str(len(var_details)),
                       label="Number of variants read".ljust(25))
    if intronfile_name is None:
        intronfile_name = dir + '/all_locations_' + tag + job_no + '.pklz'
    LOGGER.input_file(intronfile_name)
    with gzip.open(intronfile_name, 'rb') as infile:
        intron_locs = pickle.load(infile)
    LOGGER.log_message(str(len(intron_locs)),
                       label="Number of introns read".ljust(25))
    var_details, var_locs_reversed = check_variant_strand(var_details,
                                                          intron_locs)
    # var_details fields are: (variant name, seq region id, location,
    # ancestral_allele, derived_allele)
    item_list = Parallel(n_jobs=n_jobs)(delayed(get_contexts)(var, coord_dict)
                                        for var in var_details)
    var_count_dict = Counter(item_list)
    del var_count_dict[None]
    outfile_name = dir + '/var_dict_' + tag + job_no + '.pklz'
    with gzip.open(outfile_name, 'wb') as outfile:
        pickle.dump(var_count_dict, outfile)
    LOGGER.output_file(outfile_name)
    duration = time() - start_time
    LOGGER.log_message("%.2f" % (duration / 60.),
                       label="run duration (minutes)".ljust(25))
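# Consuming the output of main_core() (a usage sketch; the file name assumes
# tag='human' and job_no='1'). The pickle holds a Counter keyed by whatever
# items get_contexts() emits, with None entries already removed.
import gzip
import pickle

with gzip.open('data/var_dict_human1.pklz', 'rb') as infile:
    var_count_dict = pickle.load(infile)
for item, count in var_count_dict.most_common(10):
    print(item, count)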
def main(job_no, coord_name, start, end, species, release, var_set_id,
         filter, dir):
    start_time = time()
    if not os.path.exists(dir):
        os.makedirs(dir)
    LOGGER.log_file_path = dir + "/" + str(
        os.path.basename(__file__)) + '_' + job_no + ".log"
    LOGGER.log_args()
    LOGGER.log_message(get_file_hexdigest(__file__),
                       label="Hex digest of script.".ljust(17))
    LOGGER.log_message('Name = ' + ensembldb3.__name__ + ', version = ' +
                       ensembldb3.__version__,
                       label="Imported module".ljust(30))
    LOGGER.log_message('Name = ' + sqlalchemy.__name__ + ', version = ' +
                       sqlalchemy.__version__,
                       label="Imported module".ljust(30))
    account = HostAccount(*os.environ['ENSEMBL_ACCOUNT'].split())
    var_locations_list, location_list = list(), list()
    dupl_introns, intron_count, bad_var_count, sequence_length = 0, 0, 0, 0
    intron_list = list()
    genome = Genome(species, release=release, account=account,
                    pool_recycle=3600)
    confirm_variation_set(genome, var_set_id)
    genes = genome.get_features(coord_name=coord_name, start=start, end=end,
                                feature_types='gene')
    for gene in genes:
        if gene.canonical_transcript.introns is None:
            continue
        for intron in gene.canonical_transcript.introns:
            if intron in intron_list:
                dupl_introns += 1
                continue
            intron_list.append(intron)
            intron_length = len(intron)
            intron_count += 1
            sequence_length += intron_length
            loc = intron.location  # an ensembldb3 location object
            location_list.append((str(loc.coord_name), loc.start, loc.end,
                                  loc.strand))
            var_locations, bad_var_num = get_variant_details(
                genome, species, intron, filter)
            var_locations_list = var_locations_list + var_locations
            bad_var_count += bad_var_num
    LOGGER.log_message(str(dupl_introns),
                       label='Number of duplicate introns rejected'.ljust(30))
    LOGGER.log_message(str(intron_count),
                       label='Number of introns processed'.ljust(30))
    if species == 'human':
        LOGGER.log_message(str(bad_var_count),
                           label='Number of rejected variants'.ljust(30))
    LOGGER.log_message(str(sequence_length),
                       label='Sequence length'.ljust(30))
    LOGGER.log_message(str(len(var_locations_list)),
                       label='Length of var_locations list'.ljust(30))
    LOGGER.log_message(str(len(var_locations_list) / sequence_length),
                       label='Average SNV rate'.ljust(30))
    outfile_name = dir + '/var_locations_' + species + job_no + '.pklz'
    with gzip.open(outfile_name, 'wb') as outfile:
        pickle.dump(var_locations_list, outfile)
    LOGGER.output_file(outfile_name)
    outfile_name = dir + '/all_locations_' + species + job_no + '.pklz'
    with gzip.open(outfile_name, 'wb') as outfile:
        pickle.dump(location_list, outfile)
    LOGGER.output_file(outfile_name)
    duration = time() - start_time
    LOGGER.log_message("%.2f" % (duration / 60.),
                       label="run duration (minutes)".ljust(30))
def one2one(
    ensembl_account,
    species,
    release,
    outdir,
    ref,
    ref_genes_file,
    coord_names,
    not_strict,
    introns,
    method_clade_id,
    mask_features,
    logfile_name,
    limit,
    force_overwrite,
    test,
):
    """Command line tool for sampling homologous sequences from Ensembl."""
    outdir = abspath(outdir)
    if not any([ref, ref_genes_file]):
        # just the command name; indicate they need to display help
        click.secho("Missing 'ref' and 'ref_genes_file'")
        ctx = click.get_current_context()
        msg = "%s\n\n--help to see all options\n" % ctx.get_usage()
        click.echo(msg)
        exit(-1)
    ensembl_account = _get_account(ensembl_account)
    args = locals()
    args["ensembl_account"] = str(ensembl_account)
    LOGGER.log_message(str(args), label="params")
    if test and limit == 0:
        limit = 2
    else:
        limit = limit or None
    if (introns and not method_clade_id) or (mask_features and not introns):
        msg = [
            "Must specify the introns and method_clade_id in order to",
            "export introns. Use show_align_methods to see the options",
        ]
        click.secho("\n".join(msg), fg="red")
        exit(-1)
    species_missing = missing_species_names(species)
    if species_missing:
        msg = [
            "The following species names don't match an Ensembl record."
            " Check spelling!",
            str(species_missing),
            "\nAvailable species at this server are:",
            str(display_available_dbs(ensembl_account)),
        ]
        click.secho("\n".join(msg), fg="red")
        exit(-1)
    if ref:
        ref = ref.lower()
    if ref and ref not in species:
        print("The reference species is not in the species names")
        exit(-1)
    compara = Compara(species, release=release, account=ensembl_account)
    runlog_path = os.path.join(outdir, logfile_name)
    if os.path.exists(runlog_path) and not force_overwrite:
        msg = [
            "Log file (%s) already exists!" % runlog_path,
            "Use force_overwrite or provide logfile_name",
        ]
        click.secho("\n".join(msg), fg="red")
        exit(-1)
    if not test:
        LOGGER.log_file_path = runlog_path
    chroms = None
    if coord_names:
        chroms = load_coord_names(coord_names)
        LOGGER.input_file(coord_names)
    elif ref:
        # the original `elif coord_names and ref` branch could never trigger
        # after `if coord_names`; fall back to the reference genome's
        # chromosome names instead
        chroms = get_chrom_names(ref, compara)
    if not os.path.exists(outdir) and not test:
        os.makedirs(outdir)
        print("Created", outdir)
    if ref and not ref_genes_file:
        ref_genome = Genome(ref, release=release, account=ensembl_account)
        ref_genes = [g.stableid for g in _get_ref_genes(ref_genome, chroms,
                                                        limit)]
    else:
        if not (ref_genes_file.endswith(".csv") or
                ref_genes_file.endswith(".tsv")):
            msg = (
                "ref_genes_file must be either comma or tab "
                "delimited with the corresponding suffix (.csv/.tsv)"
            )
            click.secho(msg, fg="red")
            exit(-1)
        ref_genes = load_table(ref_genes_file)
        if "stableid" not in ref_genes.header:
            msg = "ref_genes_file does not have a 'stableid' column header"
            click.secho(msg, fg="red")
            exit(-1)
        ref_genes = ref_genes.tolist("stableid")
    if limit:
        ref_genes = ref_genes[:limit]
    if not introns:
        print("Getting orthologs for %d genes" % len(ref_genes))
        get_one2one_orthologs(
            compara, ref_genes, outdir, not_strict, force_overwrite, test
        )
    else:
        print("Getting orthologous introns for %d genes" % len(ref_genes))
        get_syntenic_alignments_introns(
            compara,
            ref_genes,
            outdir,
            method_clade_id,
            mask_features,
            outdir,
            force_overwrite,
            test,
        )
def main(job_no, chrom, sex, species, release, dir):
    start_time = time()
    if not os.path.exists(dir):
        os.makedirs(dir)
    LOGGER.log_file_path = dir + "/" + str(
        os.path.basename(__file__)) + job_no + ".log"
    LOGGER.log_args()
    LOGGER.log_message(get_file_hexdigest(__file__),
                       label="Hex digest of script.".ljust(17))
    LOGGER.log_message('Name = ' + ensembldb3.__name__ + ', version = ' +
                       ensembldb3.__version__,
                       label="Imported module".ljust(30))
    LOGGER.log_message('Name = ' + sqlalchemy.__name__ + ', version = ' +
                       sqlalchemy.__version__,
                       label="Imported module".ljust(30))
    # Ensembl seq_region_id values keyed by chromosome name
    human_seq_region_dict = {
        '1': 131550, '2': 131545, '3': 131551, '4': 131552, '5': 131542,
        '6': 131555, '7': 131559, '8': 131560, '9': 131540, '10': 131544,
        '11': 131556, '12': 131546, '13': 131541, '14': 131547, '15': 131558,
        '16': 131549, '17': 131554, '18': 131548, '19': 131537, '20': 131538,
        '21': 131543, '22': 131557, 'X': 131539, 'Y': 131553}
    account = HostAccount(*os.environ['ENSEMBL_ACCOUNT'].split())
    genome = Genome(species, release=release, account=account,
                    pool_recycle=3600)
    variation_table = genome.VarDb.get_table('variation')
    variation_feature_table = genome.VarDb.get_table('variation_feature')
    var_table = variation_table.join(
        variation_feature_table,
        variation_feature_table.c.variation_id ==
        variation_table.c.variation_id)
    seq_region_id = human_seq_region_dict[chrom]
    file_name = sex + '_noncarrier-hg38.csv'
    LOGGER.input_file(file_name)
    recombination_df = pd.read_csv(file_name, usecols=[0, 1, 2, 3, 4])
    recomb_df = recombination_df.loc[lambda df: df.chr == 'chr' + chrom, :]
    recomb_df = recomb_df.reset_index(drop=True)
    mut_profiles = [i[0] + '->' + i[1]
                    for i in permutations(['C', 'T', 'A', 'G'], 2)]
    counts = np.zeros((recomb_df.shape[0], 21))
    counts = pd.DataFrame(counts, columns=mut_profiles +
                          ['C', 'T', 'A', 'G', 'SW', 'WS', 'SS', 'WW', 'CpG'])
    for index, row in recomb_df.iterrows():
        midpoint = row.loc['pos38']
        region = genome.get_region(coord_name=chrom, start=midpoint - 5000,
                                   end=midpoint + 5000, ensembl_coord=True)
        region = str(region.seq)
        whereclause1 = and_(
            var_table.c.variation_feature_seq_region_id == seq_region_id,
            var_table.c.variation_feature_class_attrib_id == 2,
            var_table.c.variation_feature_evidence_attribs.contains('370'),
            var_table.c.variation_feature_variation_name.contains('rs'),
            var_table.c.variation_feature_somatic == 0,
            var_table.c.variation_feature_alignment_quality ==
            decimal.Decimal(1),
            var_table.c.variation_feature_minor_allele_freq.isnot(None),
            var_table.c.variation_feature_seq_region_start > midpoint - 5000,
            var_table.c.variation_feature_seq_region_start < midpoint + 5000)
        var_table_ed = var_table.select(whereclause1, use_labels=True)
        for snp in var_table_ed.execute():
            if snp['variation_ancestral_allele'] is None:
                continue
            ancestral_allele = snp['variation_ancestral_allele']
            alleles = snp['variation_feature_allele_string']
            if fnmatch(alleles, ancestral_allele + '/?'):
                derived_allele = alleles[2]
            elif fnmatch(alleles, '?/' + ancestral_allele):
                derived_allele = alleles[0]
            else:
                continue
            mtype = ancestral_allele + '->' + derived_allele
            counts.loc[index, mtype] += 1
            rel_loc = (snp['variation_feature_seq_region_start'] -
                       midpoint + 5000)
            if (region[rel_loc + 1] == 'G' and ancestral_allele == 'C' and
                    derived_allele == 'T') or \
                    (region[rel_loc - 1] == 'C' and ancestral_allele == 'G'
                     and derived_allele == 'A'):
                counts.loc[index, 'CpG'] += 1
            if ancestral_allele + derived_allele in ['CT', 'CA', 'GT', 'GA']:
                counts.loc[index, 'SW'] += 1
            if ancestral_allele + derived_allele in ['TC', 'AC', 'TG', 'AG']:
                counts.loc[index, 'WS'] += 1
            if ancestral_allele + derived_allele in ['CG', 'GC']:
                counts.loc[index, 'SS'] += 1
            if ancestral_allele + derived_allele in ['TA', 'AT']:
                counts.loc[index, 'WW'] += 1
        base_counts = Counter(region)
        for base in ['C', 'T', 'A', 'G']:
            counts.loc[index, base] = base_counts[base]
    results = pd.concat([recomb_df, counts], axis=1)
    csv_filename = 'recomb_table_SW_' + sex + '_ch' + chrom + '.csv'
    results.to_csv(csv_filename)
    LOGGER.output_file(csv_filename)
    duration = time() - start_time
    LOGGER.log_message("%.2f" % (duration / 60.),
                       label="run duration (minutes)".ljust(30))
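# The strong/weak bookkeeping in the SNP loop above can be factored into a
# small helper; this is an illustrative refactoring, not code from the
# original script. C/G are the "strong" (S) bases, A/T the "weak" (W) ones,
# matching the SW/WS/SS/WW membership lists used above.
def sw_class(ancestral_allele, derived_allele):
    """Classify a substitution as 'SS', 'SW', 'WS' or 'WW'."""
    strong = {'C', 'G'}
    anc_strong = ancestral_allele in strong
    der_strong = derived_allele in strong
    if anc_strong and der_strong:
        return 'SS'
    if anc_strong:
        return 'SW'
    if der_strong:
        return 'WS'
    return 'WW'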
def main(job_no, infile_name, release, dir):
    start_time = time()
    if not os.path.exists(dir):
        os.makedirs(dir)
    LOGGER.log_file_path = dir + "/" + str(
        os.path.basename(__file__)) + job_no + ".log"
    LOGGER.log_args()
    LOGGER.log_message(get_file_hexdigest(__file__),
                       label="Hex digest of script.".ljust(17))
    try:
        LOGGER.log_message(str(os.environ['CONDA_DEFAULT_ENV']),
                           label="Conda environment.".ljust(17))
    except KeyError:
        pass
    LOGGER.log_message('Name = ' + pd.__name__ + ', version = ' +
                       pd.__version__, label="Imported module".ljust(30))
    LOGGER.log_message('Name = ' + ensembldb3.__name__ + ', version = ' +
                       ensembldb3.__version__,
                       label="Imported module".ljust(30))
    LOGGER.log_message('Name = ' + sqlalchemy.__name__ + ', version = ' +
                       sqlalchemy.__version__,
                       label="Imported module".ljust(30))
    # Ensembl seq_region_id values keyed by chromosome name
    human_seq_region_dict = {
        '1': 131550, '2': 131545, '3': 131551, '4': 131552, '5': 131542,
        '6': 131555, '7': 131559, '8': 131560, '9': 131540, '10': 131544,
        '11': 131556, '12': 131546, '13': 131541, '14': 131547, '15': 131558,
        '16': 131549, '17': 131554, '18': 131548, '19': 131537, '20': 131538,
        '21': 131543, '22': 131557, 'X': 131539, 'Y': 131553}
    account = HostAccount(*os.environ['ENSEMBL_ACCOUNT'].split())
    genome = Genome('human', release=release, account=account,
                    pool_recycle=3600)
    variation_feature_table = genome.VarDb.get_table('variation_feature')
    # variation_set_ids used here to flag 1000 Genomes (1KG) membership
    id_1KG = set([str(x) for x in range(42, 55)])
    var_details = pd.read_csv(infile_name, sep=',', index_col=0)
    LOGGER.input_file(infile_name)
    loc_count, match_count, count1KG, derived_mismatch_count = 0, 0, 0, 0
    col_alleles, col_name, col_val_id = list(), list(), list()
    for row in var_details.iterrows():
        chrom = row[1].loc['chr']
        chrom = chrom[3:]  # strip the 'chr' prefix
        seq_region_id = human_seq_region_dict[chrom]
        loc38 = row[1].loc['pos38']
        loc_count += 1
        whereclause1 = and_(
            variation_feature_table.c.seq_region_id == seq_region_id,
            variation_feature_table.c.seq_region_start == loc38,
            variation_feature_table.c.class_attrib_id == 2,
            variation_feature_table.c.variation_name.contains("rs"),
            variation_feature_table.c.somatic == 0,
            variation_feature_table.c.alignment_quality ==
            decimal.Decimal(1),
            variation_feature_table.c.minor_allele_freq.isnot(None))
        query = select([variation_feature_table.c.variation_name,
                        variation_feature_table.c.allele_string,
                        variation_feature_table.c.variation_set_id],
                       whereclause1)
        snps = list(query.execute())
        if len(snps) > 0:
            if len(snps) > 1:
                print('More than one SNP at ', chrom, ':', loc38)
            alleles = snps[0][1]
            name = snps[0][0]
            match_count += 1
            if len(set(snps[0][2]).intersection(id_1KG)) > 0:
                val_id = '1KG'
                count1KG += 1
            else:
                val_id = 'Other'
        else:
            val_id = 'No match'
            name = None
            alleles = None
        col_alleles.append(alleles)
        col_name.append(name)
        col_val_id.append(val_id)
    assert var_details.shape[0] == len(col_val_id), 'Column mismatch.'
    var_details['alleles'] = pd.Series(col_alleles)
    var_details['name'] = pd.Series(col_name)
    var_details['val_id'] = pd.Series(col_val_id)
    LOGGER.log_message(str(loc_count), label='Variants read = ')
    LOGGER.log_message(str(derived_mismatch_count),
                       label='Derived mismatches = ')
    LOGGER.log_message(str(match_count), label='Variants matched = ')
    LOGGER.log_message(str(count1KG), label='1KG Variants = ')
    filename = 'data/dnms_from_PRJEB21300_matched_' + job_no + '.csv'
    var_details.to_csv(filename)
    LOGGER.output_file(filename)
    duration = time() - start_time
    LOGGER.log_message("%.2f" % (duration / 60.),
                       label="run duration (minutes)".ljust(30))