def run(self, shortIntronSize=30):
    """
    Flag Augustus transcripts that have exon intervals not overlapping the
    matching transcript in self.transcriptDict.

    The matching transcript is looked up by the Augustus ID with its Augustus
    alignment number removed. Augustus transcripts with no match, or whose
    match is on a different strand or chromosome, are skipped entirely (they
    get no entry in either output dict).

    :param shortIntronSize: passed as ``gap`` to seq_lib.gap_merge_intervals
        when merging the matching transcript's exon intervals.

    Results are handed to self.dumpValueDicts: classify value 1 plus one BED
    record per non-intersecting exon interval, otherwise classify value 0.
    """
    self.getAugustusTranscriptDict()
    self.getTranscriptDict()
    classify_dict = {}
    details_dict = defaultdict(list)
    for aug_aId, aug_t in self.augustusTranscriptDict.iteritems():
        source_id = psl_lib.remove_augustus_alignment_number(aug_aId)
        if source_id not in self.transcriptDict:
            continue
        t = self.transcriptDict[source_id]
        if not (aug_t.strand == t.strand and aug_t.chromosome == t.chromosome):
            continue
        aug_exons = aug_t.exonIntervals
        merged_exons = seq_lib.gap_merge_intervals(t.exonIntervals, gap=shortIntronSize)
        # Augustus exons with no counterpart among the merged exons.
        unmatched = [exon for exon in aug_exons
                     if seq_lib.interval_not_intersect_intervals(merged_exons, exon)]
        if unmatched:
            classify_dict[aug_aId] = 1
            for exon in unmatched:
                details_dict[aug_aId].append(
                    exon.get_bed(self.rgb, "/".join([self.column, aug_aId])))
        else:
            classify_dict[aug_aId] = 0
    self.dumpValueDicts(classify_dict, details_dict)
def run(self):
    """
    Flag Augustus transcripts whose thickStart/thickStop differ from those of
    the matching transcript in self.transcriptDict.

    Skipped entirely (no entry written): Augustus transcripts without a match,
    matches on a different strand or chromosome, and matches whose thickStart
    equals thickStop.

    For a flagged transcript the details are BED records built with
    seq_lib.cds_coordinate_to_bed: the first and last three CDS positions when
    the CDS length exceeds 9, otherwise a single record spanning the whole CDS.
    """
    self.getAugustusTranscriptDict()
    self.getTranscriptDict()
    classify_dict = {}
    details_dict = {}
    for aug_aId, aug_t in self.augustusTranscriptDict.iteritems():
        source_id = psl_lib.remove_augustus_alignment_number(aug_aId)
        if source_id not in self.transcriptDict:
            continue
        t = self.transcriptDict[source_id]
        if aug_t.strand != t.strand or aug_t.chromosome != t.chromosome or t.thickStart == t.thickStop:
            continue
        if t.thickStart == aug_t.thickStart and t.thickStop == aug_t.thickStop:
            # Identical CDS boundaries: nothing to report.
            classify_dict[aug_aId] = 0
            continue
        classify_dict[aug_aId] = 1
        cds_len = aug_t.getCdsLength()
        if cds_len > 9:
            first_codon = seq_lib.cds_coordinate_to_bed(aug_t, 0, 3, self.rgb, self.column)
            last_codon = seq_lib.cds_coordinate_to_bed(aug_t, cds_len - 3, cds_len, self.rgb, self.column)
            details_dict[aug_aId] = [first_codon, last_codon]
        else:
            # NOTE(review): this branch stores a single record, not a list of
            # records as above; confirm dumpValueDicts accepts both shapes.
            details_dict[aug_aId] = seq_lib.cds_coordinate_to_bed(aug_t, 0, cds_len, self.rgb, self.column)
    self.dumpValueDicts(classify_dict, details_dict)
def run(self):
    """
    Flag Augustus transcripts that lie on a different strand or chromosome
    than the matching transcript in self.transcriptDict.

    The match is looked up by the Augustus ID with its Augustus alignment
    number removed; Augustus transcripts without a match are skipped. Flagged
    transcripts get classify value 1 and a BED record of the whole Augustus
    transcript; all others get 0. Results go to self.dumpValueDicts.
    """
    self.getAugustusTranscriptDict()
    self.getTranscriptDict()
    classify_dict = {}
    details_dict = defaultdict(list)
    for aug_aId, aug_t in self.augustusTranscriptDict.iteritems():
        source_id = psl_lib.remove_augustus_alignment_number(aug_aId)
        if source_id not in self.transcriptDict:
            continue
        t = self.transcriptDict[source_id]
        same_locus = aug_t.strand == t.strand and aug_t.chromosome == t.chromosome
        if same_locus:
            classify_dict[aug_aId] = 0
            continue
        classify_dict[aug_aId] = 1
        details_dict[aug_aId] = seq_lib.transcript_to_bed(aug_t, self.rgb, self.column)
    self.dumpValueDicts(classify_dict, details_dict)
def augustus_transcript_transmap_iterator(self):
    """
    Yield (aug_id, aug_t, t) triples, pairing every Augustus transcript with
    the entry of self.transcript_dict stored under the Augustus ID with its
    Augustus alignment number removed.

    self.transcript_dict is populated via self.get_transcript_dict() on first
    use if it is still None. The lookup is a plain subscript, so a missing ID
    propagates whatever error self.transcript_dict raises for unknown keys.
    """
    if self.transcript_dict is None:
        self.get_transcript_dict()
    for aug_id, aug_t in self.augustus_transcript_iterator():
        source_id = psl_lib.remove_augustus_alignment_number(aug_id)
        source_t = self.transcript_dict[source_id]
        yield aug_id, aug_t, source_t
def run(self):
    """
    Flag Augustus transcripts whose normalised ID occurs more than once in
    self.augustusTranscriptDict.

    Occurrences are counted on IDs with every "-<digits>-" run collapsed to a
    single "-". A transcript whose count exceeds 1 gets classify value 1 and a
    BED record named "<column>_<count - 1>_Copies"; all others get 0.
    """
    splitter = re.compile("-[0-9]+-")
    self.getAugustusTranscriptDict()
    counts = Counter("-".join(splitter.split(aug_aId)) for aug_aId in self.augustusTranscriptDict)
    details_dict = {}
    classify_dict = {}
    for aug_aId, aug_t in self.augustusTranscriptDict.iteritems():
        # NOTE(review): counts is keyed by the regex-normalised ID but looked
        # up with psl_lib.remove_augustus_alignment_number; confirm both
        # normalisations yield the same string, otherwise this is always 0.
        n_seen = counts[psl_lib.remove_augustus_alignment_number(aug_aId)]
        if n_seen <= 1:
            classify_dict[aug_aId] = 0
            continue
        bed_name = self.column + "_{}_Copies".format(n_seen - 1)
        details_dict[aug_aId] = seq_lib.transcript_to_bed(aug_t, self.rgb, bed_name)
        classify_dict[aug_aId] = 1
    self.dumpValueDicts(classify_dict, details_dict)
def run(self):
    """
    Report Augustus transcripts that appear in multiple copies.

    IDs in self.augustusTranscriptDict are normalised by collapsing each
    "-<digits>-" run to "-" and tallied. For each transcript the tally is read
    back and, when it is greater than 1, the transcript is classified as 1 and
    a BED record named "<column>_<tally - 1>_Copies" is stored; otherwise the
    transcript is classified as 0. Both dicts are passed to
    self.dumpValueDicts.
    """
    pattern = re.compile("-[0-9]+-")
    self.getAugustusTranscriptDict()
    normalised_ids = ("-".join(pattern.split(aug_aId)) for aug_aId in self.augustusTranscriptDict)
    counts = Counter(normalised_ids)
    details_dict = {}
    classify_dict = {}
    for aug_aId, aug_t in self.augustusTranscriptDict.iteritems():
        # NOTE(review): the tally is keyed by the regex-normalised ID, while
        # the lookup key comes from psl_lib.remove_augustus_alignment_number;
        # verify the two produce identical strings.
        occurrences = counts[psl_lib.remove_augustus_alignment_number(aug_aId)]
        duplicated = occurrences > 1
        if duplicated:
            suffix = "_{}_Copies".format(occurrences - 1)
            details_dict[aug_aId] = seq_lib.transcript_to_bed(aug_t, self.rgb, self.column + suffix)
        classify_dict[aug_aId] = 1 if duplicated else 0
    self.dumpValueDicts(classify_dict, details_dict)
def run(self):
    """
    Classify each Augustus transcript by whether its strand and chromosome
    agree with the matching transcript in self.transcriptDict.

    Transcripts whose stripped ID (Augustus alignment number removed) is not
    in self.transcriptDict are ignored. A disagreement on strand or chromosome
    yields classify value 1 and a BED record of the Augustus transcript;
    agreement yields 0. The two dicts are passed to self.dumpValueDicts.
    """
    self.getAugustusTranscriptDict()
    self.getTranscriptDict()
    classify_dict = {}
    details_dict = defaultdict(list)
    for aug_aId, aug_t in self.augustusTranscriptDict.iteritems():
        stripped_id = psl_lib.remove_augustus_alignment_number(aug_aId)
        if stripped_id not in self.transcriptDict:
            continue
        t = self.transcriptDict[stripped_id]
        mismatch = aug_t.strand != t.strand or aug_t.chromosome != t.chromosome
        classify_dict[aug_aId] = 1 if mismatch else 0
        if mismatch:
            details_dict[aug_aId] = seq_lib.transcript_to_bed(aug_t, self.rgb, self.column)
    self.dumpValueDicts(classify_dict, details_dict)
def run(self, shortIntronSize=30):
    """
    Find Augustus exon intervals that do not intersect the matching
    transcript's exons.

    For each Augustus transcript, the transcript stored in
    self.transcriptDict under the stripped ID (Augustus alignment number
    removed) is fetched; transcripts with no match, or with a match on another
    strand or chromosome, are skipped. The match's exon intervals are merged
    with seq_lib.gap_merge_intervals(gap=shortIntronSize) and every Augustus
    exon interval is tested against the merged set.

    :param shortIntronSize: gap size forwarded to gap_merge_intervals.

    Classify value is 1 when at least one Augustus exon does not intersect
    (one BED record is stored per such exon), else 0.
    """
    self.getAugustusTranscriptDict()
    self.getTranscriptDict()
    classify_dict = {}
    details_dict = defaultdict(list)
    for aug_aId, aug_t in self.augustusTranscriptDict.iteritems():
        stripped_id = psl_lib.remove_augustus_alignment_number(aug_aId)
        if stripped_id not in self.transcriptDict:
            continue
        t = self.transcriptDict[stripped_id]
        if aug_t.strand != t.strand or aug_t.chromosome != t.chromosome:
            continue
        aug_exons = aug_t.exonIntervals
        merged = seq_lib.gap_merge_intervals(t.exonIntervals, gap=shortIntronSize)
        found_unmatched = False
        for exon in aug_exons:
            if not seq_lib.interval_not_intersect_intervals(merged, exon):
                continue
            found_unmatched = True
            details_dict[aug_aId].append(exon.get_bed(self.rgb, "/".join([self.column, aug_aId])))
        classify_dict[aug_aId] = 1 if found_unmatched else 0
    self.dumpValueDicts(classify_dict, details_dict)
def initializeDb(self, dbPath, classifiers, dataType=None):
    """
    Create one table per genome in the database at dbPath and seed it with
    one row per Augustus alignment ID.

    :param dbPath: database path forwarded to the SQL helper methods.
    :param classifiers: iterable of classifier classes; each contributes a
        column named after the class (``__name__``).
    :param dataType: column type used for every column. When None, each
        classifier's own ``dataType()`` is used instead.
    """
    if dataType is None:
        columnDefinitions = [[x.__name__, x.dataType()] for x in classifiers]
    else:
        columnDefinitions = [[x.__name__, dataType] for x in classifiers]
    # find Augustus alignment IDs in each gp file (primary key for database)
    for genome, gp in izip(self.genomes, self.augustusGps):
        # Close the file deterministically instead of leaking one handle per
        # genome, and skip blank lines, which have no 12th field to read.
        with open(gp) as gp_handle:
            aug_aIds = set(x.split()[11] for x in gp_handle if x.strip())
        # Built by iterating the same unmodified set that is passed below, so
        # the two sequences stay pairwise aligned.
        aIds = [psl_lib.remove_augustus_alignment_number(x) for x in aug_aIds]
        self.initializeSqlTable(dbPath, genome, columnDefinitions, self.primaryKeyColumn)
        self.initializeSqlRows(dbPath, genome, aug_aIds, self.primaryKeyColumn)
        self.buildNameRow(dbPath, genome, aug_aIds, aIds, self.primaryKeyColumn)
def is_tie(best_alns):
    """
    Return True if two of the given alignment IDs collapse to the same ID
    once the Augustus alignment number is removed, False otherwise.

    Presumably this detects a transMap alignment tied with an Augustus
    transcript derived from it - verify against the caller.
    """
    stripped_ids = set()
    for aln_id in best_alns:
        stripped = psl_lib.remove_augustus_alignment_number(aln_id)
        if stripped in stripped_ids:
            return True
        stripped_ids.add(stripped)
    return False
def run(self):
    """
    Compare the thickStart/thickStop of each Augustus transcript with those
    of the matching transcript in self.transcriptDict.

    No entry is written when the stripped ID (Augustus alignment number
    removed) is missing from self.transcriptDict, when strand or chromosome
    differ, or when the match has thickStart == thickStop.

    A difference in either coordinate gives classify value 1 and BED details
    from seq_lib.cds_coordinate_to_bed (CDS positions 0-3 and the last three
    positions when the CDS length is above 9, else the full CDS as a single
    record). Equal coordinates give classify value 0.
    """
    self.getAugustusTranscriptDict()
    self.getTranscriptDict()
    classify_dict = {}
    details_dict = {}
    for aug_aId, aug_t in self.augustusTranscriptDict.iteritems():
        stripped_id = psl_lib.remove_augustus_alignment_number(aug_aId)
        if stripped_id not in self.transcriptDict:
            continue
        t = self.transcriptDict[stripped_id]
        if aug_t.strand != t.strand or aug_t.chromosome != t.chromosome or t.thickStart == t.thickStop:
            continue
        boundaries_differ = t.thickStart != aug_t.thickStart or t.thickStop != aug_t.thickStop
        if not boundaries_differ:
            classify_dict[aug_aId] = 0
        else:
            classify_dict[aug_aId] = 1
            size = aug_t.getCdsLength()
            if size <= 9:
                # CDS too short to show both ends separately.
                details_dict[aug_aId] = seq_lib.cds_coordinate_to_bed(aug_t, 0, size, self.rgb, self.column)
            else:
                details_dict[aug_aId] = [
                    seq_lib.cds_coordinate_to_bed(aug_t, 0, 3, self.rgb, self.column),
                    seq_lib.cds_coordinate_to_bed(aug_t, size - 3, size, self.rgb, self.column),
                ]
    self.dumpValueDicts(classify_dict, details_dict)
def initializeDb(self, dbPath, classifiers, dataType=None):
    """
    Set up the per-genome tables of the database at dbPath.

    For every (genome, gp file) pair from self.genomes and self.augustusGps
    this creates a table with one column per classifier, inserts a row for
    each Augustus alignment ID found in the gp file, and builds the name row
    mapping those IDs to their form without the Augustus alignment number.

    :param dbPath: database path forwarded to the SQL helper methods.
    :param classifiers: iterable of classifier classes; the column name is
        the class ``__name__``.
    :param dataType: if given, the type of every column; otherwise each
        classifier's ``dataType()`` is used.
    """
    if dataType is None:
        columnDefinitions = [[x.__name__, x.dataType()] for x in classifiers]
    else:
        columnDefinitions = [[x.__name__, dataType] for x in classifiers]
    # find Augustus alignment IDs in each gp file (primary key for database)
    for genome, gp in izip(self.genomes, self.augustusGps):
        # The original left the file open; the with-block releases the handle
        # as soon as the IDs are read. Blank lines are ignored rather than
        # raising IndexError on the field lookup.
        with open(gp) as gp_handle:
            aug_aIds = set(line.split()[11] for line in gp_handle if line.strip())
        # Same iteration order as aug_aIds, so IDs pair up in buildNameRow.
        aIds = [
            psl_lib.remove_augustus_alignment_number(x) for x in aug_aIds
        ]
        self.initializeSqlTable(dbPath, genome, columnDefinitions, self.primaryKeyColumn)
        self.initializeSqlRows(dbPath, genome, aug_aIds, self.primaryKeyColumn)
        self.buildNameRow(dbPath, genome, aug_aIds, aIds, self.primaryKeyColumn)
def database(genome, db, db_path, tmp_dir, mode):
    """
    Load every pickled column found in <tmp_dir>/<db> and write the columns
    to the database at db_path via sql_lib.write_dict.

    :param genome: passed through to sql_lib.write_dict.
    :param db: name of the sub-directory of tmp_dir holding one pickle file
        per column; the file name is used as the column name.
    :param db_path: output database path; its parent directory is created.
    :param tmp_dir: directory containing the db sub-directory.
    :param mode: "reference" and "transMap" select the index labels
        "TranscriptId" and "AlignmentId"; any other value selects
        "AugustusAlignmentId".
    """
    mkdir_p(os.path.dirname(db_path))
    data_path = os.path.join(tmp_dir, db)
    data_dict = {}
    for col in os.listdir(data_path):
        with open(os.path.join(data_path, col)) as col_handle:
            data_dict[col] = pickle.load(col_handle)
    index_labels = {"reference": "TranscriptId", "transMap": "AlignmentId"}
    is_augustus = mode not in index_labels
    index_label = "AugustusAlignmentId" if is_augustus else index_labels[mode]
    if is_augustus:
        # Hack to add transMap alignment ID column to Augustus databases.
        # NOTE(review): placed in the Augustus-only branch based on the
        # comment above; confirm against the original indentation.
        aug_ids = data_dict.itervalues().next().viewkeys()
        data_dict["AlignmentId"] = dict(
            (aug_id, psl_lib.remove_augustus_alignment_number(aug_id)) for aug_id in aug_ids)
    sql_lib.write_dict(data_dict, db_path, genome, index_label)
def align(target, g, target_fasta, chunk, ref_fasta, out_path):
    """
    Align each Augustus transcript in chunk against its reference sequence
    with blat and write one tab-separated result line per transcript.

    :param target: job object providing getLocalTempDir().
    :param g: not used by this function.
    :param target_fasta: FASTA holding the Augustus transcript sequences,
        indexed by Augustus alignment ID.
    :param chunk: iterable of Augustus alignment IDs to process.
    :param ref_fasta: FASTA holding the reference sequences, indexed by the
        ID with both alignment numbers removed.
    :param out_path: directory receiving a randomly named ".txt" file.

    Each output line is: Augustus ID, identity, coverage; "0" and "0" are
    written when blat produced no usable rows.
    """
    target_seqs = Fasta(target_fasta)
    ref_seqs = Fasta(ref_fasta)
    rows = []
    for aug_aId in chunk:
        gencode_id = remove_alignment_number(remove_augustus_alignment_number(aug_aId))
        gencode_seq = str(ref_seqs[gencode_id])
        aug_seq = str(target_seqs[aug_aId])
        tmp_aug = os.path.join(target.getLocalTempDir(), "tmp_aug")
        tmp_gencode = os.path.join(target.getLocalTempDir(), "tmp_gencode")
        fastaWrite(tmp_aug, aug_aId, aug_seq)
        fastaWrite(tmp_gencode, gencode_id, gencode_seq)
        blat_cmd = "blat {} {} -out=psl -noHead /dev/stdout".format(tmp_gencode, tmp_aug)
        # The last three newline-separated pieces of the output are dropped;
        # presumably trailing non-PSL lines - TODO confirm.
        psl_lines = popenCatch(blat_cmd).split("\n")[:-3]
        if not psl_lines:
            rows.append([aug_aId, "0", "0"])
            continue
        p_list = [PslRow(line) for line in psl_lines]
        rows.append([str(v) for v in [aug_aId, identity(p_list), coverage(p_list)]])
    out_file = os.path.join(out_path, getRandomAlphaNumericString(10) + ".txt")
    with open(out_file, "w") as outf:
        outf.writelines("\t".join(row) + "\n" for row in rows)
def main_augustus_fn(target, comp_ann_path, gencode, genome, base_out_path, filter_chroms):
    """
    Produce the Augustus classifier bar plots and schedule the clustering
    plots for one genome, once for each of the ID tags "I1" and "I2".

    :param target: job object; used for getGlobalTempDir() and for adding the
        r_wrapper child target.
    :param comp_ann_path: path handed to sql_lib.attach_databases.
    :param gencode: reference set name, used in titles and file names.
    :param genome: genome whose classifier table is loaded.
    :param base_out_path: root output directory; results are placed under
        "augustus_classifier_breakdown/<genome>".
    :param filter_chroms: not used by this function.

    Only rows whose ID, with the Augustus alignment number removed, is among
    the first elements of the values returned by sql_lib.highest_cov_aln are
    kept.
    """
    clust_title = "Hierarchical_clustering_of_augustus_classifiers"
    base_barplot_title = ("Augustus classifiers failed by {:,} transcripts derived from transMap\n"
                          "on the reference set {} with Augustus {}")
    out_path = os.path.join(base_out_path, "augustus_classifier_breakdown", genome)
    mkdir_p(out_path)
    con, cur = sql_lib.attach_databases(comp_ann_path, mode="augustus")
    highest_cov_dict = sql_lib.highest_cov_aln(cur, genome)
    highest_cov_ids = set(zip(*highest_cov_dict.itervalues())[0])
    sql_data = sql_lib.load_data(con, genome, etc.config.aug_classifiers,
                                 primary_key="AugustusAlignmentId", table="augustus")
    base_filter_set = set(
        aug_id for aug_id in sql_data.index
        if psl_lib.remove_augustus_alignment_number(aug_id) in highest_cov_ids)
    for mode in ["1", "2"]:
        tag = "I{}".format(mode)
        if mode == "2":
            aug_mode = "trusting RNAseq more"
        else:
            aug_mode = "trusting RNAseq less"
        # Substring test on the ID selects the rows carrying this tag.
        filter_set = set(aug_id for aug_id in base_filter_set if tag in aug_id)
        out_barplot_file = os.path.join(out_path, "augustus_barplot_{}_{}_{}".format(genome, gencode, tag))
        barplot_title = base_barplot_title.format(len(filter_set), gencode, aug_mode)
        munged, stats = munge_data(sql_data, filter_set)
        plot_lib.barplot(stats, out_path, out_barplot_file, barplot_title)
        data_path = os.path.join(target.getGlobalTempDir(), getRandomAlphaNumericString())
        munged.to_csv(data_path)
        out_cluster_file = os.path.join(out_path, "augustus_clustering_{}_{}_{}".format(genome, gencode, tag))
        target.addChildTargetFn(r_wrapper, args=[data_path, clust_title, out_cluster_file])
def align(target, target_fasta, chunk, ref_fasta, file_tree):
    """
    Align a chunk of Augustus transcripts against their reference sequences
    with blat, chain the hits with simpleChain, and write the best coverage
    and identity per transcript to a temp file obtained from file_tree.

    :param target: job object providing getGlobalTempDir().
    :param target_fasta: FASTA of Augustus transcript sequences, indexed by
        Augustus alignment ID.
    :param chunk: iterable of Augustus alignment IDs; iterated twice, so it
        must be re-iterable.
    :param ref_fasta: FASTA of reference sequences, indexed by the ID with
        both alignment numbers removed.
    :param file_tree: object whose getTempFile() names the output file.

    Each output line is comma-separated: target ID, query ID (target ID with
    the Augustus alignment number removed), best coverage, best identity.
    Targets with no chained hit get "0" for both values.

    :raises AssertionError: if none of the IDs in chunk appear as a target
        name in the chained output.
    """
    g_f = Fasta(target_fasta)
    r_f = Fasta(ref_fasta)
    results = []
    tmp_aug = os.path.join(target.getGlobalTempDir(), "tmp_aug")
    tmp_gencode = os.path.join(target.getGlobalTempDir(), "tmp_gencode")
    tmp_psl = os.path.join(target.getGlobalTempDir(), "tmp_psl")
    with open(tmp_aug, "w") as tmp_aug_h, open(tmp_gencode, "w") as tmp_gencode_h:
        for tgt_id in chunk:
            query_id = remove_augustus_alignment_number(tgt_id)
            gencode_id = remove_alignment_number(query_id)
            gencode_seq = str(r_f[gencode_id])
            aug_seq = str(g_f[tgt_id])
            fastaWrite(tmp_aug_h, tgt_id, aug_seq)
            fastaWrite(tmp_gencode_h, gencode_id, gencode_seq)
    # Both FASTA files are closed (and therefore flushed) before blat runs.
    system("blat {} {} -out=psl -noHead {}".format(tmp_aug, tmp_gencode, tmp_psl))
    r = popenCatch("simpleChain -outPsl {} /dev/stdout".format(tmp_psl))
    r = r.split("\n")[:-1]
    r_d = defaultdict(list)
    for p in tokenize_stream(r):
        psl = PslRow(p)
        r_d[psl.t_name].append(psl)
    assert len(r_d.viewkeys() & set(chunk)) > 0, (r_d.viewkeys(), set(chunk))
    for tgt_id in chunk:
        # Recompute the query ID for this target. The original reused the
        # variable left over from the last iteration of the loop above, which
        # labelled every row with the final target's query ID.
        query_id = remove_augustus_alignment_number(tgt_id)
        if tgt_id not in r_d:
            results.append([tgt_id, query_id, "0", "0"])
        else:
            p_list = [[min(x.coverage, x.target_coverage), x.identity] for x in r_d[tgt_id]]
            # Highest coverage wins; the stable sort keeps the last of any ties.
            best_cov, best_ident = sorted(p_list, key=lambda x: x[0])[-1]
            results.append([str(v) for v in [tgt_id, query_id, best_cov, best_ident]])
    with open(file_tree.getTempFile(), "w") as outf:
        for x in results:
            outf.write("".join([",".join(x), "\n"]))
def strip_alignment_numbers(aln_id):
    """
    Convenience function that removes both the Augustus alignment number and
    the transMap alignment number from aln_id, in that order.
    """
    without_augustus_number = remove_augustus_alignment_number(aln_id)
    return remove_alignment_number(without_augustus_number)
# Exploratory script: compare each Augustus transcript's coverage/identity
# with that of the transMap alignment it was derived from, restricted to the
# highest-coverage alignments of one biotype.
seq_dict = seq_lib.get_sequence_dict(target_fasta)
ref_seq_dict = seq_lib.get_sequence_dict(ref_fasta)
con, cur = sql_lib.attach_databases("/hive/groups/recon/projs/gorilla_eichler/pipeline_data/comparative/susie_3_2/comparativeAnnotation/2015-10-12/GencodeBasicV23", mode="augustus")
genome = 'gorilla'
ref_genome = 'human'
biotype = 'protein_coding'
filter_chroms = ["Y", "chrY"]
# Use the variables defined above rather than repeating the literals, so a
# different genome or biotype only needs changing in one place.
stats = merge_stats(cur, genome)
highest_cov_dict = sql_lib.highest_cov_aln(cur, genome)
highest_cov_ids = set(zip(*highest_cov_dict.itervalues())[0])
biotype_ids = sql_lib.get_biotype_aln_ids(cur, genome, biotype)
highest_cov_ids &= biotype_ids
# Entries whose ID, with the Augustus alignment number removed, is one of the
# selected alignment IDs.
best_stats = {x: y for x, y in stats.iteritems()
              if psl_lib.remove_augustus_alignment_number(x) in highest_cov_ids}
# Entries whose own ID is a selected alignment ID ...
best_tm = {x: y for x, y in best_stats.iteritems() if x in highest_cov_ids}
# ... and the remainder. best_stats already guarantees the stripped ID is
# selected, so only the "not itself selected" test is needed here.
best_aug = {x: y for x, y in best_stats.iteritems() if x not in highest_cov_ids}
r = {"higher_cov": [], "higher_ident": [], "higher_both": [], "worse": []}
for aug_id, (aug_cov, aug_ident) in best_aug.iteritems():
    # Raises KeyError if the source alignment has no entry in stats.
    tm_cov, tm_ident = best_tm[psl_lib.remove_augustus_alignment_number(aug_id)]
    if aug_cov > tm_cov and aug_ident > tm_ident:
        r["higher_both"].append(aug_id)
    elif aug_cov > tm_cov:
        r["higher_cov"].append(aug_id)
    elif aug_ident > tm_ident:
        r["higher_ident"].append(aug_id)
    else:
        r["worse"].append(aug_id)