class LibIteratorCls:
  """Iterates over a library list file, invoking a caller-supplied
  processing method on each (non-comment) library entry."""
  def __init__(self, lib_list_path, ProcessLibraryMethod, options, log_info=None):
    # path to the file listing the libraries to process
    self.lib_list_path = lib_list_path
    # callback invoked with a LibraryInfoCls object for each library
    self.ProcessLibrary = ProcessLibraryMethod
    self.options = options
    self.CheckOptions()
    #if (not hasattr(options, "no_path_check")):
    #  self.options.no_path_check = False
    # end if
    self.log_info = log_info
    # number of libraries processed by IterateOverAllLibs
    self.num_libs = 0
    # when True, the list file holds event paths rather than library dirs
    self.list_of_paths = False
  # end def
  def __del__(self):
    # close the library list file if it was opened and is still open
    if (hasattr(self, "lib_list_file") and None != self.lib_list_file and
        not self.lib_list_file.closed):
      self.lib_list_file.close()
    # end if
  # end def
  def CheckOptions(self): #{
    """Supply default values for any options this class relies on
    that the caller did not provide."""
    required_opts = ["no_path_check", "only_pass"]
    for opt in required_opts: #{
      if (not hasattr(self.options, opt)): #{
        setattr(self.options, opt, False)
      #} end if
    #} end for
    if (not hasattr(self.options, "get_paths")):
      self.options.get_paths = True
    #} end if
  #} end def
  def IterateOverAllLibs(self):
    """Process every non-comment line of the library list file with
    the ProcessLibrary callback, counting libraries in self.num_libs.

    Raises LibIteratorError when event paths are required but none
    could be found for a library directory.
    """
    self.num_libs = 0
    self.lib_list_file = FileBoxCls(self.lib_list_path, "r",
      "could not open library list file")
    for lib_line in self.lib_list_file:
      # skip comment lines
      if (lib_line.startswith("#")):
        continue
      # end if
      lib_info = LibraryInfoCls(self.options, self.log_info,
        self.list_of_paths)
      lib_info.GetLibDir(lib_line)
      DebugMsg(self, "Lib Dir: %s" % lib_info.lib_dir)
      if (self.options.get_paths and not self.list_of_paths):
        lib_info.GetEventPaths()
      # end if
      if (1 > len(lib_info.event_paths) and
          (self.options.get_paths or self.list_of_paths)):
        raise LibIteratorError("could not get event path(s) from directory: "
          "%s" % lib_info.lib_dir)
      # end if
      self.ProcessLibrary(lib_info)
      self.num_libs += 1
    # end for
    self.lib_list_file.close()
class FastaFileCls: #{
  """Iterator over the records of a fasta file, yielding one
  SequenceCls object per record (Python 2 iterator protocol)."""
  def __init__(self, path, fail_msg="cannot open fasta file",
      log_info=None, line_delim="", maintain_case=False): #{
    self.file = FileBoxCls(path, "r", fail_msg)
    # string inserted between concatenated sequence lines
    self.line_delim = line_delim
    # look-ahead line (None means "need to read the next line")
    self.curr_line = None
    self.log_info = log_info
    # set once the underlying file is exhausted
    self.finished = False
    # if False, sequences are upper-cased before being returned
    self.maintain_case = maintain_case
  #} end def
  def __del__(self): #{
    self.file.Close()
  #} end def
  def __iter__(self): #{
    return self
  #} end def
  def next(self): #{
    """Return the next record as a SequenceCls, or raise
    StopIteration once the file is exhausted.

    Raises FastaError if the record does not start with a ">" id line.
    """
    if (self.finished): #{
      raise StopIteration
    #} end if
    new_seq = None
    try: #{
      if (None == self.curr_line): #{
        self.curr_line = self.file.next()
      #} end if
      if (not self.curr_line.startswith(">")): #{
        raise FastaError("improperly formatted fasta file: sequence id line "
          "must begin with \">\": \"%s\"." % self.curr_line)
      #} end if
      # split the id line into the id proper and any extra description
      if (" " in self.curr_line): #{
        (seq_id, seq_extra) = self.curr_line.lstrip(">").split(" ", 1)
      else:
        seq_id = self.curr_line.lstrip(">")
        seq_extra = None
      #} end if
      new_seq = SequenceCls(seq_id, seq_extra)
      self.curr_line = self.file.next()
      # accumulate sequence lines until the next id line (or EOF)
      while (not self.curr_line.startswith(">")): #{
        if ("" != new_seq.sequence): #{
          new_seq.sequence += self.line_delim
        #} end if
        new_seq.sequence += self.curr_line
        try: #{
          self.curr_line = self.file.next()
        except StopIteration:
          # EOF while reading sequence lines: finish this record now,
          # and stop on the following call to next()
          self.finished = True
          break
        #} end try
      #} end while
      if (not self.maintain_case): #{
        new_seq.sequence = new_seq.sequence.upper()
      #} end if
      return new_seq
    except StopIteration, e:
      # EOF before a complete record could be read
      self.finished = True
      raise e
def PrintErrors(self): # {
  """Log every line of the samtools error file; return True if the
  file contained any lines (i.e. errors occurred)."""
  err_file = FileBoxCls(self.err_file_path, "r",
    "could not open samtools error file")
  found_errors = False
  for err_line in err_file: # {
    LogMsg(self, err_line)
    found_errors = True
  # } end for
  err_file.close()
  return found_errors
def Setup(self): #{
  """Open the groups input file and the support output file, then
  decide whether chromosome IDs should carry the "chr" prefix."""
  self.groups_file = FileBoxCls(self.options.barnacle_path, "r",
    "cannot open groups file")
  out_path = self.options.barnacle_path.replace(".data", ".out")
  self.output_file = FileBoxCls(out_path, "w",
    "cannot create pair-to-genome support output file")
  # use a samtools wrapper to check the chromosome-ID naming convention
  samtools = SAMToolsCls(self.options.p2g_path, self.options,
    log_info=self.log_info)
  self.options.use_chr = samtools.ShouldChromUseChr()
def CreateQueryFile(self): #{
  """Copy the sequences of contigs involved in potential predictions
  from the full contig-sequences file into the realignment query file.

  Records each written contig's sequence (lower-cased) on the
  corresponding entry in self.contigs, and removes it from
  self.missing. Raises RealignerError on a malformed id line or if
  no relevant sequences are found at all.
  """
  LogMsg(self, "Creating query file...")
  query_file = FileBoxCls(self.query_path, "w", "cannot create query "
    "contig sequences file")
  all_contigs_file = FileBoxCls(self.options.ctg_seq_path, "r",
    "cannot read contig sequences file")
  seqs_found = False
  num_written = 0
  for id_line in all_contigs_file: #{
    # fasta format: the sequence line follows its id line
    seq_line = all_contigs_file.next()
    if (not id_line.startswith(">")): #{
      # fixed typo: "sequece" -> "sequence"
      raise RealignerError("invalid contig id line in sequence file:\n%s" %
        id_line)
    #} end if
    # extract the contig id from the line
    ctg_id = id_line.lstrip(">").split()[0]
    # if the contig is represented in one of the potential predictions
    if (ctg_id in self.contigs): #{
      # write it to the query file
      query_file.WriteLine(id_line)
      query_file.WriteLine(seq_line)
      seqs_found = True
      num_written += 1
      self.contigs[ctg_id].written = True
      self.contigs[ctg_id].sequence = seq_line.lower()
      self.missing.discard(ctg_id)
    #} end if
  #} end for
  if (not seqs_found): #{
    raise RealignerError("could not find any contig sequences in %s" %
      self.options.ctg_seq_path)
  #} end if
  if (num_written != len(self.contigs)): #{
    LogMsg(self, "WARNING: only wrote %i of %i contig sequences! " %
      (num_written, len(self.contigs)) +
      "Missing: %s" % ",".join(sorted(self.missing)))
  #} end if
  all_contigs_file.Close()
  query_file.Close()
def PrintFilters(self): #{
  """Write the description and current value of every filter, in
  sorted name order, to the "filters" output file."""
  filter_file = FileBoxCls(self.OutputPath("filters"), "w",
    "could not open filter file")
  try: #{
    for name in sorted(self.filters.keys()): #{
      curr_filter = self.filters[name]
      filter_file.WriteLine("%s: %s" %
        (curr_filter.description, curr_filter.ValueString()))
    #} end for
  finally:
    # always close the file, even if a filter fails to format
    filter_file.close()
def __init__(self, data_file_path, keep_lines=False, check_data=False): #{
  """Validate the candidate group data-file path and set up the
  parsing state for reading groups from it."""
  CheckFilePath(data_file_path, "candidate group file")
  self.data_file = FileBoxCls(data_file_path, "r", "cannot open data file")
  self.group_parser = GroupParserCls(keep_lines=keep_lines)
  self.check_data = check_data
  # groups parsed so far
  self.groups = list()
def CheckStatus(self): #{
  """Read this job's output file to classify the job as "complete",
  "failed", or "in progress" (stored in self.status)."""
  output_file = FileBoxCls(self.output_path, "r",
    "cannot open output file for job number %s" % self.num)
  self.status = "in progress"
  for output_line in output_file: #{
    if (R2C_SUCCESS == output_line): #{
      self.status = "complete"
      break
    elif (output_line in ("", R2C_FAIL)): #{
      self.status = "failed"
      break
    #} end if
  #} end for
  output_file.close()
  return
def IterateOverAllLibs(self):
  """Process every non-comment line of the library list file with
  the ProcessLibrary callback, counting libraries in self.num_libs.

  Raises LibIteratorError when event paths are required but none
  could be found for a library directory.
  """
  self.num_libs = 0
  self.lib_list_file = FileBoxCls(self.lib_list_path, "r",
    "could not open library list file")
  for lib_line in self.lib_list_file:
    # skip comment lines
    if (lib_line.startswith("#")):
      continue
    # end if
    lib_info = LibraryInfoCls(self.options, self.log_info,
      self.list_of_paths)
    lib_info.GetLibDir(lib_line)
    DebugMsg(self, "Lib Dir: %s" % lib_info.lib_dir)
    if (self.options.get_paths and not self.list_of_paths):
      lib_info.GetEventPaths()
    # end if
    if (1 > len(lib_info.event_paths) and
        (self.options.get_paths or self.list_of_paths)):
      raise LibIteratorError("could not get event path(s) from directory: "
        "%s" % lib_info.lib_dir)
    # end if
    self.ProcessLibrary(lib_info)
    self.num_libs += 1
  # end for
  self.lib_list_file.close()
def CreateAlignCoordsFile(self, aligns): #{
  """Write genomic block coordinates for each alignment to a fresh
  alignment coordinates file.

  aligns: iterable of alignment objects; each alignment's position in
    the iterable serves as its identifier in the coordinates file.
  """
  DebugMsg(self, "Creating new alignment coordinates file...")
  # open the alignment coordinates file
  align_coords_file = FileBoxCls(self.paths['align_coords'], "w",
    "Cannot open alignment coordinates file")
  # iterate through the alignments
  # (renamed loop variable: "id" shadowed the builtin)
  for align_id, align in enumerate(aligns): #{
    align = FixAlign(align)
    # REMINDER: use alignment blocks instead!
    WriteBlockCoords(align, align_id, align_coords_file, use_chr=True)
  #} end for
  align_coords_file.close()
def CheckStatus(self): #{
  """Read this job's output file to classify the job as "complete",
  "failed", or "in progress" (stored in self.status)."""
  ExtremeDebugMsg(self, "Checking job status: %s" % self.output_path)
  output_file = FileBoxCls(self.output_path, "r",
    "cannot open output file for job number %s" % self.num)
  self.status = "in progress"
  for output_line in output_file: #{
    ExtremeDebugMsg(self, " %s" % output_line)
    if (CID_SUCCESS == output_line): #{
      self.status = "complete"
      break
    elif (output_line in ("", CID_FAIL)): #{
      self.status = "failed"
      break
    #} end if
  #} end for
  output_file.close()
  return
def __init__(self, path, fail_msg="cannot open fasta file", log_info=None,
    line_delim="", maintain_case=False): #{
  """Open the fasta file at path and initialize iteration state."""
  self.file = FileBoxCls(path, "r", fail_msg)
  self.log_info = log_info
  # delimiter inserted between joined sequence lines
  self.line_delim = line_delim
  # preserve sequence case when True; otherwise sequences are upper-cased
  self.maintain_case = maintain_case
  # iteration state: look-ahead line and end-of-file flag
  self.curr_line = None
  self.finished = False
def IntegrateP2GFile(self, p2g_path): #{
  """Merge the pair-to-genome support results in p2g_path into the
  candidate groups read from the groups file, writing each updated
  group to the output file(s).

  Both files are assumed to be ordered by ascending group id; groups
  missing from the groups file are skipped, and a P2GIntegratorError
  is raised on premature end-of-file or mismatched group ids.
  """
  DebugMsg(self, "Integrating pair-to-genome file: %s" % p2g_path)
  group = None
  fail_msg = "cannot open pair-to-genome results file"
  p2g_file = FileBoxCls(p2g_path, "r", fail_msg)
  for p2g_line in p2g_file: #{
    DebugMsg(self, "LINE: %s" % p2g_line)
    # count the group
    self.num_groups += 1
    # parse the pair-to-genome line
    p2g_support = P2GGroupCls(self.options, self.log_info)
    p2g_support.ParseSupportString(p2g_line)
    # check that the group had some reads at least
    if (1 > p2g_support.num_reads): #{
      self.groups_without_reads.append("%i" % p2g_support.group_id)
    #} end if
    # get a group from the groups file
    if (None == group or p2g_support.group_id > group.id):
      try: #{
        DebugMsg(self, "Getting next group...")
        group = self.group_parser.GetNextGroup()
      except StopIteration:
        raise P2GIntegratorError \
          ("Unexpected end of groups file: %s\n while integrating: %s" %
           (self.group_parser.data_file_path, p2g_path))
      #} end try
    # allow for groups having been removed from the groups file
    if (p2g_support.group_id < group.id): #{
      continue
    #} end if
    # ensure that the group ids match up
    if (p2g_support.group_id != group.id): #{
      raise P2GIntegratorError("Inconsistent group ids: %i from %s, " %
        (p2g_support.group_id, p2g_path) +
        "%i from %s" % (group.id, self.options.barnacle_path))
    #} end if
    # add the pair-to-genome support to the group
    self.AddSupportToGroup(group, p2g_support)
    # apply any pair-to-genome filters given
    #self.ApplyFilters(group)
    # write the group to the new output file(s)
    self.WriteGroup(group)
  #} end for
  p2g_file.close()
def WriteCounts(self): #{
  """Record the numbers of split (and, when checked, gapped)
  candidate alignments in the counts file, ending with "COMPLETE"."""
  counts_file = FileBoxCls(self.paths['counts'], "w",
    "Cannot open counts file")
  # the number of split alignments found
  counts_file.WriteLine("Split: %i" % len(self.candidate_contigs))
  if (self.options.check_gap): #{
    # the number of gapped alignments found (possibly only a lower bound)
    qualifier = ""
    if (self.more_than_99): #{
      qualifier = "at least "
    #} end if
    counts_file.WriteLine("Gapped: %s%i" %
      (qualifier, self.num_gapped_aligns))
  #} end if
  # sentinel line marking a complete counts file
  counts_file.WriteLine("COMPLETE")
  counts_file.close()
class R2CResultsFileCls: #{
  """Sequential reader for a read-to-contig support results file,
  tracking the member whose support line was most recently read."""
  def __init__(self, path, log_info=None): #{
    self.log_info = log_info
    fail_msg = "cannot open read-to-contig support results file"
    self.file = FileBoxCls(path, "r", fail_msg)
    # True once every line of the file has been consumed
    self.integrated = False
    # the most recently parsed member (None before the first read)
    self.curr_member = None
  #} end def
  def __del__(self): #{
    self.file.Close()
  #} end def
  def BeforeGroup(self, group_id): #{
    """Return True if the current file position is before group_id."""
    if (self.integrated): #{
      return False
    #} end if
    if (None == self.curr_member or self.curr_member.group_id < group_id): #{
      return True
    #} end if
    return False
  #} end def
  def GroupIsCurrent(self, group_id): #{
    """Return True if the current member belongs to group_id."""
    if (self.integrated or None == self.curr_member): #{
      return False
    #} end if
    if (self.curr_member.group_id == group_id): #{
      return True
    #} end if
    return False
  #} end def
  def GetMember(self): #{
    """Read the next member's support line into self.curr_member;
    at end-of-file, mark the file fully integrated and close it."""
    if (self.integrated): #{
      DebugMsg(self, "Not getting member, file already fully integrated.")
      return
    #} end if
    DebugMsg(self, "Getting member...")
    try: #{
      member_line = self.file.next()
      # create a new member from the current line, store it as "curr_member"
      (member_id, support_list) = member_line.split(" ")
      self.curr_member = R2CMemberCls(member_id, log_info=self.log_info)
      # store support values
      self.curr_member.InitializeSupport(support_list)
      DebugMsg(self, "New member: %s" % self.curr_member.DebugString())
    except StopIteration:
      DebugMsg(self, "Integrated all support from %s" % self.file.path)
      self.curr_member = None
      self.integrated = True
      self.file.Close()
    return
def GenerateEventReads(self):  # {
    """Simulate paired-end reads from each event sequence, drawing a
    per-event coverage value from the coverages file, and record the
    number of pairs simulated for each event.

    Sequences shorter than the fragment length are skipped. Coverage
    values are consumed until one yields at least one read pair.
    """
    LogMsg(self, "Generating event reads...")
    start = time.time()
    seq_file = FastaFileCls(self.options.eseq_path, "cannot read sequences file")
    npairs_file = FileBoxCls(
        self.options.enreads_path, "w", "cannot create event read counts file"
    )
    cov_file = FileBoxCls(
        self.options.ecov_path, "r", "cannot read event coverages file"
    )
    # number of sequences from which reads were actually simulated
    nseqs_sim = 0
    for seq_obj in seq_file:  # {
        if len(seq_obj) <= self.options.frag_length:  # {
            LogMsg(
                self,
                "Sequence %s shorter than fragment length: "
                "%i < %i" % (seq_obj.id, len(seq_obj), self.options.frag_length),
            )
            continue
        # } end if
        nseqs_sim += 1
        seq_obj.covered = False
        # keep drawing coverages until reads are simulated for this sequence
        # (SimulateReads is presumed to set seq_obj.covered — TODO confirm)
        while not seq_obj.covered:  # {
            cov_line = cov_file.next()
            # coverage = float(cov_line) + self.options.cov_adjust
            coverage = float(cov_line)
            nreads = coverage * (float(len(seq_obj)) / float(self.options.read_length))
            npairs = IntFloor(float(nreads) / 2.0)
            if 1 > npairs:  # {
                ExtremeDebugMsg(self, " coverage %.3f too low, no reads." % coverage)
                continue
            # } end if
            # coverage = nreads * self.options.read_length / len(seq_obj)
            self.SimulateReads(seq_obj, npairs, "e")
        # } end while
        # record the pair count and coverage from the final loop iteration
        npairs_file.WriteLine("%s %i %f" % (seq_obj.id, npairs, coverage))
    # } end for
    cov_file.Close()
    npairs_file.Close()
    seq_file.Close()
    LogMsg(self, "Simulated reads from %i event sequences" % nseqs_sim)
    LogMsg(self, "Time spent generating event reads: %s" % TimeSpent(start))
class GTFAnnotationParserCls: #{
  """Iterator over a GTF gene-annotations file, grouping consecutive
  features that share a name into GTFTranscriptCls objects."""
  def __init__(self, input_path, log_info=None): #{
    self.file = FileBoxCls(input_path, "r",
      "cannot read gene annotations input file")
    # look-ahead feature (None means "need to parse the next one")
    self.curr_feature = None
    self.log_info = log_info
    self.finished = False
  #} end def
  def __del__(self): #{
    # NOTE(review): close() is not defined in this view — presumably
    # provided elsewhere in the class or inherited; confirm
    self.close()
  #} end def
  def __iter__(self): #{
    return self
  #} end def
  def next(self): #{
    """Return the next transcript in the file, or raise StopIteration
    once the file is exhausted."""
    if (self.finished): #{
      raise StopIteration
    #} end if
    transcript = None
    try: #{
      if (None == self.curr_feature): #{
        self.ParseFeature()
      #} end if
      transcript = GTFTranscriptCls(name=self.curr_feature.name)
      # accumulate features while they share the transcript's name
      while (self.curr_feature.name == transcript.name): #{
        transcript.Update(self.curr_feature)
        self.ParseFeature()
      #} end while
    except StopIteration:
      # EOF: fall through to return the partially built transcript
      self.finished = True
    #} end try
    if (None == transcript): #{
      raise StopIteration
    #} end if
    transcript.CreateExonList()
    return transcript
  #} end def
  def ParseFeature(self): #{
    """Parse the next feature line into self.curr_feature; on EOF,
    clear curr_feature and re-raise StopIteration."""
    #ExtremeDebugMsg(self, " Parsing feature from file...")
    try: #[
      line = self.file.next()
    except StopIteration, e:
      self.curr_feature = None
      raise e
    #} end try
    tokenizer = TokenizerCls(line, delimiter="\t", log_info=self.log_info)
    self.curr_feature = GTFFeatureCls(tokenizer)
def CreateOverlapsFile(self): #{
  """Run the external overlap code unless an existing overlaps file
  should be reused, then open the overlaps file for reading."""
  if (not self.options.use_existing_overlaps): #{
    LogMsg(self, "Running overlap code...")
    # flush the log so output ordering is preserved around the external run
    if (hasattr(self, "log_file") and None != self.log_file): #{
      self.log_file.Flush()
    #} end if
    start = time.time()
    RunOverlapCode(self.options.breakpoint_exons,
      self.options.group_coords_path, self.options.overlaps_path,
      dpt=self.options.dpt)
    LogMsg(self, "Time spent running overlap code: %s" % TimeSpent(start))
  else:
    LogMsg(self, "Using existing breakpoint/transcript overlaps file.")
  #} end if
  self.overlaps_file = FileBoxCls(self.options.overlaps_path, "r",
    "cannot read exon/group overlaps file")
def __init__(self, path, type=None, log_info=None): #{
  """Choose the parsing strategy for the annotations file at path:
  a dedicated parser class when one is registered for the annotation
  type, otherwise a per-line parsing function.

  Raises GeneAnnotationError when neither strategy is available.
  """
  self.log_info = log_info
  self.finished = False
  anno_type = type
  if (None == anno_type): #{
    anno_type = GetAnnotationsType(path)
  #} end if
  # exactly one of parser / (file + ParseLine) will be set
  self.parser = None
  self.file = None
  self.ParseLine = None
  if (anno_type in PARSERS): #{
    self.parser = PARSERS[anno_type](path, log_info=log_info)
  elif (anno_type in PARSE_FUNCTIONS): #{
    self.file = FileBoxCls(path, "r",
      "cannot open %s annotations file" % anno_type)
    self.ParseLine = PARSE_FUNCTIONS[anno_type]
  else:
    raise GeneAnnotationError("cannot determine correct annotation parser "
      "from annotations type: %s" % anno_type)
  #} end if
class TopHatFileCls: # {
  """Iterator over a TopHat-Fusion results file, reading the six
  lines that describe each event into a TopHatEventCls."""
  def __init__(self, path, log_info=None): # {
    self.file = FileBoxCls(path, "r", "cannot read TopHat-Fusion results file")
    self.log_info = log_info
  # } end def
  def __del__(self): # {
    self.close()
  # } end def
  def __iter__(self): # {
    return self
  # } end def
  def next(self): # {
    """Return the next event, built from six consecutive file lines."""
    # the first line should start with "allAtOnce" and
    # it contains the breakpoint coordinates
    # parse the tophat line
    tophat_event = TopHatEventCls(self.file.next())
    # the next two lines should be "sequence" lines
    tophat_event.CheckSeqLine(self.file.next())
    tophat_event.CheckSeqLine(self.file.next())
    # the next lines should be... scores?
    tophat_event.CheckScoreLine(self.file.next())
    # the next line should have the gene ids
    tophat_event.ParseGenesLine(self.file.next())
    # skip the final line
    self.file.next()
    return tophat_event
  # } end def
  def close(self): # {
    # close the underlying file if it was opened and is still open
    if hasattr(self, "file") and None != self.file and not self.file.closed: # {
      self.file.close()
def IdentifyCandidateContigs(self, aligns): #{
  """Walk the alignment list contig-by-contig, selecting each
  contig's best alignments and examining them pairwise for split
  (and, optionally, gapped) candidate events.

  aligns: alignment list in which all alignments for a given contig
    are adjacent (the index is advanced by each contig's
    num_aligns_to_contig).
  """
  # TEMP
  # ExtremeDebugMsg(self, AlignListString(aligns)
  # open the contig sequences file if using the gap filter
  if (self.options.check_gap): #{
    fail_msg = "Cannot open contig sequence file"
    ctg_seq_file = FileBoxCls(self.paths['ctg_seq'], "r", fail_msg)
  else:
    ctg_seq_file = None
  #} end if
  # iterate over the alignments, grouping them by query (i.e. contig)
  contig_align_index = 0
  while (contig_align_index < len(aligns)): #{
    self.num_contigs += 1
    contig = ContigWithAlignmentsCls(contig_align_index, aligns,
      ctg_seq_file, self.paths['gap_out'], self.options, self.log_info)
    ExtremeDebugMsg(self, "-"*80)
    #DebugMsg(self, "Grouping alignments for "
    #  "%s (contig #%i)..." % (contig.id, self.num_contigs))
    DebugMsg(self, "%i) %s" % (self.num_contigs, contig.id))
    ExtremeDebugMsg(self, " Contig length: %i" % contig.length)
    #LogMsg(self, "Contig align index: %i" % contig_align_index)
    # Select the alignments to consider for the current contig
    # and check for gapped alignments at the same time
    contig.SelectAlignments(aligns)
    if (contig.single_align_found): #{
      self.num_full_aligns += 1
    #} end if
    if (self.options.check_gap and not contig.perfect_align_found): #{
      contig.CheckGappedAlignments()
      self.gapped_psl_lines.extend(contig.gapped_psl_lines)
      self.num_gapped_aligns += contig.num_gapped_aligns
    #} end if
    if (self.params['check_split'] and
        not self.params['use_quick_chooser']):
      # pare down the alignment groups so that
      # only the best alignments remain
      contig.PareAlignmentGroups()
    #} end if
    #LogMsg(self, "# Gaps Found (Finder): %i" %
    #  self.num_gapped_aligns)
    # advance to the first alignment of the next contig
    contig_align_index += contig.num_aligns_to_contig
    if (0 < len(contig.best_aligns)): #{
      if (self.log_info['debug']): #{
        LogMsg(self, "%i best aligns: %s" %
          (len(contig.best_aligns), contig.id))
        ExtremeDebugMsg(self, AlignListString(contig.best_aligns))
      #} end if
    elif (0 < len(contig.align_groups)): #{
      if (self.log_info['debug']): #{
        ExtremeDebugMsg(self, "-"*40)
        LogMsg(self, "%i align groups: %s" %
          (len(contig.align_groups), contig.id))
        for i, group in enumerate(contig.align_groups): #{
          ExtremeDebugMsg(self, "\n".join(["Group %i" % i,
            " %i) S:%i E:%i Aligns:%i" % (i, group.ctg_start,
            group.ctg_end, len(group.best_aligns)),
            AlignListString(group.best_aligns)]))
        #} end for
      #} end if
    else:
      # no best aligns or align groups found
      if (not contig.perfect_align_found and
          not contig.single_align_found): #{
        DebugMsg(self, "No partial aligns selected: %s" % contig.id)
      #} end if
      continue
    #} end if
    # examine pairs of the chosen alignments
    if (self.params['use_quick_chooser']): #{
      self.ExamineBestAlignsPairwise(contig)
    else:
      self.ExamineAlignGroupsPairwise(contig)
    #} end if
  #} end while
  DebugMsg(self, "-"*80)
  # close the contig sequences file if using the gap filter
  if (self.options.check_gap): #{
    ctg_seq_file.close()
def Output(self, append): #{
  """Write the split-alignment details of every candidate contig to
  the split output file, and optionally (for blat alignments) the
  corresponding psl alignment lines to the psl output file.

  append: when True, output files are opened in append mode.
  NOTE(review): assumes self.candidate_contigs is non-empty when
  params['output_psl'] is set — confirm with callers.
  """
  # open the output file in the appropriate mode
  if append: #{
    mode = "a"
  else:
    mode = "w"
  #} end if
  fail_msg = "Cannot open split alignment output file"
  out = FileBoxCls(self.paths['split_out'], mode, fail_msg)
  if (self.params['output_psl']): #{
    if (self.candidate_contigs[0].align1.method == "blat"): #{
      fail_msg = "Cannot open alignment psl output file"
      psl_out = FileBoxCls(self.paths['psl_out'], mode, fail_msg)
      DebugMsg(self, "Writing alignment lines to %s" % self.paths['psl_out'])
      # write out the alignment lines for the gapped alignment events found
      for psl_line in self.gapped_psl_lines: #{
        psl_out.Write(psl_line)
      #} end for
    else:
      # only write out psl lines for blat alignments
      self.params['output_psl'] = False
    #} end if
  #} end if
  # write the split alignment details to the output file
  for candidate_contig in self.candidate_contigs: #{
    # skip non-standard chromosomes
    #chr_patt = r"\A(chr)?(\d+|[XY]|MT?)\Z"
    #if (None == re.search(chr_patt, candidate_contig.align1.target) or
    #    None == re.search(chr_patt, candidate_contig.align2.target)):
    #if (NonStandardChr(candidate_contig.align1.target) or
    #    NonStandardChr(candidate_contig.align2.target)): #{
    #  DebugMsg(self, "Skipping non-standard chromosome: %s/%s" %
    #    (candidate_contig.align1.target, candidate_contig.align2.target))
    #  continue
    #} end if
    #if ("chr" != candidate_contig.align1.target[0:3]): #{
    #  candidate_contig.align1.target = ("chr%s" %
    #    candidate_contig.align1.target)
    #LogMsg(self, " Target: %s" %
    #  candidate_contig.align1.target)
    #msg = ("Improperly formatted alignment: %s" %
    #  candidate_contig.Details())
    #raise CandidateIdentifierError(msg)
    #} end if
    #if ("chr" != candidate_contig.align2.target[0:3]): #{
    #  candidate_contig.align2.target = ("chr%s" %
    #    candidate_contig.align2.target)
    #} end if
    # normalize both targets to use the "chr" prefix
    candidate_contig.align1.target = AddChr(candidate_contig.align1.target)
    candidate_contig.align2.target = AddChr(candidate_contig.align2.target)
    ExtremeDebugMsg(self, "Writing line to %s:\n %s"
      % (out.path, candidate_contig.Details()))
    out.WriteLine(candidate_contig.Details())
    if (self.params['output_psl']): #{
      psl_out.Write(candidate_contig.align1.psl())
      psl_out.Write(candidate_contig.align2.psl())
    #} end if
  #} end for
  out.close()
  if (self.params['output_psl']): #{
    psl_out.close()
class GeneAnnotationParserCls: #{
  """Iterator over a gene annotations file that dispatches to either
  a dedicated parser class or a per-line parsing function, depending
  on the annotations type."""
  def __init__(self, path, type=None, log_info=None): #{
    if (None == type): #{
      type = GetAnnotationsType(path)
    #} end if
    # exactly one strategy is set up: parser, or file + ParseLine
    if (type in PARSERS): #{
      self.parser = PARSERS[type](path, log_info=log_info)
      self.file = None
      self.ParseLine = None
    elif (type in PARSE_FUNCTIONS): #{
      self.parser = None
      self.file = FileBoxCls(path, "r",
        "cannot open %s annotations file" % type)
      self.ParseLine = PARSE_FUNCTIONS[type]
    else:
      raise GeneAnnotationError("cannot determine correct annotation parser "
        "from annotations type: %s" % type)
    #} end if
    self.log_info = log_info
    self.finished = False
  #} end def
  def __del__(self): #{
    self.close()
  #} end def
  def __iter__(self): #{
    #if (None == self.parser): #{
    #  return self
    #else:
    #  return self.parser
    #} end if
    return self
  #} end def
  def next(self): #{
    """Return the next transcript, with gene_name and transcript_id
    normalized (spaces replaced by underscores); raise StopIteration
    once the input is exhausted."""
    if (self.finished): #{
      raise StopIteration
    #} end if
    #ExtremeDebugMsg(self, "Parsing annotation from file...")
    transcript = None
    try:
      if (None == self.parser): #{
        #ExtremeDebugMsg(self, "Using ParseLine function...")
        line = self.file.next()
        transcript = self.ParseLine(line)
      else:
        #ExtremeDebugMsg(self, "Using internal parser...")
        transcript = self.parser.next()
      #} end if
    except StopIteration:
      self.finished = True
    #} end try
    if (None == transcript): #{
      raise StopIteration
    #} end if
    transcript.gene_name = transcript.alias.replace(" ","_")
    transcript.transcript_id = transcript.name.replace(" ","_")
    #ExtremeDebugMsg(self, "Parsing transcript: %s (%s)" %
    #  (transcript.gene_name, transcript.transcript_id))
    return transcript
  #} end def
  def Close(self): #{
    """Close whichever of the file or internal parser is in use."""
    for attr in ["file", "parser"]: #{
      if (hasattr(self, attr) and None != getattr(self, attr)): #{
        getattr(self, attr).close()
      #} end if
    #} end for
  #} end def
  def close(self): #{
    # lowercase alias for Close()
    self.Close()
class P2GCalculatorCls: #{
  """Computes pair-to-genome support for each candidate group in a
  groups file and writes the support values to an output file."""
  def __init__(self, options): #{
    SetupMainClass(self, options)
    CheckConfigCommands(self, "samtools")
    self.groups_file = None
    self.output_file = None
    # whether chromosome IDs use the "chr" prefix (decided in Setup)
    self.options.use_chr = False
  #} end def
  def __del__(self): #{
    # close input and output files, if they are not already closed
    self.CloseFiles()
    CloseLogFile(self)
  #} end def
  def CalculateSupport(self): #{
    """Process every group in the groups file, writing its
    pair-to-genome support, then clean up temporary samtools files."""
    start = time.time()
    LogMsg(self, "Adding pair-to-genome support to groups...")
    # open the input and output files
    self.Setup()
    #ExtremeDebugMsg(self, "Should I use chr? %s" % self.options.use_chr)
    # for each group in the input file
    for group_line in self.groups_file: #{
      group_start = time.time()
      # create a group object from the line
      group = P2GGroupCls(self.options, self.log_info)
      group.ParseGroupLine(group_line)
      LogMsg(self, "Group: %i" % group.group_id)
      ExtremeDebugMsg(self, " %s" % group.ToString())
      # get the pair-to-genome support for the current group
      group.GetPairToGenomeSupport()
      # write the pair-to-genome support for the current group
      self.WritePairToGenomeSupport(group.SupportString())
      ExtremeDebugMsg(self, "Time spent on group: %s" %
        TimeSpent(group_start))
    #} end for
    # close the input and output files
    self.CloseFiles()
    # remove the temporary samtools output files
    for end in ["", "_1", "_2"]: #{
      temp_sam_path = os.path.join(self.options.output_dir,
        "sam_out_tmp%s" % end)
      if (os.path.isfile(temp_sam_path)): #{
        os.remove(temp_sam_path)
      #} end if
      # also remove the matching samtools error file
      temp_sam_path += ".err"
      if (os.path.isfile(temp_sam_path)): #{
        os.remove(temp_sam_path)
      #} end if
    #} end for
    LogMsg(self, "Total time adding pair-to-genome support: %s" %
      TimeSpent(start))
  #} end def
  def Setup(self): #{
    """Open the groups input file and the support output file, and
    determine the chromosome-ID naming convention."""
    fail_msg = "cannot open groups file"
    self.groups_file = FileBoxCls(self.options.barnacle_path, "r", fail_msg)
    output_file_path = self.options.barnacle_path.replace(".data", ".out")
    fail_msg = "cannot create pair-to-genome support output file"
    self.output_file = FileBoxCls(output_file_path, "w", fail_msg)
    # create samtools object and check whether to use "chr" in chromosome IDs
    samtools = SAMToolsCls(self.options.p2g_path, self.options,
      log_info=self.log_info)
    self.options.use_chr = samtools.ShouldChromUseChr()
  #} end def
  def WritePairToGenomeSupport(self, support_string): #{
    self.output_file.WriteLine("%s" % support_string)
  #} end def
  def CloseFiles(self): #{
    """Close the groups and output files if they are open."""
    if (None != self.groups_file and not self.groups_file.closed): #{
      self.groups_file.close()
      self.groups_file = None
    #} end if
    if (None != self.output_file and not self.output_file.closed): #{
      self.output_file.close()
      self.output_file = None
def __init__(self, path, log_info=None): #{
  """Open the read-to-contig support results file and initialize
  the integration state."""
  self.file = FileBoxCls(path, "r",
    "cannot open read-to-contig support results file")
  self.log_info = log_info
  # nothing integrated yet and no member parsed so far
  self.integrated = False
  self.curr_member = None
def __init__(self, input_path, log_info=None): #{
  """Open the gene annotations input file and reset parser state."""
  self.file = FileBoxCls(input_path, "r",
    "cannot read gene annotations input file")
  self.log_info = log_info
  # look-ahead feature and end-of-file flag
  self.curr_feature = None
  self.finished = False
def __init__(self, path, log_info=None): # {
  """Open the TopHat-Fusion results file for iteration."""
  self.log_info = log_info
  self.file = FileBoxCls(path, "r",
    "cannot read TopHat-Fusion results file")
class EventPredictionCls: #{ def __init__(self, options): #{ SetupMainClass(self, options) if (not hasattr(self.options, "realign")): #{ self.options.realign = False #} end if if (self.options.realign): #{ CheckConfigCommands(self, "blat") #} end if self.predictors = dict() if (self.options.predict_fusions): #{ predictor = FusionPredictorCls(options, log_info=self.log_info) self.predictors[predictor.key] = predictor #} end if if (self.options.predict_ptds): #{ predictor = PTDPredictorCls(options, log_info=self.log_info) self.predictors[predictor.key] = predictor #} end if if (self.options.predict_itds): #{ predictor = ITDPredictorCls(options, log_info=self.log_info) self.predictors[predictor.key] = predictor #} end if self.use_chr = False #self.postpone_gene_check = False #} end def def __del__(self): #{ CloseLogFile(self) #} end def def PredictEvents(self): #{ LogMsg(self, "Predicting events...") start = time.time() # get the reference gene names, if a path is given #self.ref_gene_names = GetGeneNamesFromFile(self.options.gene_names_path, # self.log_info) group_parser = CandidateGroupParserCls(self.options.barnacle_path) # recheck breakpoint exons self.RecheckBreakpointExons(group_parser) realigner = None if (self.options.realign): #{ realigner = RealignerCls(self.options, self.log_info) #} end if # potential_events[bio_type][group_id] = event and gene sets object #potential_events = dict([(predictor.key, dict()) for # predictor in self.predictors]) LogMsg(self, "Processing candidate groups...") process_start = time.time() for group in group_parser: #{ # get the breakpoint exons for the group self.GetBreakpointExons(group) # check whether the event is any biologically typed event #self.CheckEvent(group, output_files, lib_info.lib_name, potential_events) # attempt to predict events of each specified type # from the current candidate group for predictor in self.predictors.itervalues(): #{ good_members = list() if (predictor.ProcessGroup(group, good_members) and None 
!= realigner): #{ #realigner.UpdateContigs(group, good_members, predictor.store_seq) realigner.UpdateContigs(group, good_members, predictor.key) #} end if #} end for #} end for LogMsg(self, "Time spent processing candidate groups: %s" % TimeSpent(process_start)) if ("itd" in self.predictors and 0 < self.predictors["itd"].num_over_aligned): #{ LogMsg(self, "WARNING: %i gap candidates have aligned length greater " "than gap length!" % self.predictors["itd"].num_over_aligned) #} end if #if ('event_coords' in output_files): #{ # output_files['event_coords'].Close() # self.RecheckExonOverlap(output_files, potential_events, lib_info.lib_name) #} end if if (None != realigner and 0 < len(realigner.contigs)): #{ realigner.RealignContigs() LogMsg(self, "Before realignment:") for predictor in self.predictors.itervalues(): #{ LogMsg(self, " Number of %s predictions: %i" % (predictor.description, predictor.num_predictions)) if (0 == predictor.num_predictions): #{ continue #} end if if ("itd" in predictor.key or "fusion" in predictor.key): #{ predictor.LoadTranscriptSequences(realigner.contigs) #} end if predictor.ReprocessPredictions(realigner.contigs) #predictor.ReprocessPredictions(realigner.contigs, # realigner.contig_seqs) #} end for LogMsg(self, "%s\nAfter realignment:" % ("-"*40)) #} end if for predictor in self.predictors.itervalues(): #{ LogMsg(self, "Number of %s predictions: %i" % (predictor.description, predictor.num_predictions)) #} end for LogMsg(self, "Time spent predicting events: %s" % TimeSpent(start)) #} end def #def CreateOutputFiles(self, input_path): #{ # input_file_name = os.path.basename(input_path) # input_root = os.path.splitext(input_file_name)[0] # output_files = dict() # # setup the coordinates file for rechecking exon overlaps # self.SetupEventCoordsFile(input_root, output_files) # return output_files #} end def def RecheckBreakpointExons(self, group_parser): #{ if (None == self.options.breakpoint_exons): #{ self.overlaps_file = None return #} end 
if if (self.options.use_existing_group_coords): #{ LogMsg(self, "Using existing group coordinates file.") else: group_coords_file = self.CreateGroupCoordsFile() for group in group_parser: #{ #ExtremeDebugMsg(self, "Writing coordinates for group %i" % group.id) self.WriteGroupCoords(group, group_coords_file) #} end for group_parser.Close() group_coords_file.Close() #} end if self.CreateOverlapsFile() #} end def #def SetupEventCoordsFile(self, input_root, output_files): #{ def CreateGroupCoordsFile(self): #{ # check whether to use "chr" in chromosome names in coordinates file self.use_chr = ShouldChromUseChr(1, self.options.breakpoint_exons, "exon coordinates", self.log_info) # open the group coordinates file #output_files['event_coords'] = FileBoxCls(group_coords_path, "w", group_coords_file = FileBoxCls(self.options.group_coords_path, "w", "cannot create event coordinates file") #self.postpone_gene_check = True return group_coords_file #} end def #def WriteEventCoords(self, event, group_coords_file): #{ def WriteGroupCoords(self, event, group_coords_file): #{ for member in event.members: #{ if (member.gap): #{ # write gap event coordinates self.WriteGapGroupCoords(member, group_coords_file) else: # write split event coordinates self.WriteSplitGroupCoords(member, group_coords_file) #} end if #} end for #} end def def WriteGapGroupCoords(self, member, group_coords_file): #{ gap_coords = GroupCoordsCls( member.align_info_B.chrom, min(member.align_info_B.genome_start, member.align_info_B.genome_end), max(member.align_info_B.genome_start, member.align_info_B.genome_end), "%sA" % member.IDString(), self.use_chr ) group_coords_file.WriteLine("%s" % gap_coords.ToString()) #} end def def WriteSplitGroupCoords(self, member, group_coords_file): #{ split_coords_A = GroupCoordsCls( member.align_info_A.chrom, member.align_info_A.genome_end - self.options.event_buffer, member.align_info_A.genome_end + self.options.event_buffer, "%sA" % member.IDString(), self.use_chr ) 
group_coords_file.WriteLine("%s" % split_coords_A.ToString()) split_coords_B = GroupCoordsCls( member.align_info_B.chrom, member.align_info_B.genome_start - self.options.event_buffer, member.align_info_B.genome_start + self.options.event_buffer, "%sB" % member.IDString(), self.use_chr ) group_coords_file.WriteLine("%s" % split_coords_B.ToString()) #} end def #def RecheckExonOverlap(self, output_files, potential_events, lib_name): #{ # LogMsg(self, "Rechecking exon overlap...") # start = time.time() # # run overlap code # overlaps_path = self.RunOverlapCode(output_files['group_coords'].path) # try: #{ # # parse overlap code output # self.ParseOverlapResults(overlaps_path, potential_events) # except ACEventGroupError, e: # raise EventPredictionError("error parsing overlap file: %s" % e) # #} end try # self.ProcessPotentialEvents(potential_events, output_files, lib_name) # LogMsg(self, "Time spent rechecking exon overlaps: %s" % TimeSpent(start)) #} end def def CreateOverlapsFile(self): #{ if (self.options.use_existing_overlaps): #{ LogMsg(self, "Using existing breakpoint/transcript overlaps file.") else: LogMsg(self, "Running overlap code...") if (hasattr(self, "log_file") and None != self.log_file): #{ self.log_file.Flush() #} end if start = time.time() RunOverlapCode(self.options.breakpoint_exons, self.options.group_coords_path, self.options.overlaps_path, dpt=self.options.dpt) LogMsg(self, "Time spent running overlap code: %s" % TimeSpent(start)) #} end if self.overlaps_file = FileBoxCls(self.options.overlaps_path, "r", "cannot read exon/group overlaps file") #self.GetNextExonOverlap() #} end def def GetBreakpointExons(self, group): #{ if (not hasattr(self, "overlaps_file") or None == self.overlaps_file): #{ return #} end if ExtremeDebugMsg(self, "Getting breakpoint exons for group %i" % group.id) # clear any previous breakpoint genes group.ClearBPGenes() if (not hasattr(self, "curr_overlap")): #{ self.curr_overlap = None #} end if # skip overlaps for groups that 
come before the current group while (None == self.curr_overlap or self.curr_overlap.group_id < group.id): #{ try: #{ self.GetNextExonOverlap() except StopIteration: return #} end try #} end while # create a dictionary of the members of the current group members_dict = dict() for member in group.members: #{ members_dict[member.candidate_id] = member #} end for # get all overlaps for the current group while (self.curr_overlap.group_id == group.id): #{ if (self.curr_overlap.member_id in members_dict): #{ self.AddBreakPointGene(members_dict[self.curr_overlap.member_id]) #} end if try: #{ self.GetNextExonOverlap() except StopIteration: return #} end try #} end while #} end def def GetNextExonOverlap(self): #{ if (not hasattr(self, "overlaps_file") or None == self.overlaps_file): #{ ExtremeDebugMsg(self, "Setting current overlap to \"None\".") self.curr_overlap = None return #} end if overlap = ExonOverlapCls(self.overlaps_file.next()) self.curr_overlap = overlap ExtremeDebugMsg(self, "Current overlap = G%i%s r%s exons: %s" % (overlap.group_id, overlap.member_id, overlap.region_id, ",".join(overlap.exons))) #} end def def AddBreakPointGene(self, member): #{ if (None == self.curr_overlap): #{ return #} end if if (self.curr_overlap.group_id != member.group_id): #{ raise EventPredictionError("Group ID: %i does not match overlap ID: %i" % (member.group_id, self.curr_overlap.group_id)) #} end if if (self.curr_overlap.member_id != member.candidate_id): #{ raise EventPredictionError("Candidate ID: %s " % member.candidate_id + "does not match overlap ID: %s" % self.curr_overlap.member_id) #} end if member.AddGenes("breakpoint_%s" % self.curr_overlap.region_id, self.curr_overlap.exons)
class CandidateGroupParserCls: #{
  # Parses candidate groups from a data file, either loading the whole file
  # at once (ParseDataFile) or yielding one group at a time via the iterator
  # protocol / GetNextGroup(). The actual per-group parsing is delegated to
  # a GroupParserCls instance.
  def __init__(self, data_file_path, keep_lines=False, check_data=False): #{
    # fail early if the path does not point at a readable file
    CheckFilePath(data_file_path, "candidate group file")
    self.group_parser = GroupParserCls(keep_lines=keep_lines)
    self.check_data = check_data
    fail_message = "cannot open data file"
    self.data_file = FileBoxCls(data_file_path, "r", fail_message)
    # groups accumulated by ParseDataFile (unused in iterator mode)
    self.groups = list()
  #} end def

  def __del__(self): #{
    # close data file if it is open
    self.CloseDataFile()
  #} end def

  def __iter__(self): #{
    return self
  #} end def

  # Load the entire data file into memory
  # Do not mix with using GetNextGroup() method
  def ParseDataFile(self): #{
    #self.OpenDataFile()
    for group_line in self.data_file: #{
      #group_line = CleanLine(group_line)
      # skip blank lines
      #if ("" == group_line): #{
      #  continue
      #} end if
      self.groups.append(self.group_parser.ParseGroup \
        (group_line, self.data_file, check_data=self.check_data))
    #} end for
    self.CloseDataFile()
    return self.groups
  #} end def

  # Load a single group from the data file into memory
  # Do not mix with using ParseDataFile() method
  def GetNextGroup(self): #{
    return self.next()
  #} end def

  # Python 2 iterator protocol: parse and return the next group, letting the
  # data file's StopIteration propagate at end of file.
  def next(self): #{
    #if (None == self.data_file): #{
    #  self.OpenDataFile()
    #} end if
    group_line = ""
    # skip blank lines
    while ("" == group_line): #{
      #group_line = CleanLine(self.data_file.next())
      group_line = self.data_file.next()
    #} end while
    return self.group_parser.ParseGroup \
      (group_line, self.data_file, check_data=self.check_data)
  #} end def

  def Close(self): #{
    self.CloseDataFile()
  #} end def

  # Close the data file, tolerating every partially-constructed state:
  # attribute never set, set to None, or already closed.
  # NOTE(review): relies on FileBoxCls exposing both a "closed" attribute
  # and a lowercase close() method -- confirm against FileBoxCls.
  def CloseDataFile(self): #{
    if (not hasattr(self, "data_file")): #{
      return
    #} end if
    if (None == self.data_file): #{
      return
    #} end if
    if (self.data_file.closed): #{
      return
    #} end if
    self.data_file.close()
    #self.data_file = None
  #} end def

  # lowercase alias so the object can be used where a file-like close()
  # is expected
  def close(self): #{
    self.CloseDataFile()
  #} end def

  # Return the raw group line of the most recently parsed group; only valid
  # when the parser was constructed with keep_lines=True.
  def GroupLine(self): #{
    if (not self.group_parser.keep_lines): #{
      raise CandidateGroupParserError \
        ("cannot get group line when keep_lines flag was not set")
    #} end if
    return self.group_parser.group_line
  #} end def

  # Return the raw member lines of the most recently parsed group; only
  # valid when the parser was constructed with keep_lines=True.
  def MemberLines(self): #{
    if (not self.group_parser.keep_lines): #{
      raise CandidateGroupParserError \
        ("cannot get member lines when keep_lines flag was not set")
    #} end if
    return self.group_parser.member_lines