class LibIteratorCls:
    """Walk a library list file and process every library it names.

    Each line of the list file that does not start with "#" is used to
    build a LibraryInfoCls object, which is then handed to the callback
    supplied at construction time.
    """

    def __init__(self, lib_list_path, ProcessLibraryMethod, options,
                 log_info=None):
        """Store the iteration settings; the list file is not opened here.

        Args:
            lib_list_path: path of the library list file.
            ProcessLibraryMethod: callable invoked with each library info
                object built during IterateOverAllLibs().
            options: options object; missing attributes are filled in with
                defaults by CheckOptions().
            log_info: logging information passed on to LibraryInfoCls.
        """
        self.lib_list_path = lib_list_path
        self.ProcessLibrary = ProcessLibraryMethod
        self.options = options
        self.CheckOptions()
        self.log_info = log_info
        self.num_libs = 0
        self.list_of_paths = False

    def __del__(self):
        """Close the list file if an iteration left it open."""
        # the attribute only exists once IterateOverAllLibs() has been called
        lib_list_file = getattr(self, "lib_list_file", None)
        if lib_list_file is not None and not lib_list_file.closed:
            lib_list_file.close()

    def CheckOptions(self):
        """Give the options object the attributes this class reads.

        "no_path_check" and "only_pass" default to False and "get_paths"
        defaults to True; attributes that already exist are left untouched.
        """
        for opt in ("no_path_check", "only_pass"):
            if not hasattr(self.options, opt):
                setattr(self.options, opt, False)
        if not hasattr(self.options, "get_paths"):
            self.options.get_paths = True

    def IterateOverAllLibs(self):
        """Process every library in the list file, counting them in num_libs.

        Raises:
            LibIteratorError: if event paths were expected for a library
                (get_paths or list_of_paths is set) but none were found.
        """
        self.num_libs = 0
        self.lib_list_file = FileBoxCls(self.lib_list_path, "r",
                                        "could not open library list file")
        # close the list file even when a library fails part-way through
        try:
            for lib_line in self.lib_list_file:
                # skip comment lines
                if lib_line.startswith("#"):
                    continue
                lib_info = LibraryInfoCls(self.options, self.log_info,
                                          self.list_of_paths)
                lib_info.GetLibDir(lib_line)
                DebugMsg(self, "Lib Dir: %s" % lib_info.lib_dir)
                if self.options.get_paths and not self.list_of_paths:
                    lib_info.GetEventPaths()
                if (len(lib_info.event_paths) < 1 and
                        (self.options.get_paths or self.list_of_paths)):
                    raise LibIteratorError(
                        "could not get event path(s) from directory: %s" %
                        lib_info.lib_dir)
                self.ProcessLibrary(lib_info)
                self.num_libs += 1
        finally:
            self.lib_list_file.close()
def PrintErrors(self):
    """Log every line of the samtools error file.

    Returns:
        True if the file at self.err_file_path held at least one line,
        False if it was empty.
    """
    err_file = FileBoxCls(self.err_file_path, "r",
                          "could not open samtools error file")
    found_errors = False
    for err_line in err_file:
        found_errors = True
        LogMsg(self, err_line)
    err_file.close()
    return found_errors
def PrintFilters(self):
    """Write one "description: value" line per filter to the filter file.

    Filters are written in sorted order of their names; the file is closed
    whether or not writing succeeds.
    """
    filters_path = self.OutputPath("filters")
    filter_file = FileBoxCls(filters_path, "w", "could not open filter file")
    try:
        for filter_name in sorted(self.filters.keys()):
            current = self.filters[filter_name]
            summary = "%s: %s" % (current.description, current.ValueString())
            filter_file.WriteLine(summary)
    finally:
        filter_file.close()
def CheckStatus(self):
    """Set self.status from the contents of the job's output file.

    The status starts as "in progress". The first line equal to
    R2C_SUCCESS makes it "complete"; the first empty line or line equal to
    R2C_FAIL makes it "failed". Reading stops at the first such line.
    """
    output_file = FileBoxCls(
        self.output_path, "r",
        "cannot open output file for job number %s" % self.num)
    self.status = "in progress"
    for output_line in output_file:
        if R2C_SUCCESS == output_line:
            final_status = "complete"
        elif "" == output_line or R2C_FAIL == output_line:
            final_status = "failed"
        else:
            # not a status line: keep reading
            continue
        self.status = final_status
        break
    output_file.close()
def CreateAlignCoordsFile(self, aligns):
    """Write block coordinates for every alignment to the coordinates file.

    Args:
        aligns: iterable of alignments; each one is passed through
            FixAlign() and written with its position in the iterable as
            its id.
    """
    DebugMsg(self, "Creating new alignment coordinates file...")
    align_coords_file = FileBoxCls(self.paths['align_coords'], "w",
                                   "Cannot open alignment coordinates file")
    # make sure the file is closed even if an alignment cannot be written
    try:
        for align_id, align in enumerate(aligns):
            fixed_align = FixAlign(align)
            # REMINDER: use alignment blocks instead!
            WriteBlockCoords(fixed_align, align_id, align_coords_file,
                             use_chr=True)
    finally:
        align_coords_file.close()
def CheckStatus(self):
    """Update self.status by scanning the job's output file.

    The status is "in progress" unless a line equal to CID_SUCCESS
    ("complete"), an empty line, or a line equal to CID_FAIL ("failed")
    is found; scanning stops at the first such line.
    """
    ExtremeDebugMsg(self, "Checking job status: %s" % self.output_path)
    fail_msg = "cannot open output file for job number %s" % self.num
    output_file = FileBoxCls(self.output_path, "r", fail_msg)
    self.status = "in progress"
    for output_line in output_file:
        ExtremeDebugMsg(self, " %s" % output_line)
        succeeded = (CID_SUCCESS == output_line)
        if succeeded or "" == output_line or CID_FAIL == output_line:
            if succeeded:
                self.status = "complete"
            else:
                self.status = "failed"
            break
    output_file.close()
    return
def IntegrateP2GFile(self, p2g_path):
    """Merge the support lines of a pair-to-genome file into their groups.

    Every line of the file is parsed into a P2GGroupCls object and matched,
    by group id, against groups read from self.group_parser. Matching
    groups receive the support and are written out with WriteGroup().

    Args:
        p2g_path: path of the pair-to-genome results file.

    Raises:
        P2GIntegratorError: if the groups file ends before the results
            file does, or if a support line refers to a group id that is
            greater than the id of the next available group.
    """
    DebugMsg(self, "Integrating pair-to-genome file: %s" % p2g_path)
    group = None
    p2g_file = FileBoxCls(p2g_path, "r",
                          "cannot open pair-to-genome results file")
    # the loop raises on inconsistent input, so always release the file
    try:
        for p2g_line in p2g_file:
            DebugMsg(self, "LINE: %s" % p2g_line)
            # count the group
            self.num_groups += 1
            # parse the pair-to-genome line
            p2g_support = P2GGroupCls(self.options, self.log_info)
            p2g_support.ParseSupportString(p2g_line)
            # record groups that had no reads at all
            if p2g_support.num_reads < 1:
                self.groups_without_reads.append(
                    "%i" % p2g_support.group_id)
            # advance in the groups file when the support has moved past
            # the group currently held
            if group is None or p2g_support.group_id > group.id:
                DebugMsg(self, "Getting next group...")
                try:
                    group = self.group_parser.GetNextGroup()
                except StopIteration:
                    raise P2GIntegratorError(
                        "Unexpected end of groups file: %s\n while "
                        "integrating: %s" %
                        (self.group_parser.data_file_path, p2g_path))
            # allow for groups having been removed from the groups file
            if p2g_support.group_id < group.id:
                continue
            # ensure that the group ids match up
            if p2g_support.group_id != group.id:
                raise P2GIntegratorError(
                    "Inconsistent group ids: %i from %s, %i from %s" %
                    (p2g_support.group_id, p2g_path,
                     group.id, self.options.barnacle_path))
            # add the pair-to-genome support to the group
            self.AddSupportToGroup(group, p2g_support)
            # write the group to the new output file(s)
            self.WriteGroup(group)
    finally:
        p2g_file.close()
def WriteCounts(self):
    """Write the split and gapped alignment counts to the counts file.

    The file gets a "Split: N" line, then (only when the check_gap option
    is set) a "Gapped: N" line that is prefixed with "at least " when
    self.more_than_99 is true, and finally a "COMPLETE" line.
    """
    counts_file = FileBoxCls(self.paths['counts'], "w",
                             "Cannot open counts file")
    # close the file even if a write fails; "COMPLETE" is then never
    # written, so a partial file is not mistaken for a finished one
    try:
        # write the number of split alignments found
        counts_file.WriteLine("Split: %i" % len(self.candidate_contigs))
        # if gapped alignments were also checked for
        if self.options.check_gap:
            qualifier = "at least " if self.more_than_99 else ""
            counts_file.WriteLine(
                "Gapped: %s%i" % (qualifier, self.num_gapped_aligns))
        counts_file.WriteLine("COMPLETE")
    finally:
        counts_file.close()
class TopHatFileCls:
    """Iterator yielding one event per six-line record of a results file."""

    def __init__(self, path, log_info=None):
        """Open the TopHat-Fusion results file at path for reading."""
        self.file = FileBoxCls(path, "r",
                               "cannot read TopHat-Fusion results file")
        self.log_info = log_info

    def __del__(self):
        """Release the underlying file when the object is discarded."""
        self.close()

    def __iter__(self):
        """Return the object itself; records are produced by next()."""
        return self

    def next(self):
        """Read the next six lines of the file and return them as an event.

        Returns:
            The TopHatEventCls object built from the record.
        """
        # first line: expected to start with "allAtOnce" and to hold the
        # breakpoint coordinates
        event = TopHatEventCls(self.file.next())
        # two "sequence" lines follow
        for _ in range(2):
            event.CheckSeqLine(self.file.next())
        # then a line that is checked as a score line
        event.CheckScoreLine(self.file.next())
        # then the line holding the gene ids
        event.ParseGenesLine(self.file.next())
        # the last line of the record is read and ignored
        self.file.next()
        return event

    def close(self):
        """Close the results file unless it is missing or already closed."""
        handle = getattr(self, "file", None)
        if handle is None or handle.closed:
            return
        handle.close()
class P2GCalculatorCls:
    """Add pair-to-genome support to every group of a groups file."""

    def __init__(self, options):
        """Set up the main class state and check the samtools command.

        Args:
            options: options object; its use_chr attribute is reset to
                False here and set for real by Setup().
        """
        SetupMainClass(self, options)
        CheckConfigCommands(self, "samtools")
        self.groups_file = None
        self.output_file = None
        self.options.use_chr = False

    def __del__(self):
        """Close the input, output and log files if they are still open."""
        # the log file must be closed even if closing the data files fails
        try:
            self.CloseFiles()
        finally:
            CloseLogFile(self)

    def CalculateSupport(self):
        """Compute and write the support string of every input group.

        The input and output files are closed when processing ends,
        whether it succeeds or raises. After a successful run the
        temporary samtools output files in the output directory are
        removed.
        """
        start = time.time()
        LogMsg(self, "Adding pair-to-genome support to groups...")
        try:
            # open the input and output files
            self.Setup()
            for group_line in self.groups_file:
                group_start = time.time()
                # create a group object from the line
                group = P2GGroupCls(self.options, self.log_info)
                group.ParseGroupLine(group_line)
                LogMsg(self, "Group: %i" % group.group_id)
                ExtremeDebugMsg(self, " %s" % group.ToString())
                # get, then write, the support for the current group
                group.GetPairToGenomeSupport()
                self.WritePairToGenomeSupport(group.SupportString())
                ExtremeDebugMsg(self, "Time spent on group: %s" %
                                TimeSpent(group_start))
        finally:
            # also covers a Setup() that failed after opening one file
            self.CloseFiles()
        # remove the temporary samtools output files
        for suffix in ("", "_1", "_2"):
            temp_sam_path = os.path.join(self.options.output_dir,
                                         "sam_out_tmp%s" % suffix)
            for temp_path in (temp_sam_path, temp_sam_path + ".err"):
                if os.path.isfile(temp_path):
                    os.remove(temp_path)
        LogMsg(self, "Total time adding pair-to-genome support: %s" %
               TimeSpent(start))

    def Setup(self):
        """Open the groups and output files and decide whether to use "chr".

        The output path is the groups path with ".data" replaced by ".out".
        """
        self.groups_file = FileBoxCls(self.options.barnacle_path, "r",
                                      "cannot open groups file")
        # NOTE(review): str.replace() changes every ".data" in the path and
        # returns the path unchanged when there is none, in which case the
        # output file is opened on the input path -- confirm callers always
        # pass a path ending in ".data"
        output_file_path = self.options.barnacle_path.replace(".data", ".out")
        self.output_file = FileBoxCls(
            output_file_path, "w",
            "cannot create pair-to-genome support output file")
        # create samtools object and check whether to use "chr" in
        # chromosome IDs
        samtools = SAMToolsCls(self.options.p2g_path, self.options,
                               log_info=self.log_info)
        self.options.use_chr = samtools.ShouldChromUseChr()

    def WritePairToGenomeSupport(self, support_string):
        """Write one support string as a line of the output file."""
        self.output_file.WriteLine("%s" % support_string)

    def CloseFiles(self):
        """Close whichever of the groups and output files are still open.

        A file closed here has its attribute reset to None. Attributes that
        were never set (construction failed early) are tolerated.
        """
        for attr_name in ("groups_file", "output_file"):
            handle = getattr(self, attr_name, None)
            if handle is not None and not handle.closed:
                handle.close()
                setattr(self, attr_name, None)
class CandidateGroupParserCls:
    """Read candidate groups from a data file, all at once or one by one."""

    def __init__(self, data_file_path, keep_lines=False, check_data=False):
        """Check the path, create the group parser and open the data file.

        Args:
            data_file_path: path of the candidate group file.
            keep_lines: passed to GroupParserCls; GroupLine() and
                MemberLines() refuse to work unless the parser has it set.
            check_data: passed to every ParseGroup() call.
        """
        CheckFilePath(data_file_path, "candidate group file")
        self.group_parser = GroupParserCls(keep_lines=keep_lines)
        self.check_data = check_data
        self.data_file = FileBoxCls(data_file_path, "r",
                                    "cannot open data file")
        self.groups = list()

    def __del__(self):
        """Close the data file if it is still open."""
        self.CloseDataFile()

    def __iter__(self):
        """Return the parser itself; groups are produced by next()."""
        return self

    def ParseDataFile(self):
        """Load the entire data file into memory and return the groups.

        Do not mix with GetNextGroup(). The data file is closed once every
        line has been consumed.
        """
        for group_line in self.data_file:
            parsed = self.group_parser.ParseGroup(
                group_line, self.data_file, check_data=self.check_data)
            self.groups.append(parsed)
        self.CloseDataFile()
        return self.groups

    def GetNextGroup(self):
        """Load a single group from the data file.

        Do not mix with ParseDataFile().
        """
        return self.next()

    def next(self):
        """Parse and return the group starting at the next non-blank line."""
        group_line = self.data_file.next()
        # skip blank lines
        while "" == group_line:
            group_line = self.data_file.next()
        return self.group_parser.ParseGroup(
            group_line, self.data_file, check_data=self.check_data)

    def Close(self):
        """Alias of CloseDataFile()."""
        self.CloseDataFile()

    def CloseDataFile(self):
        """Close the data file unless it is missing, None or closed."""
        data_file = getattr(self, "data_file", None)
        if data_file is None or data_file.closed:
            return
        data_file.close()

    def close(self):
        """Alias of CloseDataFile()."""
        self.CloseDataFile()

    def GroupLine(self):
        """Return the group line kept by the group parser.

        Raises:
            CandidateGroupParserError: if the parser's keep_lines flag is
                not set.
        """
        if self.group_parser.keep_lines:
            return self.group_parser.group_line
        raise CandidateGroupParserError(
            "cannot get group line when keep_lines flag was not set")

    def MemberLines(self):
        """Return the member lines kept by the group parser.

        Raises:
            CandidateGroupParserError: if the parser's keep_lines flag is
                not set.
        """
        if self.group_parser.keep_lines:
            return self.group_parser.member_lines
        raise CandidateGroupParserError(
            "cannot get member lines when keep_lines flag was not set")
def Output(self, append):
    """Write the candidate contigs, and optionally their psl lines, to file.

    Each candidate contig has both alignment targets passed through
    AddChr() and its Details() written to the split alignment output file.
    When the "output_psl" parameter is set and the first candidate's first
    alignment was made with "blat", the gapped psl lines and both psl
    lines of every candidate are written to the psl output file as well;
    for any other method the "output_psl" parameter is switched off.

    Args:
        append: if true the output files are opened in append mode,
            otherwise they are overwritten.
    """
    # open the output file in the appropriate mode
    mode = "a" if append else "w"
    out = FileBoxCls(self.paths['split_out'], mode,
                     "Cannot open split alignment output file")
    psl_out = None
    # both files must be released even if a candidate cannot be written
    try:
        if self.params['output_psl']:
            # NOTE(review): indexing [0] fails when there are no candidate
            # contigs -- confirm callers never reach here with none
            if self.candidate_contigs[0].align1.method == "blat":
                psl_out = FileBoxCls(self.paths['psl_out'], mode,
                                     "Cannot open alignment psl output file")
                DebugMsg(self, "Writing alignment lines to %s" %
                         self.paths['psl_out'])
                # write out the alignment lines for the gapped alignment
                # events found
                for psl_line in self.gapped_psl_lines:
                    psl_out.Write(psl_line)
            else:
                # only write out psl lines for blat alignments
                self.params['output_psl'] = False
        # write the split alignment details to the output file
        for candidate_contig in self.candidate_contigs:
            candidate_contig.align1.target = AddChr(
                candidate_contig.align1.target)
            candidate_contig.align2.target = AddChr(
                candidate_contig.align2.target)
            details = candidate_contig.Details()
            ExtremeDebugMsg(self, "Writing line to %s:\n %s" %
                            (out.path, details))
            out.WriteLine(details)
            # the psl file is open exactly when psl output is still enabled
            if psl_out is not None:
                psl_out.Write(candidate_contig.align1.psl())
                psl_out.Write(candidate_contig.align2.psl())
    finally:
        try:
            out.close()
        finally:
            if psl_out is not None:
                psl_out.close()
def IdentifyCandidateContigs(self, aligns):
    """Group the alignments by contig and examine each contig's best pairs.

    The alignments are consumed contig by contig: a ContigWithAlignmentsCls
    object is built at the current index, selects its alignments, and the
    index then moves on by the number of alignments that belong to it.
    Contigs with neither best alignments nor alignment groups are skipped;
    the others are examined pairwise with the chooser picked by the
    "use_quick_chooser" parameter.

    Args:
        aligns: indexable collection of alignments; presumably ordered so
            that alignments of one contig are adjacent -- TODO confirm.
    """
    # open the contig sequences file if using the gap filter
    ctg_seq_file = None
    if self.options.check_gap:
        ctg_seq_file = FileBoxCls(self.paths['ctg_seq'], "r",
                                  "Cannot open contig sequence file")
    # release the sequence file even if processing a contig fails
    try:
        # iterate over the alignments, grouping them by query (i.e. contig)
        contig_align_index = 0
        while contig_align_index < len(aligns):
            self.num_contigs += 1
            contig = ContigWithAlignmentsCls(
                contig_align_index, aligns, ctg_seq_file,
                self.paths['gap_out'], self.options, self.log_info)
            ExtremeDebugMsg(self, "-"*80)
            DebugMsg(self, "%i) %s" % (self.num_contigs, contig.id))
            ExtremeDebugMsg(self, " Contig length: %i" % contig.length)
            # select the alignments to consider for the current contig
            # and check for gapped alignments at the same time
            contig.SelectAlignments(aligns)
            if contig.single_align_found:
                self.num_full_aligns += 1
            if self.options.check_gap and not contig.perfect_align_found:
                contig.CheckGappedAlignments()
                self.gapped_psl_lines.extend(contig.gapped_psl_lines)
                self.num_gapped_aligns += contig.num_gapped_aligns
            if (self.params['check_split'] and
                    not self.params['use_quick_chooser']):
                # pare down the alignment groups so that
                # only the best alignments remain
                contig.PareAlignmentGroups()
            # advance before any "continue" below so the loop moves on
            # NOTE(review): a contig reporting zero alignments would stall
            # the loop -- confirm num_aligns_to_contig is always positive
            contig_align_index += contig.num_aligns_to_contig
            if 0 < len(contig.best_aligns):
                if self.log_info['debug']:
                    LogMsg(self, "%i best aligns: %s" %
                           (len(contig.best_aligns), contig.id))
                    ExtremeDebugMsg(self,
                                    AlignListString(contig.best_aligns))
            elif 0 < len(contig.align_groups):
                if self.log_info['debug']:
                    ExtremeDebugMsg(self, "-"*40)
                    LogMsg(self, "%i align groups: %s" %
                           (len(contig.align_groups), contig.id))
                    for i, group in enumerate(contig.align_groups):
                        ExtremeDebugMsg(self, "\n".join([
                            "Group %i" % i,
                            " %i) S:%i E:%i Aligns:%i" %
                            (i, group.ctg_start, group.ctg_end,
                             len(group.best_aligns)),
                            AlignListString(group.best_aligns)]))
            else:
                # no best aligns or align groups found
                if (not contig.perfect_align_found and
                        not contig.single_align_found):
                    DebugMsg(self, "No partial aligns selected: %s" %
                             contig.id)
                continue
            # examine pairs of the chosen alignments
            if self.params['use_quick_chooser']:
                self.ExamineBestAlignsPairwise(contig)
            else:
                self.ExamineAlignGroupsPairwise(contig)
        DebugMsg(self, "-"*80)
    finally:
        # close the contig sequences file if it was opened
        if ctg_seq_file is not None:
            ctg_seq_file.close()