Code Example #1
File: library_iterator.py Project: ptraverse/gsc
class LibIteratorCls:
  def __init__(self, lib_list_path, ProcessLibraryMethod,
      options, log_info=None):
    self.lib_list_path = lib_list_path
    self.ProcessLibrary = ProcessLibraryMethod
    self.options = options
    self.CheckOptions()
    #if (not hasattr(options, "no_path_check")):
    #  self.options.no_path_check = False
    # end if
    self.log_info = log_info
    self.num_libs = 0
    self.list_of_paths = False
  # end def

  def __del__(self):
    if (hasattr(self, "lib_list_file") and
        None != self.lib_list_file     and
        not self.lib_list_file.closed):
      self.lib_list_file.close()
    # end if
  # end def

  def CheckOptions(self): #{
    required_opts = ["no_path_check", "only_pass"]
    for opt in required_opts: #{
      if (not hasattr(self.options, opt)): #{
        setattr(self.options, opt, False)
      #} end if
    #} end for
    if (not hasattr(self.options, "get_paths")):
      self.options.get_paths = True
    #} end if
  #} end def

  def IterateOverAllLibs(self):
    self.num_libs = 0
    self.lib_list_file = FileBoxCls(self.lib_list_path, "r",
      "could not open library list file")
    for lib_line in self.lib_list_file:
      # skip comment lines
      if (lib_line.startswith("#")):
        continue
      # end if
      lib_info = LibraryInfoCls(self.options, self.log_info,
        self.list_of_paths)
      lib_info.GetLibDir(lib_line)
      DebugMsg(self, "Lib Dir: %s" % lib_info.lib_dir)
      if (self.options.get_paths and not self.list_of_paths):
        lib_info.GetEventPaths()
      # end if
      if (1 > len(lib_info.event_paths) and
          (self.options.get_paths or self.list_of_paths)):
        raise LibIteratorError("could not get event path(s) from directory: "
          "%s" % lib_info.lib_dir)
      # end if
      self.ProcessLibrary(lib_info)
      self.num_libs += 1
    # end for
    self.lib_list_file.close()
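
All of the snippets on this page share one pattern: construct a FileBoxCls with a path, a file mode, and a failure message, iterate over it line by line (or call next()), write with WriteLine()/Write(), and finish with Close()/close(). For orientation, the following is a minimal sketch of the interface these call sites appear to assume; it is a hypothetical reconstruction (including the FileBoxError name), not the project's actual implementation.

class FileBoxError(Exception):
  pass

class FileBoxCls:
  # Hypothetical reconstruction for illustration only; the real FileBoxCls in
  # ptraverse/gsc may differ in details. It opens the path in the given mode,
  # raises a descriptive error built from fail_msg if the open fails, iterates
  # over lines with trailing newlines stripped, and offers write/close helpers.
  def __init__(self, path, mode, fail_msg):
    self.path = path
    self.closed = False
    try:
      self.file = open(path, mode)
    except IOError as e:
      raise FileBoxError("%s: %s (%s)" % (fail_msg, path, e))

  def __iter__(self):
    return self

  def next(self):
    line = self.file.readline()
    if ("" == line):
      raise StopIteration
    return line.rstrip("\r\n")
  __next__ = next  # so the same class also iterates under Python 3

  def Write(self, text):
    self.file.write(text)

  def WriteLine(self, line):
    self.Write("%s\n" % line)

  def Flush(self):
    self.file.flush()

  def Close(self):
    if (not self.closed):
      self.file.close()
      self.closed = True
  close = Close  # call sites above use both spellings
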
Code Example #2
File: fasta.py Project: ptraverse/gsc
class FastaFileCls: #{
  def __init__(self, path, fail_msg="cannot open fasta file",
      log_info=None, line_delim="", maintain_case=False): #{
    self.file = FileBoxCls(path, "r", fail_msg)
    self.line_delim = line_delim
    self.curr_line = None
    self.log_info = log_info
    self.finished = False
    self.maintain_case = maintain_case
  #} end def

  def __del__(self): #{
    self.file.Close()
  #} end def

  def __iter__(self): #{
    return self
  #} end def

  def next(self): #{
    if (self.finished): #{
      raise StopIteration
    #} end if
    new_seq = None
    try: #{
      if (None == self.curr_line): #{
        self.curr_line = self.file.next()
      #} end if
      if (not self.curr_line.startswith(">")): #{
        raise FastaError("improperly formatted fasta file: sequence id line "
          "must begin with \">\": \"%s\"." % self.curr_line)
      #} end if
      if (" " in self.curr_line): #{
        (seq_id, seq_extra) = self.curr_line.lstrip(">").split(" ", 1)
      else:
        seq_id = self.curr_line.lstrip(">")
        seq_extra = None
      #} end if
      new_seq = SequenceCls(seq_id, seq_extra)
      self.curr_line = self.file.next()
      while (not self.curr_line.startswith(">")): #{
        if ("" != new_seq.sequence): #{
          new_seq.sequence += self.line_delim
        #} end if
        new_seq.sequence += self.curr_line
        try: #{
          self.curr_line = self.file.next()
        except StopIteration:
          self.finished = True
          break
        #} end try
      #} end while
      if (not self.maintain_case): #{
        new_seq.sequence = new_seq.sequence.upper()
      #} end if
      return new_seq
    except StopIteration, e:
      self.finished = True
      raise e
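
The class above yields one sequence object per FASTA record. A minimal usage sketch, assuming the FastaFileCls defined above and a SequenceCls whose objects expose id and sequence attributes as populated in next(); the path below is only a placeholder:

seq_file = FastaFileCls("contigs.fa")  # hypothetical path; default fail_msg used
for seq_obj in seq_file:
  # id and sequence come from SequenceCls as filled in by next() above
  print("%s: %i bp" % (seq_obj.id, len(seq_obj.sequence)))
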
Code Example #3
File: samtools.py Project: ptraverse/gsc
 def PrintErrors(self):  # {
     errors = False
     fail_msg = "could not open samtools error file"
     err_file = FileBoxCls(self.err_file_path, "r", fail_msg)
     for line in err_file:  # {
         LogMsg(self, line)
         errors = True
     # } end for
     err_file.close()
     return errors
Code Example #4
File: calculate.py Project: ptraverse/gsc
 def Setup(self): #{
   fail_msg = "cannot open groups file"
   self.groups_file = FileBoxCls(self.options.barnacle_path, "r", fail_msg)
   output_file_path = self.options.barnacle_path.replace(".data", ".out")
   fail_msg = "cannot create pair-to-genome support output file"
   self.output_file = FileBoxCls(output_file_path, "w", fail_msg)
   # create samtools object and check whether to use "chr" in chromosome IDs
   samtools = SAMToolsCls(self.options.p2g_path, self.options,
     log_info=self.log_info)
   self.options.use_chr = samtools.ShouldChromUseChr()
Code Example #5
File: realigner.py Project: ptraverse/gsc
 def CreateQueryFile(self): #{
   LogMsg(self, "Creating query file...")
   query_file = FileBoxCls(self.query_path, "w", "cannot create query "
     "contig sequences file")
   all_contigs_file = FileBoxCls(self.options.ctg_seq_path, "r",
     "cannot read contig sequences file")
   seqs_found = False
   num_written = 0
   for id_line in all_contigs_file: #{
     seq_line = all_contigs_file.next()
     if (not id_line.startswith(">")): #{
      raise RealignerError("invalid contig id line in sequence file:\n%s" %
         id_line)
     #} end if
     # extract the contig id from the line
     ctg_id = id_line.lstrip(">").split()[0]
     #DebugMsg(self, "Contig ID from sequence file: %s" % ctg_id)
     # if the contig is represented in one of the potential predictions
     if (ctg_id in self.contigs): #{
       #DebugMsg(self, "Writing sequence to query file.")
       # write it to the query file
       query_file.WriteLine(id_line)
       query_file.WriteLine(seq_line)
       seqs_found = True
       num_written += 1
       self.contigs[ctg_id].written = True
       self.contigs[ctg_id].sequence = seq_line.lower()
       #if ("itd" in self.contigs[ctg_id].types): #{
       #  self.contigs[ctg_id].sequence = seq_line.lower()
       #} end if
       self.missing.discard(ctg_id)
     #} end if
     #if (ctg_id in self.contig_seqs): #{
     #  self.contig_seqs[ctg_id] = seq_line
     #} end if
   #} for
   if (not seqs_found): #{
     raise RealignerError("could not find any contig sequences in %s" %
       self.options.ctg_seq_path)
   #} end if
   if (num_written != len(self.contigs)): #{
     #missed = list()
     #for contig in self.contigs.itervalues(): #{
     #  if (not contig.written): #{
     #    missed.append(contig.id)
     #  #} end if
     #} end for
     LogMsg(self, "WARNING: only wrote %i of %i contig sequences! " %
       (num_written, len(self.contigs)) + "Missing: %s" %
       ",".join(sorted(self.missing)))
     #  ",".join(missed))
   #} end if
   all_contigs_file.Close()
   query_file.Close()
Code Example #6
File: filter_groups.py Project: ptraverse/gsc
 def PrintFilters(self): #{
   filter_file = FileBoxCls(self.OutputPath("filters"), "w",
     "could not open filter file")
   try: #{
     for filter_name in sorted(self.filters.keys()): #{
       #LogMsg(self, "%s: %s" %
       #  (self.filters[filter_name].description,
       #   self.filters[filter_name].ValueString()))
       filter_file.WriteLine("%s: %s" %
         (self.filters[filter_name].description,
          self.filters[filter_name].ValueString()))
     #} end for
   finally:
     filter_file.close()
Code Example #7
 def __init__(self, data_file_path, keep_lines=False, check_data=False): #{
   CheckFilePath(data_file_path, "candidate group file")
   self.group_parser = GroupParserCls(keep_lines=keep_lines)
   self.check_data = check_data
   fail_message = "cannot open data file"
   self.data_file = FileBoxCls(data_file_path, "r", fail_message)
   self.groups = list()
Code Example #8
File: check_status.py Project: ptraverse/gsc
 def CheckStatus(self): #{
   fail_msg = \
     "cannot open output file for job number %s" % self.num
   output_file = FileBoxCls(self.output_path, "r", fail_msg)
   self.status = "in progress"
   for output_line in output_file: #{
     if (R2C_SUCCESS == output_line): #{
       self.status = "complete"
       break
     elif ("" == output_line or R2C_FAIL == output_line): #{
       self.status = "failed"
       break
     #} end if
   #} end for
   output_file.close()
   return
Code Example #9
File: library_iterator.py Project: ptraverse/gsc
 def IterateOverAllLibs(self):
   self.num_libs = 0
   self.lib_list_file = FileBoxCls(self.lib_list_path, "r",
     "could not open library list file")
   for lib_line in self.lib_list_file:
     # skip comment lines
     if (lib_line.startswith("#")):
       continue
     # end if
     lib_info = LibraryInfoCls(self.options, self.log_info,
       self.list_of_paths)
     lib_info.GetLibDir(lib_line)
     DebugMsg(self, "Lib Dir: %s" % lib_info.lib_dir)
     if (self.options.get_paths and not self.list_of_paths):
       lib_info.GetEventPaths()
     # end if
     if (1 > len(lib_info.event_paths) and
         (self.options.get_paths or self.list_of_paths)):
       raise LibIteratorError("could not get event path(s) from directory: "
         "%s" % lib_info.lib_dir)
     # end if
     self.ProcessLibrary(lib_info)
     self.num_libs += 1
   # end for
   self.lib_list_file.close()
Code Example #10
 def CreateAlignCoordsFile(self, aligns): #{
   DebugMsg(self, "Creating new alignment coordinates file...")
   # open the alignment coordinates file
   fail_msg = "Cannot open alignment coordinates file"
   align_coords_file = FileBoxCls(self.paths['align_coords'], "w", fail_msg)
   # iterate through the alignments
   for id, align in enumerate(aligns): #{
     align = FixAlign(align)
     # REMINDER: use alignment blocks instead!
     WriteBlockCoords(align, id, align_coords_file, use_chr=True)
     #coord_str = "%s %i %i %i" % (align.target,
     #  min(align.tstart, align.tend), max(align.tstart, align.tend),
     #  id)
     #align_coords_file.write(coord_str + "\n")
   #} end for
   align_coords_file.close()
Code Example #11
File: check_status.py Project: ptraverse/gsc
 def CheckStatus(self): #{
   ExtremeDebugMsg(self, "Checking job status: %s" % self.output_path)
   fail_msg = ("cannot open output file for job number %s" % self.num)
   output_file = FileBoxCls(self.output_path, "r", fail_msg)
   self.status = "in progress"
   for output_line in output_file: #{
     ExtremeDebugMsg(self, "  %s" % output_line)
     if (CID_SUCCESS == output_line): #{
       self.status = "complete"
       break
     elif ("" == output_line or CID_FAIL == output_line): #{
       self.status = "failed"
       break
     #} end if
   #} end for
   output_file.close()
   return
Code Example #12
File: fasta.py Project: ptraverse/gsc
 def __init__(self, path, fail_msg="cannot open fasta file",
     log_info=None, line_delim="", maintain_case=False): #{
   self.file = FileBoxCls(path, "r", fail_msg)
   self.line_delim = line_delim
   self.curr_line = None
   self.log_info = log_info
   self.finished = False
   self.maintain_case = maintain_case
Code Example #13
File: integrate.py Project: ptraverse/gsc
 def IntegrateP2GFile(self, p2g_path): #{
   DebugMsg(self, "Integrating pair-to-genome file: %s" % p2g_path)
   group = None
   fail_msg = "cannot open pair-to-genome results file"
   p2g_file = FileBoxCls(p2g_path, "r", fail_msg)
   for p2g_line in p2g_file: #{
     DebugMsg(self, "LINE: %s" % p2g_line)
     # count the group
     self.num_groups += 1
     # parse the pair-to-genome line
     p2g_support = P2GGroupCls(self.options, self.log_info)
     p2g_support.ParseSupportString(p2g_line)
     # check that the group had some reads at least
     if (1 > p2g_support.num_reads): #{
       self.groups_without_reads.append("%i" % p2g_support.group_id)
     #} end if
     # get a group from the groups file
     if (None == group or
         p2g_support.group_id > group.id):
       try: #{
         DebugMsg(self, "Getting next group...")
         group = self.group_parser.GetNextGroup()
       except StopIteration:
         raise P2GIntegratorError \
           ("Unexpected end of groups file: %s\n  while integrating: %s" %
            (self.group_parser.data_file_path, p2g_path))
       #} end try
     # allow for groups having been removed from the groups file
     if (p2g_support.group_id < group.id): #{
       continue
     #} end if
     # ensure that the group ids match up
     if (p2g_support.group_id != group.id): #{
       raise P2GIntegratorError("Inconsistent group ids: %i from %s, " %
         (p2g_support.group_id, p2g_path) +
         "%i from %s" % (group.id, self.options.barnacle_path))
     #} end if
     # add the pair-to-genome support to the group
     self.AddSupportToGroup(group, p2g_support)
     # apply any pair-to-genome filters given
     #self.ApplyFilters(group)
     # write the group to the new output file(s)
     self.WriteGroup(group)
   #} end for
   p2g_file.close()
Code Example #14
 def WriteCounts(self): #{
   # open the counts file
   fail_msg = "Cannot open counts file"
   counts_file = FileBoxCls(self.paths['counts'], "w", fail_msg)
   # write the number of split alignments found
   counts_file.WriteLine("Split: %i" % len(self.candidate_contigs))
   # if gapped alignments were also checked for
   if (self.options.check_gap): #{
     # write the number of gapped alignments found
     msg = "Gapped: "
     if (self.more_than_99): #{
       msg += "at least "
     #} end if
     msg += "%i" % self.num_gapped_aligns
     counts_file.WriteLine(msg)
   #} end if
   counts_file.WriteLine("COMPLETE")
   # close the counts file
   counts_file.close()
Code Example #15
File: integrate.py Project: ptraverse/gsc
class R2CResultsFileCls: #{
  def __init__(self, path, log_info=None): #{
    self.log_info = log_info
    fail_msg = "cannot open read-to-contig support results file"
    self.file = FileBoxCls(path, "r", fail_msg)
    self.integrated = False
    self.curr_member = None
  #} end def

  def __del__(self): #{
    self.file.Close()
  #} end def

  def BeforeGroup(self, group_id): #{
    if (self.integrated): #{
      return False
    #} end if
    if (None == self.curr_member or self.curr_member.group_id < group_id): #{
      return True
    #} end if
    return False
  #} end def

  def GroupIsCurrent(self, group_id): #{
    if (self.integrated or None == self.curr_member): #{
      return False
    #} end if
    if (self.curr_member.group_id == group_id): #{
      return True
    #} end if
    return False
  #} end def

  def GetMember(self): #{
    if (self.integrated): #{
      DebugMsg(self, "Not getting member, file already fully integrated.")
      return
    #} end if
    DebugMsg(self, "Getting member...")
    try: #{
      member_line = self.file.next()
      # create a new member from the current line, store it as "curr_member"
      (member_id, support_list) = member_line.split(" ")
      self.curr_member = R2CMemberCls(member_id, log_info=self.log_info)
      # store support values
      self.curr_member.InitializeSupport(support_list)
      DebugMsg(self, "New member: %s" % self.curr_member.DebugString())
    except StopIteration:
      DebugMsg(self, "Integrated all support from %s" % self.file.path)
      self.curr_member = None
      self.integrated = True
      self.file.Close()
      return
Code Example #16
File: read_simulator.py Project: ptraverse/gsc
 def GenerateEventReads(self):  # {
     LogMsg(self, "Generating event reads...")
     start = time.time()
     seq_file = FastaFileCls(self.options.eseq_path, "cannot read sequences file")
     npairs_file = FileBoxCls(self.options.enreads_path, "w", "cannot create event read counts file")
     cov_file = FileBoxCls(self.options.ecov_path, "r", "cannot read event coverages file")
     # number of sequences from which reads were actually simulated
     nseqs_sim = 0
     for seq_obj in seq_file:  # {
         if len(seq_obj) <= self.options.frag_length:  # {
             LogMsg(
                 self,
                 "Sequence %s shorter than fragment length: "
                 "%i < %i" % (seq_obj.id, len(seq_obj), self.options.frag_length),
             )
             continue
         # } end if
         nseqs_sim += 1
         seq_obj.covered = False
         while not seq_obj.covered:  # {
             cov_line = cov_file.next()
             # coverage = float(cov_line) + self.options.cov_adjust
             coverage = float(cov_line)
             nreads = coverage * (float(len(seq_obj)) / float(self.options.read_length))
             npairs = IntFloor(float(nreads) / 2.0)
             if 1 > npairs:  # {
                 ExtremeDebugMsg(self, "    coverage %.3f too low, no reads." % coverage)
                 continue
             # } end if
             # coverage = nreads * self.options.read_length / len(seq_obj)
             self.SimulateReads(seq_obj, npairs, "e")
         # } end while
         npairs_file.WriteLine("%s %i %f" % (seq_obj.id, npairs, coverage))
     # } end for
     cov_file.Close()
     npairs_file.Close()
     seq_file.Close()
     LogMsg(self, "Simulated reads from %i event sequences" % nseqs_sim)
     LogMsg(self, "Time spent generating event reads: %s" % TimeSpent(start))
Code Example #17
File: gtf_parser.py Project: ptraverse/gsc
class GTFAnnotationParserCls: #{
  def __init__(self, input_path, log_info=None): #{
    self.file = FileBoxCls(input_path, "r",
      "cannot read gene annotations input file")
    self.curr_feature = None
    self.log_info = log_info
    self.finished = False
  #} end def

  def __del__(self): #{
    self.close()
  #} end def

  def __iter__(self): #{
    return self
  #} end def

  def next(self): #{
    if (self.finished): #{
      raise StopIteration
    #} end if
    transcript = None
    try: #{
      if (None == self.curr_feature): #{
        self.ParseFeature()
      #} end if
      transcript = GTFTranscriptCls(name=self.curr_feature.name)
      while (self.curr_feature.name == transcript.name): #{
        transcript.Update(self.curr_feature)
        self.ParseFeature()
      #} end while
    except StopIteration:
      self.finished = True
    #} end try
    if (None == transcript): #{
      raise StopIteration
    #} end if
    transcript.CreateExonList()
    return transcript
  #} end def

  def ParseFeature(self): #{
    #ExtremeDebugMsg(self, "  Parsing feature from file...")
    try: #{
      line = self.file.next()
    except StopIteration, e:
      self.curr_feature = None
      raise e
    #} end try
    tokenizer = TokenizerCls(line, delimiter="\t", log_info=self.log_info)
    self.curr_feature = GTFFeatureCls(tokenizer)
Code Example #18
File: predict_events.py Project: ptraverse/gsc
 def CreateOverlapsFile(self): #{
   if (self.options.use_existing_overlaps): #{
     LogMsg(self, "Using existing breakpoint/transcript overlaps file.")
   else:
     LogMsg(self, "Running overlap code...")
     if (hasattr(self, "log_file") and None != self.log_file): #{
       self.log_file.Flush()
     #} end if
     start = time.time()
     RunOverlapCode(self.options.breakpoint_exons,
       self.options.group_coords_path, self.options.overlaps_path,
       dpt=self.options.dpt)
     LogMsg(self, "Time spent running overlap code: %s" % TimeSpent(start))
   #} end if
   self.overlaps_file = FileBoxCls(self.options.overlaps_path, "r",
     "cannot read exon/group overlaps file")
Code Example #19
File: annotation.py Project: ptraverse/gsc
 def __init__(self, path, type=None, log_info=None): #{
   if (None == type): #{
     type = GetAnnotationsType(path)
   #} end if
   if (type in PARSERS): #{
     self.parser = PARSERS[type](path, log_info=log_info)
     self.file = None
     self.ParseLine = None
   elif (type in PARSE_FUNCTIONS): #{
     self.parser = None
     self.file = FileBoxCls(path, "r",
     "cannot open %s annotations file" % type)
     self.ParseLine = PARSE_FUNCTIONS[type]
   else:
     raise GeneAnnotationError("cannot determine correct annotation parser "
       "from annotations type: %s" % type)
   #} end if
   self.log_info = log_info
   self.finished = False
Code Example #20
File: with_tophat_fusion.py Project: ptraverse/gsc
class TopHatFileCls:  # {
    def __init__(self, path, log_info=None):  # {
        self.file = FileBoxCls(path, "r", "cannot read TopHat-Fusion results file")
        self.log_info = log_info

    # } end def

    def __del__(self):  # {
        self.close()

    # } end def

    def __iter__(self):  # {
        return self

    # } end def

    def next(self):  # {
        # the first line should start with "allAtOnce" and
        # it contains the breakpoint coordinates
        # parse the tophat line
        tophat_event = TopHatEventCls(self.file.next())
        # the next two lines should be "sequence" lines
        tophat_event.CheckSeqLine(self.file.next())
        tophat_event.CheckSeqLine(self.file.next())
        # the next lines should be... scores?
        tophat_event.CheckScoreLine(self.file.next())
        # the next line should have the gene ids
        tophat_event.ParseGenesLine(self.file.next())
        # skip the final line
        self.file.next()
        return tophat_event

    # } end def

    def close(self):  # {
        if hasattr(self, "file") and None != self.file and not self.file.closed:  # {
            self.file.close()
Code Example #21
  def IdentifyCandidateContigs(self, aligns): #{
    # TEMP # ExtremeDebugMsg(self, AlignListString(aligns)

    # open the contig sequences file if using the gap filter
    if (self.options.check_gap): #{
      fail_msg = "Cannot open contig sequence file"
      ctg_seq_file = FileBoxCls(self.paths['ctg_seq'], "r", fail_msg)
    else:
      ctg_seq_file = None
    #} end if

    # iterate over the alignments, grouping them by query (i.e. contig)
    contig_align_index = 0
    while (contig_align_index < len(aligns)): #{
      self.num_contigs += 1
      contig = ContigWithAlignmentsCls(contig_align_index, aligns,
        ctg_seq_file, self.paths['gap_out'], self.options, self.log_info)
      ExtremeDebugMsg(self, "-"*80)
      #DebugMsg(self, "Grouping alignments for "
      #  "%s (contig #%i)..." % (contig.id, self.num_contigs))
      DebugMsg(self, "%i) %s" % (self.num_contigs, contig.id))
      ExtremeDebugMsg(self, "  Contig length: %i" % contig.length)
      #LogMsg(self, "Contig align index: %i" % contig_align_index)

      # Select the alignments to consider for the current contig
      # and check for gapped alignments at the same time
      contig.SelectAlignments(aligns)
      if (contig.single_align_found): #{
        self.num_full_aligns += 1
      #} end if
      if (self.options.check_gap and not contig.perfect_align_found): #{
        contig.CheckGappedAlignments()
        self.gapped_psl_lines.extend(contig.gapped_psl_lines)
        self.num_gapped_aligns += contig.num_gapped_aligns
      #} end if
      if (self.params['check_split'] and
          not self.params['use_quick_chooser']):
        # pare down the alignment groups so that
        # only the best alignments remain
        contig.PareAlignmentGroups()
      #} end if

      #LogMsg(self, "# Gaps Found (Finder): %i" %
      #                         self.num_gapped_aligns)
      contig_align_index += contig.num_aligns_to_contig

      if (0 < len(contig.best_aligns)): #{
        if (self.log_info['debug']): #{
          LogMsg(self, "%i best aligns: %s" %
            (len(contig.best_aligns), contig.id))
          ExtremeDebugMsg(self, AlignListString(contig.best_aligns))
        #} end if
      elif (0 < len(contig.align_groups)): #{
        if (self.log_info['debug']): #{
          ExtremeDebugMsg(self, "-"*40)
          LogMsg(self, "%i align groups: %s" %
            (len(contig.align_groups), contig.id))
          for i, group in enumerate(contig.align_groups): #{
            ExtremeDebugMsg(self, "\n".join(["Group %i" % i,
              "  %i) S:%i E:%i Aligns:%i" % (i, group.ctg_start,
              group.ctg_end, len(group.best_aligns)),
              AlignListString(group.best_aligns)]))
          #} end for
        #} end if
      else: # no best aligns or align groups found
        if (not contig.perfect_align_found and
            not contig.single_align_found): #{
          DebugMsg(self, "No partial aligns selected: %s" % contig.id)
        #} end if
        continue
      #} end if

      # examine pairs of the chosen alignments
      if (self.params['use_quick_chooser']): #{
        self.ExamineBestAlignsPairwise(contig)
      else:
        self.ExamineAlignGroupsPairwise(contig)
      #} end if
    #} end while
    DebugMsg(self, "-"*80)
    # close the contig sequences file if using the gap filter
    if (self.options.check_gap): #{
      ctg_seq_file.close()
Code Example #22
 def Output(self, append): #{
   # open the output file in the appropriate mode
   if append: #{
     mode = "a"
   else:
     mode = "w"
   #} end if
   fail_msg = "Cannot open split alignment output file"
   out = FileBoxCls(self.paths['split_out'], mode, fail_msg)
   if (self.params['output_psl']): #{
     if (self.candidate_contigs[0].align1.method == "blat"): #{
       fail_msg = "Cannot open alignment psl output file"
       psl_out = FileBoxCls(self.paths['psl_out'], mode, fail_msg)
       DebugMsg(self,
         "Writing alignment lines to %s" % self.paths['psl_out'])
       # write out the alignment lines for the gapped alignment events found
       for psl_line in self.gapped_psl_lines: #{
         psl_out.Write(psl_line)
       #} end for
     else:
       # only write out psl lines for blat alignments
       self.params['output_psl'] = False
     #} end if
   #} end if
   # write the split alignment details to the output file
   for candidate_contig in self.candidate_contigs: #{
     # skip non-standard chromosomes
     #chr_patt = r"\A(chr)?(\d+|[XY]|MT?)\Z"
     #if (None == re.search(chr_patt, candidate_contig.align1.target) or
     #    None == re.search(chr_patt, candidate_contig.align2.target)):
     #if (NonStandardChr(candidate_contig.align1.target) or
     #    NonStandardChr(candidate_contig.align2.target)): #{
     #  DebugMsg(self, "Skipping non-standard chromosome: %s/%s" %
     #    (candidate_contig.align1.target, candidate_contig.align2.target))
     #  continue
     #} end if
     #if ("chr" != candidate_contig.align1.target[0:3]): #{
     #  candidate_contig.align1.target = ("chr%s" %
     #    candidate_contig.align1.target)
       #LogMsg(self, "  Target: %s" %
       #                        candidate_contig.align1.target)
       #msg = ("Improperly formatted alignment: %s" %
       #       candidate_contig.Details())
       #raise CandidateIdentifierError(msg)
     #} end if
     #if ("chr" != candidate_contig.align2.target[0:3]): #{
     #  candidate_contig.align2.target = ("chr%s" %
     #    candidate_contig.align2.target)
     #} end if
     candidate_contig.align1.target = AddChr(candidate_contig.align1.target)
     candidate_contig.align2.target = AddChr(candidate_contig.align2.target)
     ExtremeDebugMsg(self, "Writing line to %s:\n  %s" %
       (out.path, candidate_contig.Details()))
     out.WriteLine(candidate_contig.Details())
     if (self.params['output_psl']): #{
       psl_out.Write(candidate_contig.align1.psl())
       psl_out.Write(candidate_contig.align2.psl())
     #} end if
   #} end for
   out.close()
   if (self.params['output_psl']): #{
     psl_out.close()
Code Example #23
File: annotation.py Project: ptraverse/gsc
class GeneAnnotationParserCls: #{
  def __init__(self, path, type=None, log_info=None): #{
    if (None == type): #{
      type = GetAnnotationsType(path)
    #} end if
    if (type in PARSERS): #{
      self.parser = PARSERS[type](path, log_info=log_info)
      self.file = None
      self.ParseLine = None
    elif (type in PARSE_FUNCTIONS): #{
      self.parser = None
      self.file = FileBoxCls(path, "r",
      "cannot open %s annotations file" % type)
      self.ParseLine = PARSE_FUNCTIONS[type]
    else:
      raise GeneAnnotationError("cannot determine correct annotation parser "
        "from annotations type: %s" % type)
    #} end if
    self.log_info = log_info
    self.finished = False
  #} end def

  def __del__(self): #{
    self.close()
  #} end def

  def __iter__(self): #{
    #if (None == self.parser): #{
    #  return self
    #else:
    #  return self.parser
    #} end if
    return self
  #} end def

  def next(self): #{
    if (self.finished): #{
      raise StopIteration
    #} end if
    #ExtremeDebugMsg(self, "Parsing annotation from file...")
    transcript = None
    try:
      if (None == self.parser): #{
        #ExtremeDebugMsg(self, "Using ParseLine function...")
        line = self.file.next()
        transcript = self.ParseLine(line)
      else:
        #ExtremeDebugMsg(self, "Using internal parser...")
        transcript = self.parser.next()
      #} end if
    except StopIteration:
      self.finished = True
    #} end try
    if (None == transcript): #{
      raise StopIteration
    #} end if
    transcript.gene_name = transcript.alias.replace(" ","_")
    transcript.transcript_id = transcript.name.replace(" ","_")
    #ExtremeDebugMsg(self, "Parsing transcript: %s (%s)" %
    #  (transcript.gene_name, transcript.transcript_id))
    return transcript
  #} end def

  def Close(self): #{
    for attr in ["file", "parser"]: #{
      if (hasattr(self, attr) and None != getattr(self, attr)): #{
        getattr(self, attr).close()
      #} end if
    #} end for
  #} end def

  def close(self): #{
    self.Close()
Code Example #24
File: calculate.py Project: ptraverse/gsc
class P2GCalculatorCls: #{
  def __init__(self, options): #{
    SetupMainClass(self, options)
    CheckConfigCommands(self, "samtools")
    self.groups_file = None
    self.output_file = None
    self.options.use_chr = False
  #} end def

  def __del__(self): #{
    # close input and output files, if they are not already closed
    self.CloseFiles()
    CloseLogFile(self)
  #} end def

  def CalculateSupport(self): #{
    start = time.time()
    LogMsg(self, "Adding pair-to-genome support to groups...")
    # open the input and output files
    self.Setup()
    #ExtremeDebugMsg(self, "Should I use chr? %s" % self.options.use_chr)
    # for each group in the input file
    for group_line in self.groups_file: #{
      group_start = time.time()
      # create a group object from the line
      group = P2GGroupCls(self.options, self.log_info)
      group.ParseGroupLine(group_line)
      LogMsg(self, "Group: %i" % group.group_id)
      ExtremeDebugMsg(self, "  %s" % group.ToString())
      # get the pair-to-genome support for the current group
      group.GetPairToGenomeSupport()
      # write the pair-to-genome support for the current group
      self.WritePairToGenomeSupport(group.SupportString())
      ExtremeDebugMsg(self, "Time spent on group: %s" % TimeSpent(group_start))
    #} end for
    # close the input and output files
    self.CloseFiles()
    # remove the temporary samtools output files
    for end in ["", "_1", "_2"]: #{
      temp_sam_path = os.path.join(self.options.output_dir,
        "sam_out_tmp%s" % end)
      if (os.path.isfile(temp_sam_path)): #{
        os.remove(temp_sam_path)
      #} end if
      temp_sam_path += ".err"
      if (os.path.isfile(temp_sam_path)): #{
        os.remove(temp_sam_path)
      #} end if
    #} end for
    LogMsg(self, "Total time adding pair-to-genome support: %s" %
      TimeSpent(start))
  #} end def

  def Setup(self): #{
    fail_msg = "cannot open groups file"
    self.groups_file = FileBoxCls(self.options.barnacle_path, "r", fail_msg)
    output_file_path = self.options.barnacle_path.replace(".data", ".out")
    fail_msg = "cannot create pair-to-genome support output file"
    self.output_file = FileBoxCls(output_file_path, "w", fail_msg)
    # create samtools object and check whether to use "chr" in chromosome IDs
    samtools = SAMToolsCls(self.options.p2g_path, self.options,
      log_info=self.log_info)
    self.options.use_chr = samtools.ShouldChromUseChr()
  #} end def

  def WritePairToGenomeSupport(self, support_string): #{
    self.output_file.WriteLine("%s" % support_string)
  #} end def

  def CloseFiles(self): #{
    if (None != self.groups_file and not self.groups_file.closed): #{
      self.groups_file.close()
      self.groups_file = None
    #} end if
    if (None != self.output_file and not self.output_file.closed): #{
      self.output_file.close()
      self.output_file = None
Code Example #25
File: integrate.py Project: ptraverse/gsc
 def __init__(self, path, log_info=None): #{
   self.log_info = log_info
   fail_msg = "cannot open read-to-contig support results file"
   self.file = FileBoxCls(path, "r", fail_msg)
   self.integrated = False
   self.curr_member = None
Code Example #26
File: gtf_parser.py Project: ptraverse/gsc
 def __init__(self, input_path, log_info=None): #{
   self.file = FileBoxCls(input_path, "r",
     "cannot read gene annotations input file")
   self.curr_feature = None
   self.log_info = log_info
   self.finished = False
Code Example #27
File: with_tophat_fusion.py Project: ptraverse/gsc
 def __init__(self, path, log_info=None):  # {
     self.file = FileBoxCls(path, "r", "cannot read TopHat-Fusion results file")
     self.log_info = log_info
Code Example #28
File: predict_events.py Project: ptraverse/gsc
class EventPredictionCls: #{
  def __init__(self, options): #{
    SetupMainClass(self, options)
    if (not hasattr(self.options, "realign")): #{
      self.options.realign = False
    #} end if
    if (self.options.realign): #{
      CheckConfigCommands(self, "blat")
    #} end if
    self.predictors = dict()
    if (self.options.predict_fusions): #{
      predictor = FusionPredictorCls(options, log_info=self.log_info)
      self.predictors[predictor.key] = predictor
    #} end if
    if (self.options.predict_ptds): #{
      predictor = PTDPredictorCls(options, log_info=self.log_info)
      self.predictors[predictor.key] = predictor
    #} end if
    if (self.options.predict_itds): #{
      predictor = ITDPredictorCls(options, log_info=self.log_info)
      self.predictors[predictor.key] = predictor
    #} end if
    self.use_chr = False
    #self.postpone_gene_check = False
  #} end def

  def __del__(self): #{
    CloseLogFile(self)
  #} end def

  def PredictEvents(self): #{
    LogMsg(self, "Predicting events...")
    start = time.time()
    # get the reference gene names, if a path is given
    #self.ref_gene_names = GetGeneNamesFromFile(self.options.gene_names_path,
    #  self.log_info)
    group_parser = CandidateGroupParserCls(self.options.barnacle_path)
    # recheck breakpoint exons
    self.RecheckBreakpointExons(group_parser)
    realigner = None
    if (self.options.realign): #{
      realigner = RealignerCls(self.options, self.log_info)
    #} end if
    # potential_events[bio_type][group_id] = event and gene sets object
    #potential_events = dict([(predictor.key, dict()) for
    #  predictor in self.predictors])
    LogMsg(self, "Processing candidate groups...")
    process_start = time.time()
    for group in group_parser: #{
      # get the breakpoint exons for the group
      self.GetBreakpointExons(group)
      # check whether the event is any biologically typed event
      #self.CheckEvent(group, output_files, lib_info.lib_name, potential_events)
      # attempt to predict events of each specified type
      # from the current candidate group
      for predictor in self.predictors.itervalues(): #{
        good_members = list()
        if (predictor.ProcessGroup(group, good_members) and
            None != realigner): #{
          #realigner.UpdateContigs(group, good_members, predictor.store_seq)
          realigner.UpdateContigs(group, good_members, predictor.key)
        #} end if
      #} end for
    #} end for
    LogMsg(self, "Time spent processing candidate groups: %s" %
      TimeSpent(process_start))
    if ("itd" in self.predictors and
        0 < self.predictors["itd"].num_over_aligned): #{
      LogMsg(self, "WARNING: %i gap candidates have aligned length greater "
        "than gap length!" % self.predictors["itd"].num_over_aligned)
    #} end if
    #if ('event_coords' in output_files): #{
    #  output_files['event_coords'].Close()
    #  self.RecheckExonOverlap(output_files, potential_events, lib_info.lib_name)
    #} end if
    if (None != realigner and 0 < len(realigner.contigs)): #{
      realigner.RealignContigs()
      LogMsg(self, "Before realignment:")
      for predictor in self.predictors.itervalues(): #{
        LogMsg(self, "  Number of %s predictions: %i" %
          (predictor.description, predictor.num_predictions))
        if (0 == predictor.num_predictions): #{
          continue
        #} end if
        if ("itd" in predictor.key or "fusion" in predictor.key): #{
          predictor.LoadTranscriptSequences(realigner.contigs)
        #} end if
        predictor.ReprocessPredictions(realigner.contigs)
        #predictor.ReprocessPredictions(realigner.contigs,
        #  realigner.contig_seqs)
      #} end for
      LogMsg(self, "%s\nAfter realignment:" % ("-"*40))
    #} end if
    for predictor in self.predictors.itervalues(): #{
      LogMsg(self, "Number of %s predictions: %i" %
        (predictor.description, predictor.num_predictions))
    #} end for
    LogMsg(self, "Time spent predicting events: %s" % TimeSpent(start))
  #} end def

  #def CreateOutputFiles(self, input_path): #{
  #  input_file_name = os.path.basename(input_path)
  #  input_root = os.path.splitext(input_file_name)[0]
  #  output_files = dict()
  #  # setup the coordinates file for rechecking exon overlaps
  #  self.SetupEventCoordsFile(input_root, output_files)
  #  return output_files
  #} end def

  def RecheckBreakpointExons(self, group_parser): #{
    if (None == self.options.breakpoint_exons): #{
      self.overlaps_file = None
      return
    #} end if
    if (self.options.use_existing_group_coords): #{
      LogMsg(self, "Using existing group coordinates file.")
    else:
      group_coords_file = self.CreateGroupCoordsFile()
      for group in group_parser: #{
        #ExtremeDebugMsg(self, "Writing coordinates for group %i" % group.id)
        self.WriteGroupCoords(group, group_coords_file)
      #} end for
      group_parser.Close()
      group_coords_file.Close()
    #} end if
    self.CreateOverlapsFile()
  #} end def

  #def SetupEventCoordsFile(self, input_root, output_files): #{
  def CreateGroupCoordsFile(self): #{
    # check whether to use "chr" in chromosome names in coordinates file
    self.use_chr = ShouldChromUseChr(1, self.options.breakpoint_exons,
      "exon coordinates", self.log_info)
    # open the group coordinates file
    #output_files['event_coords'] = FileBoxCls(group_coords_path, "w",
    group_coords_file = FileBoxCls(self.options.group_coords_path, "w",
      "cannot create event coordinates file")
    #self.postpone_gene_check = True
    return group_coords_file
  #} end def

  #def WriteEventCoords(self, event, group_coords_file): #{
  def WriteGroupCoords(self, event, group_coords_file): #{
    for member in event.members: #{
      if (member.gap): #{
        # write gap event coordinates
        self.WriteGapGroupCoords(member, group_coords_file)
      else:
        # write split event coordinates
        self.WriteSplitGroupCoords(member, group_coords_file)
      #} end if
    #} end for
  #} end def

  def WriteGapGroupCoords(self, member, group_coords_file): #{
    gap_coords = GroupCoordsCls(
      member.align_info_B.chrom,
      min(member.align_info_B.genome_start, member.align_info_B.genome_end),
      max(member.align_info_B.genome_start, member.align_info_B.genome_end),
      "%sA" % member.IDString(),
      self.use_chr
    )
    group_coords_file.WriteLine("%s" % gap_coords.ToString())
  #} end def

  def WriteSplitGroupCoords(self, member, group_coords_file): #{
    split_coords_A = GroupCoordsCls(
      member.align_info_A.chrom,
      member.align_info_A.genome_end - self.options.event_buffer,
      member.align_info_A.genome_end + self.options.event_buffer,
      "%sA" % member.IDString(),
      self.use_chr
    )
    group_coords_file.WriteLine("%s" % split_coords_A.ToString())
    split_coords_B = GroupCoordsCls(
      member.align_info_B.chrom,
      member.align_info_B.genome_start - self.options.event_buffer,
      member.align_info_B.genome_start + self.options.event_buffer,
      "%sB" % member.IDString(),
      self.use_chr
    )
    group_coords_file.WriteLine("%s" % split_coords_B.ToString())
  #} end def

  #def RecheckExonOverlap(self, output_files, potential_events, lib_name): #{
  #  LogMsg(self, "Rechecking exon overlap...")
  #  start = time.time()
  #  # run overlap code
  #  overlaps_path = self.RunOverlapCode(output_files['group_coords'].path)
  #  try: #{
  #    # parse overlap code output
  #    self.ParseOverlapResults(overlaps_path, potential_events)
  #  except ACEventGroupError, e:
  #    raise EventPredictionError("error parsing overlap file: %s" % e)
  #  #} end try
  #  self.ProcessPotentialEvents(potential_events, output_files, lib_name)
  #  LogMsg(self, "Time spent rechecking exon overlaps: %s" % TimeSpent(start))
  #} end def

  def CreateOverlapsFile(self): #{
    if (self.options.use_existing_overlaps): #{
      LogMsg(self, "Using existing breakpoint/transcript overlaps file.")
    else:
      LogMsg(self, "Running overlap code...")
      if (hasattr(self, "log_file") and None != self.log_file): #{
        self.log_file.Flush()
      #} end if
      start = time.time()
      RunOverlapCode(self.options.breakpoint_exons,
        self.options.group_coords_path, self.options.overlaps_path,
        dpt=self.options.dpt)
      LogMsg(self, "Time spent running overlap code: %s" % TimeSpent(start))
    #} end if
    self.overlaps_file = FileBoxCls(self.options.overlaps_path, "r",
      "cannot read exon/group overlaps file")
    #self.GetNextExonOverlap()
  #} end def

  def GetBreakpointExons(self, group): #{
    if (not hasattr(self, "overlaps_file") or None == self.overlaps_file): #{
      return
    #} end if
    ExtremeDebugMsg(self, "Getting breakpoint exons for group %i" % group.id)
    # clear any previous breakpoint genes
    group.ClearBPGenes()
    if (not hasattr(self, "curr_overlap")): #{
      self.curr_overlap = None
    #} end if
    # skip overlaps for groups that come before the current group
    while (None == self.curr_overlap or
        self.curr_overlap.group_id < group.id): #{
      try: #{
        self.GetNextExonOverlap()
      except StopIteration:
        return
      #} end try
    #} end while
    # create a dictionary of the members of the current group
    members_dict = dict()
    for member in group.members: #{
      members_dict[member.candidate_id] = member
    #} end for
    # get all overlaps for the current group
    while (self.curr_overlap.group_id == group.id): #{
      if (self.curr_overlap.member_id in members_dict): #{
        self.AddBreakPointGene(members_dict[self.curr_overlap.member_id])
      #} end if
      try: #{
        self.GetNextExonOverlap()
      except StopIteration:
        return
      #} end try
    #} end while
  #} end def

  def GetNextExonOverlap(self): #{
    if (not hasattr(self, "overlaps_file") or None == self.overlaps_file): #{
      ExtremeDebugMsg(self, "Setting current overlap to \"None\".")
      self.curr_overlap = None
      return
    #} end if
    overlap = ExonOverlapCls(self.overlaps_file.next())
    self.curr_overlap = overlap
    ExtremeDebugMsg(self, "Current overlap = G%i%s r%s exons: %s" %
      (overlap.group_id, overlap.member_id, overlap.region_id,
      ",".join(overlap.exons)))
  #} end def

  def AddBreakPointGene(self, member): #{
    if (None == self.curr_overlap): #{
      return
    #} end if
    if (self.curr_overlap.group_id != member.group_id): #{
      raise EventPredictionError("Group ID: %i does not match overlap ID: %i" %
        (member.group_id, self.curr_overlap.group_id))
    #} end if
    if (self.curr_overlap.member_id != member.candidate_id): #{
      raise EventPredictionError("Candidate ID: %s " % member.candidate_id +
        "does not match overlap ID: %s" % self.curr_overlap.member_id)
    #} end if
    member.AddGenes("breakpoint_%s" % self.curr_overlap.region_id,
      self.curr_overlap.exons)
Code Example #29
class CandidateGroupParserCls: #{
  def __init__(self, data_file_path, keep_lines=False, check_data=False): #{
    CheckFilePath(data_file_path, "candidate group file")
    self.group_parser = GroupParserCls(keep_lines=keep_lines)
    self.check_data = check_data
    fail_message = "cannot open data file"
    self.data_file = FileBoxCls(data_file_path, "r", fail_message)
    self.groups = list()
  #} end def

  def __del__(self): #{
    # close data file if it is open
    self.CloseDataFile()
  #} end def

  def __iter__(self): #{
    return self
  #} end def

  # Load the entire data file into memory
  # Do not mix with using GetNextGroup() method
  def ParseDataFile(self): #{
    #self.OpenDataFile()
    for group_line in self.data_file: #{
      #group_line = CleanLine(group_line)
      # skip blank lines
      #if ("" == group_line): #{
      #  continue
      #} end if
      self.groups.append(self.group_parser.ParseGroup \
        (group_line, self.data_file, check_data=self.check_data))
    #} end for
    self.CloseDataFile()
    return self.groups
  #} end def

  # Load a single group from the data file into memory
  # Do not mix with using ParseDataFile() method
  def GetNextGroup(self): #{
    return self.next()
  #} end def

  def next(self): #{
    #if (None == self.data_file): #{
    #  self.OpenDataFile()
    #} end if
    group_line = ""
    # skip blank lines
    while ("" == group_line): #{
      #group_line = CleanLine(self.data_file.next())
      group_line = self.data_file.next()
    #} end if
    return self.group_parser.ParseGroup \
      (group_line, self.data_file, check_data=self.check_data)
  #} end def

  def Close(self): #{
    self.CloseDataFile()
  #} end def

  def CloseDataFile(self): #{
    if (not hasattr(self, "data_file")): #{
      return
    #} end if
    if (None == self.data_file): #{
      return
    #} end if
    if (self.data_file.closed): #{
      return
    #} end if
    self.data_file.close()
    #self.data_file = None
  #} end def

  def close(self): #{
    self.CloseDataFile()
  #} end def

  def GroupLine(self): #{
    if (not self.group_parser.keep_lines): #{
      raise CandidateGroupParserError \
        ("cannot get group line when keep_lines flag was not set")
    #} end if
    return self.group_parser.group_line
  #} end def

  def MemberLines(self): #{
    if (not self.group_parser.keep_lines): #{
      raise CandidateGroupParserError \
        ("cannot get member lines when keep_lines flag was not set")
    #} end if
    return self.group_parser.member_lines