Example #1
0
def main():
  """
  Command line entry point: pull the closed captions carried on one PID of an MPEG
  Transport Stream file and hand them to an ASSFormatter.

  Command line arguments (read by argparse from sys.argv):
    infile: Input filename (MPEG2 Transport Stream File).
    pid: Pid of the closed caption ES to extract from the stream.

  The formatter is built on an ASSFile created for infile + '_ENG.ass'. If infile
  does not exist a message is printed and the process exits with status -1.
  :return: None
  """
  #imported here so the exit call below does not depend on the module level imports
  import sys

  parser = argparse.ArgumentParser(description='Auto translate jp CCs in MPEG TS file.')
  parser.add_argument('infile', help='Input filename (MPEG2 Transport Stream File)', type=str)
  parser.add_argument('pid', help='Pid of closed caption ES to extract from stream.', type=int)
  args = parser.parse_args()

  pid = args.pid
  infilename = args.infile
  if not os.path.exists(infilename):
    print('Please provide input Transport Stream file.')
    #the os module has no exit() function; sys.exit() is what ends the process with a status
    sys.exit(-1)

  #open an Ass file and formatter
  ass_file = ASSFile(infilename+'_ENG.ass')
  ass = ASSFormatter(ass_file)

  #CC data is not, in itself timestamped, so we've got to use packet info
  #to reconstruct the timing of the closed captions (i.e. how many seconds into
  #the file are they shown?)
  initial_timestamp = 0
  pes = []
  elapsed_time_s = 0
  for packet in next_ts_packet(infilename):
    #always process timestamp info, regardless of PID
    adaptation_field = packet.adapatation_field()
    current_timestamp = adaptation_field.PCR() if adaptation_field else None
    if current_timestamp:
      #the first non-zero PCR seen becomes the zero point of the time line
      initial_timestamp = initial_timestamp or current_timestamp
      delta = current_timestamp - initial_timestamp
      #NOTE(review): dividing by 90000 presumes PCR() reports 90 kHz ticks -- confirm
      elapsed_time_s = float(delta)/90000.0

    #only the stream PID we're interested in is used to reconstruct the ES
    if packet.pid() != pid:
      continue

    #a payload start begins a fresh PES buffer, anything else continues the current one
    if packet.payload_start():
      pes = copy.deepcopy(packet.payload())
    else:
      pes.extend(packet.payload())
    pes_packet = PESPacket(pes)

    #only once the packet is fully formed (payload all present) can its contents be parsed
    if pes_packet.length() != (pes_packet.header_size() + pes_packet.payload_size()):
      continue

    data_group = DataGroup(pes_packet.payload())
    if data_group.is_management_data():
      continue

    #We now have a Data Group that contains caption data. Its payload is further
    #divided into 'Data Unit' structures, which we iterate via another generator.
    for data_unit in next_data_unit(data_group.payload()):
      #we're only interested in those Data Units which are "statement body" to get CC data.
      if isinstance(data_unit.payload(), StatementBody):
        ass.format(data_unit.payload().payload(), elapsed_time_s)
Example #2
0
def OnESPacket(current_pid, packet, header_size):
  """
  Callback invoked on the successful extraction of an Elementary Stream packet from the
  Transport Stream file packets.

  Caption text produced by the module level formatter() is printed (UTF-8 encoded) when
  VERBOSE is set. While the module level pid is negative every PID is examined; the
  first PID that yields a statement body becomes the only PID processed afterwards.
  :param current_pid: The TS Program ID for the TS packets this info originated from
  :param packet: The ENTIRE ES packet, header and payload-- which may have been assembled
    from multiple TS packet payloads.
  :param header_size: Size of the header in bytes (characters in the string). Provided to more
    easily separate the packet into header and payload. Not used by this callback.
  :return: None
  """
  #pid is the only module level name assigned here; VERBOSE, SILENT and
  #elapsed_time_s are only read.
  global pid

  if pid >= 0 and current_pid != pid:
    return

  try:
    payload = ES.get_pes_payload(packet)
    data_group = DataGroup(list(payload))
    if data_group.is_management_data():
      return

    #We now have a Data Group that contains caption data.
    #We take out its payload, but this is further divided into 'Data Unit' structures
    caption = data_group.payload()
    #iterate through the Data Units in this payload via another generator.
    for data_unit in next_data_unit(caption):
      #we're only interested in those Data Units which are "statement body" to get CC data.
      if not isinstance(data_unit.payload(), StatementBody):
        continue

      if pid < 0:
        #lock onto this PID whatever the verbosity; previously the lock-in only
        #happened when the message below was printed, so quiet runs kept scanning
        #every PID.
        pid = current_pid
        if VERBOSE and not SILENT:
          print("Found Closed Caption data in PID: " + str(pid))
          print("Will now only process this PID to improve performance.")

      #okay. Finally we've got a data unit with CC data. Feed its payload to the custom
      #formatter function.
      cc = formatter(data_unit.payload().payload(), elapsed_time_s)
      if cc and VERBOSE:
        #according to best practice, always deal internally with UNICODE, and encode to
        #your encoding of choice as late as possible. Here, i'm encoding as UTF-8 for
        #my command line.
        #DECODE EARLY, ENCODE LATE
        print(cc.encode('utf-8'))
  except EOFError:
    pass
  except Exception:
    #best effort: report (when asked to) and carry on with the next packet
    if VERBOSE and not SILENT and pid >= 0:
      print("Exception thrown while handling DataGroup in ES. This may be due to many factors "
         + "such as file corruption or the .ts file using as yet unsupported features.")
      traceback.print_exc(file=sys.stdout)
Example #3
0
def OnESPacket(current_pid, packet, header_size):
  """
  Callback invoked on the successful extraction of an Elementary Stream packet from the
  Transport Stream file packets.

  Caption statement bodies are passed to the module level ASSFormatter (created on first
  use). While the module level pid is negative, the first PID whose management data
  reports at least one language becomes the only PID processed afterwards.
  :param current_pid: The TS Program ID for the TS packets this info originated from
  :param packet: The ENTIRE ES packet, header and payload-- which may have been assembled
    from multiple TS packet payloads.
  :param header_size: Size of the header in bytes (characters in the string). Provided to more
    easily separate the packet into header and payload. Not used by this callback.
  :return: None
  :raises FileOpenError: re-raised unchanged so that it can stop the application.
  """
  #pid and ass are the only module level names assigned here; SILENT, tmax,
  #outfilename and elapsed_time_s are only read.
  global pid
  global ass

  if pid >= 0 and current_pid != pid:
    return

  try:
    payload = ES.get_pes_payload(packet)
    data_group = DataGroup(list(payload))

    if data_group.is_management_data():
      # management data
      management_data = data_group.payload()
      numlang = management_data.num_languages()
      if pid < 0 and numlang > 0:
        if not SILENT:
          for language in range(numlang):
            print("Closed caption management data for language: "
              + management_data.language_code(language)
              + " available in PID: " + str(current_pid))
          # said once, not once per language
          print("Will now only process this PID to improve performance.")
        pid = current_pid
      return

    #We now have a Data Group that contains caption data.
    #We take out its payload, but this is further divided into 'Data Unit' structures
    caption = data_group.payload()
    #iterate through the Data Units in this payload via another generator.
    for data_unit in next_data_unit(caption):
      #we're only interested in those Data Units which are "statement body" to get CC data.
      if not isinstance(data_unit.payload(), StatementBody):
        continue

      # the formatter is only created once there is caption data to give it
      if not ass:
        ass = ASSFormatter(tmax=tmax, video_filename=outfilename, verbose=not SILENT)

      ass.format(data_unit.payload().payload(), elapsed_time_s)

  except EOFError:
    pass
  except FileOpenError:
    # allow IOErrors to kill application; a bare raise keeps the original traceback
    raise
  except Exception:
    # best effort: report (unless silenced) and carry on with the next packet
    if not SILENT and pid >= 0:
      print("Exception thrown while handling DataGroup in ES. This may be due to many factors "
        + "such as file corruption or the .ts file using as yet unsupported features.")
      traceback.print_exc(file=sys.stdout)
Example #4
0
def main():
  """
  Command line entry point: pull the closed captions carried on one PID of an MPEG
  Transport Stream file and hand them to an ASSFormatter.

  Command line arguments (read by argparse from sys.argv):
    infile: Input filename (MPEG2 Transport Stream File).
    pid: Pid of the closed caption ES to extract from the stream.

  The formatter is built on an ASSFile created for infile + '.ass'. Packets that raise
  while being parsed are skipped. If infile does not exist a message is printed and the
  process exits with status -1.
  :return: None
  """
  #imported here so the exit call below does not depend on the module level imports
  import sys

  parser = argparse.ArgumentParser(description='Draw CC Packets from MPG2 Transport Stream file.')
  parser.add_argument('infile', help='Input filename (MPEG2 Transport Stream File)', type=str)
  parser.add_argument('pid', help='Pid of closed caption ES to extract from stream.', type=int)
  args = parser.parse_args()

  pid = args.pid
  infilename = args.infile
  if not os.path.exists(infilename):
    print('Please provide input Transport Stream file.')
    #the os module has no exit() function; sys.exit() is what ends the process with a status
    sys.exit(-1)

  #open an Ass file and formatter
  ass_file = ASSFile(infilename+'.ass')
  ass = ASSFormatter(ass_file)

  #CC data is not, in itself timestamped, so we've got to use packet info
  #to reconstruct the timing of the closed captions (i.e. how many seconds into
  #the file are they shown?)
  initial_timestamp = 0
  pes = []
  elapsed_time_s = 0
  for packet in next_ts_packet(infilename):
    #always process timestamp info, regardless of PID
    adaptation_field = packet.adapatation_field()
    current_timestamp = adaptation_field.PCR() if adaptation_field else None
    if current_timestamp:
      #the first non-zero PCR seen becomes the zero point of the time line
      initial_timestamp = initial_timestamp or current_timestamp
      delta = current_timestamp - initial_timestamp
      #NOTE(review): dividing by 90000 presumes PCR() reports 90 kHz ticks -- confirm
      elapsed_time_s = float(delta)/90000.0

    #only the stream PID we're interested in is used to reconstruct the ES
    if packet.pid() != pid:
      continue

    try:
      #a payload start begins a fresh PES buffer, anything else continues the current one
      if packet.payload_start():
        pes = copy.deepcopy(packet.payload())
      else:
        pes.extend(packet.payload())
      pes_packet = PESPacket(pes)

      #only once the packet is fully formed (payload all present) can its contents be parsed
      if pes_packet.length() != (pes_packet.header_size() + pes_packet.payload_size()):
        continue

      data_group = DataGroup(pes_packet.payload())
      if data_group.is_management_data():
        continue

      #We now have a Data Group that contains caption data. Its payload is further
      #divided into 'Data Unit' structures, which we iterate via another generator.
      for data_unit in next_data_unit(data_group.payload()):
        #we're only interested in those Data Units which are "statement body" to get CC data.
        if isinstance(data_unit.payload(), StatementBody):
          ass.format(data_unit.payload().payload(), elapsed_time_s)
    except Exception:
      #best effort: a packet that cannot be parsed is skipped. Catching Exception
      #rather than everything lets Ctrl-C and sys.exit() still stop the program.
      pass
Example #5
0
def main():
    """
    Command line entry point: find ARIB closed captions in an MPEG TS file and hand
    them to an ASSFormatter.

    Command line arguments (read by argparse from sys.argv):
      infile: Input filename (MPEG2 Transport Stream File).
      -p/--pid: PID known to contain closed caption info; default -1 means "search".
      -v/--verbose: verbose output (also switches the progress meter off).
      -q/--quiet: do not write to stdout.
      -t/--tmax: subtitle display time limit in seconds, default 5.
      -o/--timeoffset: offset in seconds added to every elapsed time, default 0.0.

    The ASSFile (infile + '.ass') and the formatter are only created once a caption
    statement body has actually been found. Packets that raise while being parsed are
    skipped. If infile does not exist a message is printed and the process exits with
    status -1.
    :return: None
    """
    parser = argparse.ArgumentParser(
        description=
        'Remove ARIB formatted Closed Caption information from an MPEG TS file and format the results as a standard .ass subtitle file.'
    )
    parser.add_argument('infile',
                        help='Input filename (MPEG2 Transport Stream File)',
                        type=str)
    parser.add_argument(
        '-p',
        '--pid',
        help=
        'Specify a PID of a PES known to contain closed caption info (tool will attempt to find the proper PID if not specified.).',
        type=int,
        default=-1)
    parser.add_argument('-v',
                        '--verbose',
                        help='Verbose output.',
                        action='store_true')
    parser.add_argument('-q',
                        '--quiet',
                        help='Does not write to stdout.',
                        action='store_true')
    parser.add_argument('-t',
                        '--tmax',
                        help='Subtitle display time limit (seconds).',
                        type=int,
                        default=5)
    parser.add_argument(
        '-o',
        '--timeoffset',
        help=
        'Shift all time values in generated .ass file by indicated floating point offset in seconds.',
        type=float,
        default=0.0)
    args = parser.parse_args()

    pid = args.pid
    infilename = args.infile
    quiet = args.quiet
    verbose = args.verbose
    tmax = args.tmax
    time_offset = args.timeoffset

    if not os.path.exists(infilename):
        print('Input filename :' + infilename + " does not exist.")
        #the os module has no exit() function; sys.exit() is what ends the process with a status
        sys.exit(-1)

    #the Ass file and formatter are created lazily, on the first caption found
    ass_file = None
    ass = None

    #CC data is not, in itself timestamped, so we've got to use packet info
    #to reconstruct the timing of the closed captions (i.e. how many seconds into
    #the file are they shown?)
    initial_timestamp = 0
    pes = []
    elapsed_time_s = 0
    # get filesize for progress meter
    total_filesize = os.path.getsize(infilename)
    read_size = 0
    percent_read = 0
    prev_percent_read = percent_read
    show_progress = not quiet and not verbose
    if show_progress:
        #show initial progress information
        sys.stdout.write("progress: %d%%   \r" % (percent_read))
        sys.stdout.flush()

    for packet in next_ts_packet(infilename):
        read_size += TSPacket.PACKET_SIZE_BYTES
        percent_read = ((read_size / float(total_filesize)) * 100)
        #compared in hundredths of a percent so the meter only redraws on a visible change
        new_percent_read = int(percent_read * 100)
        if show_progress and new_percent_read != prev_percent_read:
            prev_percent_read = new_percent_read
            sys.stdout.write("progress: %.2f%%   \r" % (percent_read))
            sys.stdout.flush()

        #always process timestamp info, regardless of PID
        adaptation_field = packet.adapatation_field()
        current_timestamp = adaptation_field.PCR() if adaptation_field else None
        if current_timestamp:
            #the first non-zero PCR seen becomes the zero point of the time line
            initial_timestamp = initial_timestamp or current_timestamp
            delta = current_timestamp - initial_timestamp
            #NOTE(review): dividing by 90000 presumes PCR() reports 90 kHz ticks -- confirm
            elapsed_time_s = float(delta) / 90000.0 + time_offset

        #once a PID is known, packets of any other PID are ignored
        if pid >= 0 and pid != packet.pid():
            continue

        try:
            #NOTE(review): while pid < 0 the packets of every PID share this one
            #buffer, so interleaved PIDs can end up mixed in it -- confirm whether
            #a buffer per PID is needed.
            if packet.payload_start():
                pes = copy.deepcopy(packet.payload())
            else:
                pes.extend(packet.payload())
            pes_packet = PESPacket(pes)

            #only once the packet is fully formed (payload all present) can its contents be parsed
            if pes_packet.length() != (pes_packet.header_size() +
                                       pes_packet.payload_size()):
                continue

            data_group = DataGroup(pes_packet.payload())
            if data_group.is_management_data():
                continue

            #We now have a Data Group that contains caption data. Its payload is further
            #divided into 'Data Unit' structures, which we iterate via another generator.
            for data_unit in next_data_unit(data_group.payload()):
                #we're only interested in those Data Units which are "statement body" to get CC data.
                if not isinstance(data_unit.payload(), StatementBody):
                    continue

                # only write the file if we've actually found some Closed Captions
                if not ass_file:
                    ass_file = ASSFile(infilename + '.ass')
                if not ass:
                    ass = ASSFormatter(ass_file,
                                       tmax=tmax,
                                       video_filename=infilename)

                ass.format(data_unit.payload().payload(), elapsed_time_s)
                if pid < 0:
                    pid = packet.pid()
                    #--quiet promises no output on stdout
                    if not quiet:
                        print("Found Closed Caption data in PID: " + str(pid))
                        print(
                            "Will now only process this PID to improve performance."
                        )
        except Exception:
            #best effort: a packet that cannot be parsed is skipped. Catching Exception
            #rather than everything lets Ctrl-C and sys.exit() still stop the program.
            pass

    if (pid < 0 or not ass) and not quiet:
        print("Did not find any Closed Caption data in the file " + infilename)
Example #6
0
def OnESPacket(current_pid, packet, header_size):
  """
  Callback invoked on the successful extraction of an Elementary Stream packet from the
  Transport Stream file packets.

  Management data is used to pick the caption PID: while the module level pid is
  negative, the first PID whose management data reports at least one language is stored
  in pid and every other PID is ignored from then on. Caption statement bodies are
  passed to the module level ASSFormatter, which is created on first use.
  :param current_pid: The TS Program ID for the TS packets this info originated from
  :param packet: The ENTIRE ES packet, header and payload-- which may have been assembled
    from multiple TS packet payloads.
  :param header_size: Size of the header in bytes (characters in the string). Provided to more
    easily separate the packet into header and payload. Not used by this callback.
  :return: None
  :raises FileOpenError: re-raised unchanged so that it can stop the application.
  """
  #only pid and ass are rebound below; the other module level settings
  #(SILENT, tmax, outfilename, elapsed_time_s) are read without a global statement.
  global pid
  global ass

  if pid >= 0 and current_pid != pid:
    return

  try:
    data_group = DataGroup(list(ES.get_pes_payload(packet)))

    if not data_group.is_management_data():
      #We now have a Data Group that contains caption data.
      #Its payload is further divided into 'Data Unit' structures, walked via a generator.
      for data_unit in next_data_unit(data_group.payload()):
        #we're only interested in those Data Units which are "statement body" to get CC data.
        if not isinstance(data_unit.payload(), StatementBody):
          continue

        # the formatter is only created once there is caption data to give it
        if not ass:
          ass = ASSFormatter(tmax=tmax, video_filename=outfilename, verbose=not SILENT)

        ass.format(data_unit.payload().payload(), elapsed_time_s)
    else:
      # management data
      management_data = data_group.payload()
      numlang = management_data.num_languages()
      if pid < 0 and numlang > 0:
        if not SILENT:
          for language in range(numlang):
            print("Closed caption management data for language: "
              + management_data.language_code(language)
              + " available in PID: " + str(current_pid))
          # said once, not once per language
          print("Will now only process this PID to improve performance.")
        pid = current_pid

  except EOFError:
    pass
  except FileOpenError:
    # allow IOErrors to kill application; a bare raise keeps the original traceback
    raise
  except Exception:
    # best effort: report (unless silenced) and carry on with the next packet
    if not SILENT and pid >= 0:
      print("Exception thrown while handling DataGroup in ES. This may be due to many factors "
        + "such as file corruption or the .ts file using as yet unsupported features.")
      traceback.print_exc(file=sys.stdout)
Example #7
0
def main():
    """
    Command line entry point: pull the closed captions carried on one PID of an MPEG
    Transport Stream file and hand them to an ASSFormatter.

    Command line arguments (read by argparse from sys.argv):
      infile: Input filename (MPEG2 Transport Stream File).
      pid: Pid of the closed caption ES to extract from the stream.

    The formatter is built on an ASSFile created for infile + '_ENG.ass'. If infile
    does not exist a message is printed and the process exits with status -1.
    :return: None
    """
    #imported here so the exit call below does not depend on the module level imports
    import sys

    parser = argparse.ArgumentParser(
        description='Auto translate jp CCs in MPEG TS file.')
    parser.add_argument('infile',
                        help='Input filename (MPEG2 Transport Stream File)',
                        type=str)
    parser.add_argument(
        'pid',
        help='Pid of closed caption ES to extract from stream.',
        type=int)
    args = parser.parse_args()

    pid = args.pid
    infilename = args.infile
    if not os.path.exists(infilename):
        print('Please provide input Transport Stream file.')
        #the os module has no exit() function; sys.exit() is what ends the process with a status
        sys.exit(-1)

    #open an Ass file and formatter
    ass_file = ASSFile(infilename + '_ENG.ass')
    ass = ASSFormatter(ass_file)

    #CC data is not, in itself timestamped, so we've got to use packet info
    #to reconstruct the timing of the closed captions (i.e. how many seconds into
    #the file are they shown?)
    initial_timestamp = 0
    pes = []
    elapsed_time_s = 0
    for packet in next_ts_packet(infilename):
        #always process timestamp info, regardless of PID
        adaptation_field = packet.adapatation_field()
        current_timestamp = adaptation_field.PCR() if adaptation_field else None
        if current_timestamp:
            #the first non-zero PCR seen becomes the zero point of the time line
            initial_timestamp = initial_timestamp or current_timestamp
            delta = current_timestamp - initial_timestamp
            #NOTE(review): dividing by 90000 presumes PCR() reports 90 kHz ticks -- confirm
            elapsed_time_s = float(delta) / 90000.0

        #only the stream PID we're interested in is used to reconstruct the ES
        if packet.pid() != pid:
            continue

        #a payload start begins a fresh PES buffer, anything else continues the current one
        if packet.payload_start():
            pes = copy.deepcopy(packet.payload())
        else:
            pes.extend(packet.payload())
        pes_packet = PESPacket(pes)

        #only once the packet is fully formed (payload all present) can its contents be parsed
        if pes_packet.length() != (pes_packet.header_size() +
                                   pes_packet.payload_size()):
            continue

        data_group = DataGroup(pes_packet.payload())
        if data_group.is_management_data():
            continue

        #We now have a Data Group that contains caption data. Its payload is further
        #divided into 'Data Unit' structures, which we iterate via another generator.
        for data_unit in next_data_unit(data_group.payload()):
            #we're only interested in those Data Units which are "statement body" to get CC data.
            if isinstance(data_unit.payload(), StatementBody):
                ass.format(data_unit.payload().payload(), elapsed_time_s)