Ejemplo n.º 1
0
  def test_html_scan(self):
    """Scan a remote HTML page with a Subtitle instance and check the resulting counts."""
    # Runs the dump test first; presumably it produces the lexicon file at
    # self.pkl that is loaded below -- TODO confirm against test_dumpData.
    self.test_dumpData()

    scanner = Subtitle(loglevel=logging.DEBUG)
    scanner.setLexiconFile(self.pkl)
    scanner.loadOldData()
    scanner.addFile('https://selenium-python.readthedocs.org/en/latest/index.html')
    scanner.parse()

    # Expected values are forwarded to sub_assert as keyword arguments.
    expected = dict(lex=4156, stem_lex=3104, words=127, stem_words=71, new_words=32)
    self.sub_assert(scanner, **expected)
Ejemplo n.º 2
0
  def test_txt_scan(self):
    """Scan a local .srt subtitle file and check the counts and the name handling."""
    # Runs the dump test first; presumably it produces the lexicon file at
    # self.pkl that is loaded below -- TODO confirm against test_dumpData.
    self.test_dumpData()

    scanner = Subtitle(loglevel=logging.DEBUG)
    scanner.setLexiconFile(self.pkl)
    scanner.loadOldData()
    scanner.addFile('../data/srt/Lord.of.War.eng.480p.SDHF-NORMTEAM.srt')
    scanner.parse()

    # Expected values are forwarded to sub_assert as keyword arguments.
    expected = dict(lex=4156, stem_lex=3104, words=1936, stem_words=943, new_words=518)
    self.sub_assert(scanner, **expected)

    movie_names = ['Yuri', 'Simeon']

    # Every listed name has to be present in nameSet ...
    for name in movie_names:
      assert name in scanner.nameSet

    # ... and its lower-cased form must be absent from newWords.
    for name in movie_names:
      assert name.lower() not in scanner.newWords
Ejemplo n.º 3
0
def main(argv=None, logger=None):
  """Command-line entry point: parse options, run a Subtitle scan and print the results.

  Args:
    argv: list of command-line arguments without the program name. When None,
      sys.argv[1:] is used.
    logger: logger handed to Subtitle and used for diagnostics. When None, one
      is created with createLog(logname="subtitle", level=logging.INFO).

  Options handled here:
    -h, -?, --help     call usage() and exit
    -v, --version      call version() and exit
    -c, --checkup      set sub.checkup to True
    -d, --dir          not offered yet; prints a notice and exits
    -p, --pickle FILE  lexicon file passed to sub.setLexiconFile
    -f, --file FILE    input passed to sub.addFile
    -w, --word WORD    word passed to sub.addWord
    -t, --type TYPE    'word' or 'scan'; 'word' makes the run end with sub.dumpData()
    -m, --limit N      integer passed to sub.words_show; also enables the word listing
    -l                 call sub.lines_show() after parsing
    -W                 call sub.words_show() after parsing
    -D                 switch the logger to DEBUG level

  Positional arguments left over by getopt are passed to sub.addFiles.

  Exits through sys.exit(): status 2 on option errors, default status for
  help/version/-d and for an unsupported --type value.
  """
  if logger is None:
    logger = createLog(logname="subtitle", level=logging.INFO)
  if argv is None:
    argv = sys.argv[1:]

  startDtime = datetime.now()
  # Single-argument print(...) produces the same output under the print statement.
  print("Start time: " + str(startDtime))
  print("")
  sub = Subtitle(logger)

  try:
    # "checkup" and "file=" must be separate list entries: adjacent string
    # literals are concatenated, which would register a single bogus
    # "checkupfile=" long option instead of --checkup and --file.
    opts, args = getopt.getopt(
      argv,
      "hvf:w:t:d:p:?lm:WDc",
      ["help", "version", "checkup", "file=", "word=", "type=", "dir=", "pickle=", "limit="])
    logger.info("opts:{0};args:{1}".format(opts, args))
  except getopt.GetoptError as msg:
    print("error happened when get options!!! error:{0}".format(msg))
    usage()
    logger.error("getopt.GetoptError:{0}, exit!".format(msg))
    sys.exit(2)
  except Exception as msg:
    logger.error("error:{0}, exit!".format(msg))
    sys.exit(2)

  _is_lines_show = False
  _is_words_show = False
  sub_type = ""
  words_limit = None
  for opt, arg in opts:
    if opt in ("-?", "-h", "--help"):
      usage()
      sys.exit()
    elif opt in ("-v", "--version"):
      version()
      sys.exit()
    elif opt in ("-c", "--checkup"):
      sub.checkup = True
    elif opt in ("-d", "--dir"):
      print("Sorry, -d --dir option still not offer")
      sys.exit()
    elif opt in ("-p", "--pickle"):
      sub.setLexiconFile(arg)
    elif opt in ("-f", "--file"):
      sub.addFile(arg)
    elif opt == "-D":
      logger.setLevel(logging.DEBUG)
      sub.setLogger(logger)
    elif opt in ("-w", "--word"):
      sub.addWord(arg)
    elif opt in ("-t", "--type"):
      sub_type = arg
      if sub_type not in ('word', 'scan'):
        usage()
        sys.exit()
    elif opt in ("-m", "--limit"):
      # A non-numeric limit is an option error like any other: report it,
      # show usage and exit with status 2 instead of an uncaught ValueError.
      try:
        words_limit = int(arg)
      except ValueError:
        print("error happened when get options!!! error:invalid limit {0!r}".format(arg))
        usage()
        logger.error("invalid limit:{0!r}, exit!".format(arg))
        sys.exit(2)
      _is_words_show = True
    elif opt == "-l":
      _is_lines_show = True
    elif opt == "-W":
      _is_words_show = True

  # Fall back to the default lexicon file when -p/--pickle was not given.
  if sub.lexicon_path is None:
    sub.setLexiconFile("lexicon.pickle")
  sub.loadOldData()

  sub.addFiles(args)

  sub.parse()

  if _is_lines_show:
    sub.lines_show()

  if _is_words_show:
    sub.words_show(words_limit)
  sub.show()

  if sub_type == 'word':
    sub.dumpData()

  print("")
  endDtime = datetime.now()
  print("End time: " + str(endDtime))
  elapsed = endDtime - startDtime
  print("Cost time: " + str(elapsed))