def test_html_scan(self):
    """Parse a remote HTML page and check the resulting counts.

    Runs ``self.test_dumpData()`` first, then loads the lexicon at
    ``self.pkl`` into a fresh ``Subtitle``, parses the page and hands the
    expected counts to ``self.sub_assert``.
    """
    page_url = 'https://selenium-python.readthedocs.org/en/latest/index.html'
    expected_counts = {
        'lex': 4156,
        'stem_lex': 3104,
        'words': 127,
        'stem_words': 71,
        'new_words': 32,
    }

    self.test_dumpData()

    scanner = Subtitle(loglevel=logging.DEBUG)
    scanner.setLexiconFile(self.pkl)
    scanner.loadOldData()
    scanner.addFile(page_url)
    scanner.parse()

    self.sub_assert(scanner, **expected_counts)
def test_txt_scan(self):
    """Parse a local .srt subtitle file and check counts and name handling.

    Runs ``self.test_dumpData()`` first, then loads the lexicon at
    ``self.pkl`` into a fresh ``Subtitle``, parses the subtitle file and
    hands the expected counts to ``self.sub_assert``. Afterwards it checks
    that the character names appear in ``nameSet`` and that their
    lower-cased forms are absent from ``newWords``.
    """
    srt_path = '../data/srt/Lord.of.War.eng.480p.SDHF-NORMTEAM.srt'
    expected_counts = {
        'lex': 4156,
        'stem_lex': 3104,
        'words': 1936,
        'stem_words': 943,
        'new_words': 518,
    }

    self.test_dumpData()

    scanner = Subtitle(loglevel=logging.DEBUG)
    scanner.setLexiconFile(self.pkl)
    scanner.loadOldData()
    scanner.addFile(srt_path)
    scanner.parse()

    self.sub_assert(scanner, **expected_counts)

    character_names = ['Yuri', 'Simeon']

    # Every name must be recognised as a name ...
    for name in character_names:
        assert name in scanner.nameSet

    # ... and must not also be reported as a new vocabulary word.
    for name in character_names:
        assert name.lower() not in scanner.newWords
def main(argv=None, logger=None):
    """Command-line entry point: parse options, scan the inputs, show results.

    Args:
        argv: Argument list handed straight to ``getopt.getopt`` (i.e. without
            the program name). Remaining positional arguments are passed to
            ``Subtitle.addFiles``.
        logger: Logger given to ``Subtitle``. When None, one is created with
            ``createLog(logname="subtitle", level=logging.INFO)``.

    Exits:
        Calls ``sys.exit(2)`` when option parsing fails or ``--limit`` is not
        an integer, and ``sys.exit()`` after ``--help``, ``--version``,
        ``--dir`` or an unsupported ``--type`` value.
    """
    if logger is None:
        logger = createLog(logname="subtitle", level=logging.INFO)

    start_time = datetime.now()
    print("Start time: " + str(start_time))
    print("")

    sub = Subtitle(logger)

    short_opts = "hvf:w:t:d:p:?lm:WDc"
    # Each long option must be its own list element: a missing comma between
    # "checkup" and "file=" silently merges them into "checkupfile=", which
    # breaks both --checkup and --file.
    long_opts = [
        "help", "version", "checkup",
        "file=", "word=", "type=", "dir=", "pickle=", "limit=",
    ]
    try:
        opts, args = getopt.getopt(argv, short_opts, long_opts)
        logger.info("opts:{0};args:{1}".format(opts, args))
    except getopt.GetoptError as msg:
        print("error happened when get options!!! error:{0}".format(msg))
        usage()
        logger.error("getopt.GetoptError:{0}, exit!".format(msg))
        sys.exit(2)
    except Exception as msg:
        logger.error("error:{0}, exit!".format(msg))
        sys.exit(2)

    show_lines = False
    show_words = False
    sub_type = ""
    words_limit = None

    for opt, arg in opts:
        if opt in ("-?", "-h", "--help"):
            usage()
            sys.exit()
        elif opt in ("-v", "--version"):
            version()
            sys.exit()
        elif opt in ("-c", "--checkup"):
            sub.checkup = True
        elif opt in ("-d", "--dir"):
            print("Sorry, -d --dir option still not offer")
            sys.exit()
        elif opt in ("-p", "--pickle"):
            sub.setLexiconFile(arg)
        elif opt in ("-f", "--file"):
            sub.addFile(arg)
        elif opt == "-D":
            logger.setLevel(logging.DEBUG)
            sub.setLogger(logger)
        elif opt in ("-w", "--word"):
            sub.addWord(arg)
        elif opt in ("-t", "--type"):
            sub_type = arg
            if sub_type not in ("word", "scan"):
                usage()
                sys.exit()
        elif opt in ("-m", "--limit"):
            # Report a bad limit like any other option error instead of
            # letting int() abort with a traceback.
            try:
                words_limit = int(arg)
            except ValueError:
                print("error happened when get options!!! error:"
                      "limit must be an integer, got {0!r}".format(arg))
                usage()
                logger.error("invalid limit:{0!r}, exit!".format(arg))
                sys.exit(2)
            # Giving a limit implies showing the words.
            show_words = True
        elif opt == "-l":
            show_lines = True
        elif opt == "-W":
            show_words = True

    # Fall back to the default lexicon file when -p/--pickle was not given.
    if sub.lexicon_path is None:
        sub.setLexiconFile("lexicon.pickle")

    sub.loadOldData()
    sub.addFiles(args)
    sub.parse()

    if show_lines:
        sub.lines_show()
    if show_words:
        sub.words_show(words_limit)
    sub.show()

    # Only the 'word' type writes the data back out.
    if sub_type == "word":
        sub.dumpData()

    print("")
    end_time = datetime.now()
    print("End time: " + str(end_time))
    print("Cost time: " + str(end_time - start_time))