def downloadAllPagesVideos(url):
    """Queue a download for every video linked from the pager pages of ``url``.

    The page at ``url`` is fetched through the module-level ``proxy`` and
    scanned for pager links.  Each linked page is fetched from
    ``http://so.youku.com`` and the combined text is searched for
    ``v.youku.com`` video URLs.  Every distinct URL is printed, logged and
    handed to ``downloadVideo`` on a ``ThreadPool`` of ``thread_count``
    workers; the call returns after ``joinAll()``.

    NOTE(review): only the pages reached through pager links are searched
    for videos; the page at ``url`` itself is used solely to discover those
    links -- confirm that this is intended.
    """
    print(url)
    first_page = getContent(url, None, proxy)

    # Pager anchors look like <a href="..." title='第N页' charset="...">N</a>.
    pager_links = re.findall(r'''<a href="(.*?)" title='第\d+页' charset=".*?">\d+</a>''', first_page)
    pages = [getContent('http://so.youku.com' + link.strip(), None, proxy)
             for link in pager_links]
    all_page_content = ''.join(pages)

    pool = ThreadPool(thread_count)

    # A set removes URLs that appear on more than one page.
    video_url_set = set(
        hit.strip()
        for hit in re.findall(r'''<a href="(http\://v\.youku\.com/v_show/id_.*?=\.html)"''', all_page_content)
    )

    for video_url in video_url_set:
        print(video_url)
        log(video_url)
        # The URL string itself is the task argument (not wrapped in a tuple).
        pool.queueTask(downloadVideo, video_url)

    pool.joinAll()
Ejemplo n.º 2
0
def convertFlv2Mp4underDir(path):
    """Extract an MP3 audio track from every file directly inside ``path``.

    If ``path`` does not exist it is created; if it exists but is not a
    directory a message is printed and the function returns.  For each
    regular file an ffmpeg command line writing ``<path>\\mp3\\<stem>.mp3``
    is printed and queued on a ``ThreadPool`` of 6 workers via ``run_cmd``,
    unless that output already exists.  The call returns after
    ``joinAll()``.

    MP4 conversion is currently disabled: an existing
    ``<path>\\mp4\\<stem>.mp4`` is still reported, but no MP4 command is
    queued.  Paths are built with Windows separators and a hard-coded
    ffmpeg location.

    NOTE(review): the ``mp3`` sub-directory is not created here --
    presumably it must already exist; confirm.
    """
    if not os.path.isdir(path):
        if os.path.exists(path):
            print("  Path:[" + path + "] is not a directory, exit!\n")
            return
        os.makedirs(path)

    pool = ThreadPool(6)

    # Kept for when MP4 conversion is re-enabled; not used at the moment.
    MP4_CMD = '''D:\\Program\\tools\\ffmpeg.exe -i "%s" -vcodec mpeg4 -b 1200kb -mbd 2 -aic 2 -cmp 2 -subcmp 2 -acodec libfaac -ac 2 -ab 128000 -y "%s"'''
    MP3_CMD = '''D:\\Program\\tools\\ffmpeg.exe -i "%s" -vn -ar 44100 -ac 2 -f mp3 "%s"'''

    for file_name in os.listdir(path):
        flv_path = path + '\\' + file_name
        if not os.path.isfile(flv_path):
            continue

        # splitext() leaves names without an extension intact; slicing at
        # rfind('.') would chop their last character (rfind returns -1).
        stem = os.path.splitext(file_name)[0]

        mp4_save_path = path + '\\mp4\\' + stem + '.mp4'
        if os.path.exists(mp4_save_path):
            print("  File:[" + mp4_save_path + "] already exists, pass.\n")
        # else: MP4 conversion disabled, nothing is queued.

        mp3_save_path = path + '\\mp3\\' + stem + '.mp3'
        if os.path.exists(mp3_save_path):
            print("  File:[" + mp3_save_path + "] already exists, pass.\n")
        else:
            cmd = MP3_CMD % (flv_path, mp3_save_path)
            print(cmd)
            # The command string itself is the task argument.
            pool.queueTask(run_cmd, cmd)

    pool.joinAll()
Ejemplo n.º 3
0
def convertWMA2MP3underDir(path):
    """Convert every ``.wma`` file directly inside ``path`` to MP3.

    If ``path`` does not exist it is created; if ``existFile`` reports
    that it exists but it is not a directory, a message is printed and the
    function returns.  For each regular file whose full path ends in
    ``.wma`` (case-insensitive) an ffmpeg command writing
    ``<path>\\<stem>.mp3`` is printed and queued on a ``ThreadPool`` of 6
    workers via ``run_cmd``, unless the MP3 already exists.  The source
    ``.wma`` files are not deleted.  The call returns after ``joinAll()``.
    """
    if not os.path.isdir(path):
        if not existFile(path):
            os.makedirs(path)
        else:
            print("  Path:[" + path + "] is not a directory, exit!\n")
            return

    pool = ThreadPool(6)

    MP3_CMD = '''ffmpeg.exe -i "%s" -f mp3 "%s"'''

    for file_name in os.listdir(path):
        wma_path = path + '\\' + file_name
        is_wma_file = os.path.isfile(wma_path) and wma_path.lower().endswith('.wma')
        if not is_wma_file:
            continue

        # The name is known to contain a dot here, so rpartition() yields
        # everything before the last dot.
        stem = file_name.rpartition('.')[0]
        mp3_save_path = path + '\\' + stem + '.mp3'
        if os.path.exists(mp3_save_path):
            print("  File:[" + mp3_save_path + "] already exists, pass.\n")
            continue

        convert_cmd = MP3_CMD % (wma_path, mp3_save_path)
        print(convert_cmd)
        # The command string itself is the task argument.
        pool.queueTask(run_cmd, convert_cmd)

    pool.joinAll()
Ejemplo n.º 4
0
def getSongsFromHTML(htmlcontent, save_path):
    global thread_count

    pool = ThreadPool(thread_count)

    matched_groups = re.findall("""W[LS]\("(\d+)",\s*"(\d+)",\s*"(.*?)\s+",""", htmlcontent)
    for matched in matched_groups:
        print "-" * 2, matched
        order = matched[0].strip()
        song_id = matched[1].strip()
        song_name = matched[2].strip()
        # getSong(song_id, order, save_path)
        pool.queueTask(getSongThread, (song_id, order, save_path))

    pool.joinAll()
Ejemplo n.º 5
0
def aggregate_all(client, iterator, connection_factory):
    """
    Aggregate every feed produced by ``iterator``.

    ``iterator`` must yield two-element pairs ``(feed_url, categories)``.
    Each pair is queued on a ``ThreadPool`` of 10 workers; every worker gets
    its own connection from ``connection_factory()``, stored on the worker
    as ``hbase`` and passed to ``aggregate`` as the first argument.  The
    call returns after ``joinAll()``.  ``client`` is not used here.
    """

    def init_worker(worker):
        # Give each pool thread a private connection.
        worker.hbase = connection_factory()
        return worker

    def run_aggregate(worker, feed_and_categories):
        return aggregate(worker.hbase, *feed_and_categories)

    pool = ThreadPool(10, thread_init=init_worker)
    for feed, categs in iterator:
        pool.queueTask(run_aggregate, (feed, categs))
    pool.joinAll()
Ejemplo n.º 6
0
def downloadFirstVideo(url):
    global proxy, host, thread_count
    print url
    htmlcontent = getContent(url, None, proxy)

    pool = ThreadPool(thread_count)

    matched_groups = re.findall('''class=list>(.*?)</a>&nbsp;<a title=".*?" href="http://www.cctv.com/video/(.*?).shtml" target="_blank">''', htmlcontent)
    for matched in matched_groups:
        #print matched.strip()
        video_title = matched[0].strip()
        video_url = matched[1].strip()
        video_url = 'http://v.cctv.com/flash/'+video_url+'.flv'
        print video_title, '-', video_url
        log(video_url)
        pool.queueTask(downloadVideoThread, (video_url, video_title))
        break

    pool.joinAll()
Ejemplo n.º 7
0
def downloadSpaceVideos(url):
    global proxy, host, thread_count
    print url
    htmlcontent = getContent(url, None, proxy)

    pool = ThreadPool(thread_count)

    #video_url_set = set()
    matched_groups = re.findall('''src="(.*?)" alt=".*?" title="(.*?)"/>''', htmlcontent)
    for matched in matched_groups:
        #print matched.strip()
        video_title = matched[1].strip()
        video_url = matched[0].strip()
        video_url = video_url.replace('image', 'flash').replace('jpg', 'flv')
        #video_url_set.add((video_url)

        print video_title, '-', video_url
        log(video_url)
        pool.queueTask(downloadVideoThread, (video_url, video_title))
    pool.joinAll()
def downloadAllVideos(url):
    """Queue a download for every video linked from the page at ``url``.

    The page is fetched through the module-level ``proxy`` and searched for
    ``v.youku.com`` video URLs.  Every distinct URL is printed, logged and
    handed to ``downloadVideo`` on a new ``ThreadPool`` of ``thread_count``
    workers.  That pool is stored in the module-level ``pool``.  The call
    returns after ``joinAll()``.
    """
    global pool

    print(url)
    page = getContent(url, None, proxy)

    pool = ThreadPool(thread_count)

    links = re.findall(r'''<a href="(http\://v\.youku\.com/v_show/id_.*?=\.html)"''', page)
    # A set removes URLs that are linked more than once.
    video_url_set = set(link.strip() for link in links)

    for video_url in video_url_set:
        print(video_url)
        log(video_url)
        # The URL string itself is the task argument (not wrapped in a tuple).
        pool.queueTask(downloadVideo, video_url)

    pool.joinAll()
class FilesystemMonitor(object):
    """
    Keeps the searcher's database in step with the files below its root.

    Directories are registered with a ``WatchManager`` and walked on a
    ``ThreadPool``; every file or directory whose name matches an entry of
    the configured exclude list is skipped.
    """

    def __init__(self, searcher):
        """Start the notifier thread and compile the exclude list.

        ``searcher`` supplies the configuration, the current root and the
        database operations used by the other methods.
        """
        self.searcher = searcher

        self._thread_pool = ThreadPool(THREAD_POOL_WORKS)

        # Events from the watch manager are delivered to FileProcessEvent.
        self.watch_manager = WatchManager()
        self.notifier = ThreadedNotifier(self.watch_manager, FileProcessEvent(self))
        self.notifier.start()

        self._build_exclude_list()

    def _build_exclude_list(self):
        """Compile the ``EXCLUDE_LIST`` configuration value into regexes.

        Each entry is a literal name in which ``*`` matches any run of
        characters.  Every other character is matched literally, so
        entries such as ``c++`` or ``notes(1)`` are safe.  Blank entries
        are skipped.
        """
        log.info("[FileMonitor] Set Regexs for Ignore List")

        self._exclude_regexs = []
        for ignore in self.searcher.configuration.get_value("EXCLUDE_LIST"):
            ignore = ignore.strip()
            if not ignore:
                # A blank entry would compile to "^$" and exclude empty
                # names such as the basename of a path ending in a slash.
                continue
            # Escape everything first, then turn the escaped "*" back into
            # a wildcard.
            ignore = "^" + re.escape(ignore).replace("\\*", ".*") + "$"
            log.debug("[FileMonitor] Ignore Regex = %s", ignore)
            self._exclude_regexs.append(re.compile(ignore))

    def change_root(self, previous_root):
        """Drop all state for ``previous_root`` and index the current root."""
        self._thread_pool.clearTasks()

        wd = self.watch_manager.get_wd(previous_root)
        if wd:
            self.watch_manager.rm_watch(wd, rec=True)

        self.searcher.clear_database()
        self.add_directory(self.searcher.current_root)

    def add_directory(self, path):
        """
        Watch ``path`` and queue a walk of it, unless its name is excluded.
        """
        basename = os.path.basename(path)
        if self.validate(basename):
            self.watch_manager.add_watch(path, EVENT_MASK)
            self._thread_pool.queueTask(self.walk_directory, path)

    def add_file(self, path, name):
        """
        Add a single file to the database, unless its name is excluded.
        """
        if self.validate(name):
            self.searcher.add_file(path, name)

    def remove_file(self, path, name):
        """Remove a single file from the database."""
        self.searcher.remove_file(path, name)

    def remove_directory(self, path):
        """Remove a directory from the database."""
        self.searcher.remove_directory(path)

    def walk_directory(self, root):
        """
        Register the direct children of ``root``.

        Sub-directories go through ``add_directory`` (which queues their own
        walk), regular files through ``add_file``.  Symbolic links and
        entries that cannot be stat'ed are skipped.  Nothing is returned.
        """
        if os.path.isdir(root):
            for name in os.listdir(root):
                child = os.path.join(root, name)
                try:
                    # lstat, so that symlinks are not followed.
                    file_stat = os.lstat(child)
                except os.error:
                    continue

                if stat.S_ISDIR(file_stat.st_mode):
                    self.add_directory(child)
                elif not stat.S_ISLNK(file_stat.st_mode):
                    self.add_file(root, name)

    def finish(self):
        """Stop watching, stop the notifier and shut the thread pool down.

        Pending walk tasks are not waited for.
        """
        wd = self.watch_manager.get_wd(self.searcher.current_root)
        # The root may never have been watched (e.g. its name is excluded).
        if wd:
            self.watch_manager.rm_watch(wd, rec=True)
        self.notifier.stop()
        self._thread_pool.joinAll(waitForTasks=False)

    def validate(self, name):
        """Return False if ``name`` matches an exclude regex, else True."""
        for ignore_re in self._exclude_regexs:
            if ignore_re.match(name):
                log.debug("[WalkDirectoryThread] ##### Ignored %s #####", name)
                return False
        log.debug("[WalkDirectoryThread] # Passed %s", name)
        return True
Ejemplo n.º 10
0
 
 t_pool = ThreadPool(opts.thread)

 # Read sample files: every command-line argument is one sample, given as
 # a comma-separated list of replicate files.
 for m, sample_arg in enumerate(args):
     replist = sample_arg.split(',')
     for n, replicate_file in enumerate(replist):
         if not os.path.exists(replicate_file):
             sys.stderr.write('%d th replicate file of %d th sample (%s) doesn\'t exist' % (n+1,m+1,replicate_file))
             sys.exit(1)
     sample_list.append(SampleData(replist))

 # Pre-process the samples concurrently, one task per sample.
 for sample_data in sample_list:
     t_pool.queueTask(preprocess_samples, sample_data, None)

 t_pool.joinAll()

 if DEBUG:
     print("print chromosome order")
     # Use the last sample explicitly instead of the variable leaked by the
     # loop above, which is undefined when there are no samples.
     if sample_list:
         for chrom_name in sample_list[-1].chrom_order:
             print(chrom_name)

 # Debug purpose
 if DEBUG:
     for sample_data in sample_list:
         sample_data.output_debug_info()

 # Smoothing
 if opts.smooth:
     for sample_data in sample_list:
         # smooth_data takes a single (sample, window) tuple.
         smooth_data((sample_data,opts.smooth_window))
Ejemplo n.º 11
0
class FilesystemMonitor(object):
    """
    Keeps the searcher's database in step with the files below its root.

    Directories are registered with a ``WatchManager`` and walked on a
    ``ThreadPool``; every file or directory whose name matches an entry of
    the configured exclude list is skipped.
    """
    def __init__(self, searcher):
        """Start the notifier thread and compile the exclude list.

        ``searcher`` supplies the configuration, the current root and the
        database operations used by the other methods.
        """
        self.searcher = searcher

        self._thread_pool = ThreadPool(THREAD_POOL_WORKS)

        # Events from the watch manager are delivered to FileProcessEvent.
        self.watch_manager = WatchManager()
        self.notifier = ThreadedNotifier(self.watch_manager,
                                         FileProcessEvent(self))
        self.notifier.start()

        self._build_exclude_list()

    def _build_exclude_list(self):
        """Compile ``configuration.exclude_list`` into regexes.

        Each entry is a literal name in which ``*`` matches any run of
        characters.  Every other character is matched literally, so
        entries such as ``c++`` or ``notes(1)`` are safe.  Blank entries
        are skipped.
        """
        log.info("[FileMonitor] Set Regexs for Ignore List")

        self._exclude_regexs = []
        for ignore in self.searcher.configuration.exclude_list:
            ignore = ignore.strip()
            if not ignore:
                # A blank entry would compile to "^$" and exclude empty
                # names such as the basename of a path ending in a slash.
                continue
            # Escape everything first, then turn the escaped "*" back into
            # a wildcard.
            ignore = "^" + re.escape(ignore).replace("\\*", ".*") + "$"
            log.debug("[FileMonitor] Ignore Regex = %s", ignore)
            self._exclude_regexs.append(re.compile(ignore))

    def change_root(self, previous_root):
        """Drop all state for ``previous_root`` and index the current root."""
        self._thread_pool.clearTasks()

        wd = self.watch_manager.get_wd(previous_root)
        if wd:
            self.watch_manager.rm_watch(wd, rec=True)

        self.searcher.clear_database()
        self.add_directory(self.searcher.current_root)

    def add_directory(self, path):
        """
        Watch ``path`` and queue a walk of it, unless its name is excluded.
        """
        basename = os.path.basename(path)
        if self.validate(basename):
            self.watch_manager.add_watch(path, EVENT_MASK)
            self._thread_pool.queueTask(self.walk_directory, path)

    def add_file(self, path, name):
        """
        Add a single file to the database, unless its name is excluded.
        """
        if self.validate(name):
            self.searcher.add_file(path, name)

    def remove_file(self, path, name):
        """Remove a single file from the database."""
        self.searcher.remove_file(path, name)

    def remove_directory(self, path):
        """Remove a directory from the database."""
        self.searcher.remove_directory(path)

    def walk_directory(self, root):
        """
        Register the direct children of ``root``.

        Sub-directories go through ``add_directory`` (which queues their own
        walk), regular files through ``add_file``.  Symbolic links and
        entries that cannot be stat'ed are skipped.  Nothing is returned.
        """
        if os.path.isdir(root):
            for name in os.listdir(root):
                child = os.path.join(root, name)
                try:
                    # lstat, so that symlinks are not followed.
                    file_stat = os.lstat(child)
                except os.error:
                    continue

                if stat.S_ISDIR(file_stat.st_mode):
                    self.add_directory(child)
                elif not stat.S_ISLNK(file_stat.st_mode):
                    self.add_file(root, name)

    def finish(self):
        """Stop watching, stop the notifier and shut the thread pool down.

        Pending walk tasks are not waited for.
        """
        wd = self.watch_manager.get_wd(self.searcher.current_root)
        # The root may never have been watched (e.g. its name is excluded).
        if wd:
            self.watch_manager.rm_watch(wd, rec=True)
        self.notifier.stop()
        self._thread_pool.joinAll(waitForTasks=False)

    def validate(self, name):
        """Return False if ``name`` matches an exclude regex, else True."""
        for ignore_re in self._exclude_regexs:
            if ignore_re.match(name):
                log.debug("[WalkDirectoryThread] ##### Ignored %s #####", name)
                return False
        log.debug("[WalkDirectoryThread] # Passed %s", name)
        return True