def do_train(args):
    """Run the interactive tagging session and retrain the model as it goes.

    The configuration is read from ``args.config`` (handed straight to
    ``ConfigParser.read_file``). Documents are pulled from the data store one
    at a time, shown in the edit shell with either their stored tags or the
    model's suggestions, and the action returned by the shell decides what
    happens next:

    * ``":prev"`` rewinds the data store,
    * ``":goto"`` jumps to the requested document index,
    * ``"save"`` records the score of the proposed tags against the saved
      ones, stores the saved tags and retrains the model whenever the
      current index is a multiple of ``training.retrain_every``.

    The loop ends when the data store is exhausted or the shell raises
    ``QuitException``.

    :param args: parsed command-line arguments; only ``args.config`` is read
    """
    # Load configuration
    config = ConfigParser()
    config.read_file(args.config)

    data = DataStore(config)

    # Create the CRF model.
    model = CRF(config)

    retrain_epochs = config["training"].getint("retrain_every")
    accuracy = []

    with EditShell(config) as shell:
        while data.has_next():
            sentence = data.next()
            position = data.i()

            # Rows that already carry a tag column keep their tags;
            # otherwise the model proposes them.
            already_tagged = len(sentence[0]) == DataStore.TAG_LABEL + 1
            if already_tagged:
                tags = [row[DataStore.TAG_LABEL] for row in sentence]
            else:
                tags = model.infer(sentence)

            try:
                words = ["{}".format(row[0]) for row in sentence]

                # The shell receives a copy so `tags` keeps the proposal
                # that is scored against the saved tags below.
                action = shell.run(
                    words,
                    list(tags),
                    metadata=render_progress(data, accuracy),
                )
                kind = action.type

                if kind == ":prev":
                    try:
                        # move 2 indices back (presumably because next()
                        # has already advanced past the current document)
                        data.rewind(2)
                    except AttributeError:
                        data.rewind(1)
                elif kind == ":goto":
                    (doc_idx,) = action.args
                    assert doc_idx >= 0
                    data.goto(doc_idx)
                elif kind == "save":
                    _, saved_tags = action.args
                    accuracy.append(score(tags, saved_tags))
                    data.update(sentence, saved_tags)

                    if position % retrain_epochs == 0:
                        model.retrain()
            except QuitException:
                break
class YTPlaylist:
    """Videos of one YouTube playlist, fetched page by page into a DataStore."""

    def __init__(self, yt_resource: Resource, playlist_id) -> None:
        """
        :param yt_resource: googleapiclient.discovery.Resource object
        :param playlist_id: youtube playlist ID
        """
        self._yt_resource = yt_resource
        self._playlist_id = playlist_id
        self._videos_store = DataStore()

    @staticmethod
    def _progress_fraction(page: int, page_info: dict) -> float:
        """Return the fraction of result pages fetched after page ``page``.

        An empty result set yields zero pages; that case is reported as
        complete (1.0) instead of dividing by zero.

        :param page: 1-based number of the page just processed
        :param page_info: the response's ``pageInfo`` mapping
        """
        per_page = page_info['resultsPerPage']
        total_pages = ceil(page_info['totalResults'] / per_page) if per_page else 0
        return page / total_pages if total_pages else 1.0

    def fetch_videos(self):
        """Fetch every page of the playlist and store each video title.

        Pages are requested until the response no longer carries a
        ``nextPageToken``; the progress bar is updated manually with the
        fraction of pages fetched so far.
        """
        with alive_bar(manual=True, bar='smooth', spinner='dots_reverse') as bar:
            next_page_token = None
            page = 1
            while True:
                response = self._yt_resource.playlistItems().list(
                    part='snippet',
                    playlistId=self._playlist_id,
                    pageToken=next_page_token,
                ).execute()

                for item in response['items']:
                    # NOTE(review): every video is stored with the literal
                    # id '#', not its real video id -- confirm this is
                    # intentional.
                    self._videos_store.update(title=item['snippet']['title'],
                                              id='#')

                bar(self._progress_fraction(page, response['pageInfo']))
                page += 1

                next_page_token = response.get('nextPageToken')
                if not next_page_token:
                    break
        print()

    def print_videos(self):
        """Print a heading followed by the stored videos."""
        print_heading('<< VIDEOS IN THE PLAYLIST >>')
        self._videos_store.print()

    def get_videos_serial(self):
        """Map each stored video title to its 1-based position in the store.

        When two videos share a title, the later position wins.

        :return: ``{title: serial_number}``
        """
        return {
            cache_unit['title']: serial
            for serial, cache_unit in enumerate(self._videos_store.list(), start=1)
        }
class YTChannel:
    """Playlists of one YouTube channel, fetched page by page into a DataStore."""

    def __init__(self, yt_resource: Resource, channel_id) -> None:
        """
        :param yt_resource: googleapiclient.discovery.Resource object
        :param channel_id: youtube channel ID
        """
        self._yt_resource = yt_resource
        self._channel_id = channel_id
        self._playlist_store = DataStore()

    @property
    def total_playlists(self):
        """Number of playlists currently held in the store."""
        return self._playlist_store.len

    @staticmethod
    def _progress_fraction(page: int, page_info: dict) -> float:
        """Return the fraction of result pages fetched after page ``page``.

        A channel without playlists yields zero pages; that case is reported
        as complete (1.0) instead of dividing by zero.

        :param page: 1-based number of the page just processed
        :param page_info: the response's ``pageInfo`` mapping
        """
        per_page = page_info['resultsPerPage']
        total_pages = ceil(page_info['totalResults'] / per_page) if per_page else 0
        return page / total_pages if total_pages else 1.0

    def fetch_playlists(self):
        """Fetch every page of the channel's playlists and store title and id.

        Pages are requested until the response no longer carries a
        ``nextPageToken``; the progress bar is updated manually with the
        fraction of pages fetched so far.
        """
        with alive_bar(manual=True, bar='smooth', spinner='dots_reverse') as bar:
            next_page_token = None
            page = 1
            while True:
                response = self._yt_resource.playlists().list(
                    part='snippet',
                    channelId=self._channel_id,
                    pageToken=next_page_token,
                ).execute()

                for item in response['items']:
                    self._playlist_store.update(title=item['snippet']['title'],
                                                id=item['id'])

                bar(self._progress_fraction(page, response['pageInfo']))
                page += 1

                next_page_token = response.get('nextPageToken')
                if not next_page_token:
                    break
        print()

    def print_playlists(self):
        """Print a heading followed by the stored playlists."""
        print_heading('<< PLAYLISTS >>')
        self._playlist_store.print()

    def select_playlist(self, playlist_no):
        """Return the stored playlist entry for a 1-based serial number.

        :param playlist_no: 1-based position of the playlist in the store
        """
        return self._playlist_store.list()[playlist_no - 1]
class Youtube:
    """Channel search and subscriber-count lookup through the YouTube API."""

    def __init__(self, yt_resource: Resource) -> None:
        """
        :param yt_resource: googleapiclient.discovery.build object
        """
        self._yt_resource = yt_resource
        self._channel_store = DataStore()
        # One entry per stored channel, filled by _fetch_subscribers_of_all().
        self._subscribers = []

    @property
    def total_channels(self):
        """Number of channels currently held in the store."""
        return self._channel_store.len

    def search_channel(self, query: str) -> None:
        """Search channels matching ``query`` and add the hits to the store.

        At most 9 results are requested. Nothing is returned; the results
        are available through the channel store.

        :param query: free-text search query
        """
        yt_response = self._yt_resource.search().list(
            part='snippet',
            type='channel',
            maxResults=9,
            q=query,
        ).execute()

        for item in yt_response['items']:
            snippet = item['snippet']
            self._channel_store.update(title=snippet['title'],
                                       id=snippet['channelId'])

    def print_channels(self):
        """Print every stored channel followed by its subscriber count."""
        channel_output = capture_stdout(lambda: self._channel_store.print())
        self._fetch_subscribers_of_all()

        print_heading('# Channel (Subscribers)')
        for index, out in enumerate(channel_output):
            print(f'{out} ({self._subscribers[index]})')

    def select_channel(self, channel_no: int):
        """Return the stored channel entry for a 1-based serial number.

        :param channel_no: 1-based position of the channel in the store
        """
        return self._channel_store.list()[channel_no - 1]

    def subscriber_count(self, channel_id):
        """Return the channel's subscriber count in shortened form.

        :param channel_id: youtube channel ID
        :return: ``None`` when the channel hides its subscriber count;
            otherwise a string such as ``'1.23M'`` or ``'12.3K'``, or the
            plain ``int`` when the count has three digits or fewer
        """

        def shorten_M_K(subs: str):
            # Scale by the digit count and truncate (not round) the decimal
            # representation so the visible part stays short.
            length = len(str(subs))
            subs = int(subs)
            if length > 6:
                text = str(subs / 10**6)
                # 9+ digits: keep only the integer part of the millions.
                return (text[:length - 6] if length >= 9 else text[:4]) + 'M'
            if length > 3:
                text = str(subs / 10**3)
                # 6 digits: keep only the integer part of the thousands.
                return (text[:3] if length == 6 else text[:4]) + 'K'
            return subs

        # self._yt_resource does not work when used directly from multiple
        # threads. Problem not known.
        # Error: malloc(): unsorted double linked list corrupted
        # Using a deepcopy works fine.
        yt_response = deepcopy(self._yt_resource).channels().list(
            part='statistics',
            id=channel_id,
        ).execute()

        statistics = yt_response['items'][0]['statistics']
        if statistics['hiddenSubscriberCount']:
            return None
        return shorten_M_K(statistics['subscriberCount'])

    def _fetch_subscribers_of_all(self):
        """Fill ``self._subscribers`` with one count per stored channel.

        One thread is started per channel; every thread writes its result
        into its own slot of the list and advances the progress bar. The
        method returns once all threads have finished.
        """

        def get_subs(channel_id, index, progress_bar):
            subs = self.subscriber_count(channel_id)
            # Only a missing count means "hidden"; a channel with exactly
            # 0 subscribers yields the falsy integer 0 and must be kept.
            if subs is None:
                subs = 'Hidden'
            self._subscribers[index] = subs
            progress_bar()

        self._subscribers = [None] * self._channel_store.len

        with alive_bar(total=len(self._subscribers), bar='smooth',
                       spinner='dots_reverse') as bar:
            threads = []
            for index, channel_cache_unit in enumerate(
                    self._channel_store.list()):
                thread = Thread(target=get_subs,
                                args=(channel_cache_unit['id'], index, bar))
                threads.append(thread)
                thread.start()

            for thread in threads:
                thread.join()
class CacheManager:
    """Keep the local playlist, local channel and shared channel caches.

    Each cache is a DataStore backed by a JSON file: ``playlist.json`` and
    ``channel.json`` under ``config['local_cache']`` and ``channel.json``
    under ``config['shared_cache']``.
    """

    def __init__(self, config) -> None:
        """Create the cache folders if needed and load the existing caches.

        :param config: mapping providing ``local_cache``, ``shared_cache``
            and ``cache_indent``
        """
        self.__local_playlist_cache_path = os.path.join(
            config['local_cache'], 'playlist.json')
        self.__local_channel_cache_path = os.path.join(
            config['local_cache'], 'channel.json')
        self.__shared_channel_cache_path = os.path.join(
            config['shared_cache'], 'channel.json')
        self.__indent = config['cache_indent']

        self._local_playlist_cache = DataStore()
        self._local_channel_cache = DataStore()
        self._shared_channel_cache = DataStore()

        self.create_folder(config['local_cache'])
        self.create_folder(config['shared_cache'])
        self.load()

    @property
    def local_playlist_cache(self):
        return self._local_playlist_cache

    @property
    def local_channel_cache(self):
        return self._local_channel_cache

    @property
    def shared_channel_cache(self):
        return self._shared_channel_cache

    @staticmethod
    def _create_cache(cache_unit: dict):
        """Return a fresh DataStore holding only ``cache_unit``."""
        local_cache = DataStore()
        local_cache.update(cache_unit['title'], cache_unit['id'])
        return local_cache

    def update_playlist_cache(self, playlist_cache_unit: dict):
        self.update_local_playlist_cache(playlist_cache_unit)

    def update_channel_cache(self, channel_cache_unit: dict):
        self.update_local_channel_cache(channel_cache_unit)
        self.update_shared_channel_cache(channel_cache_unit)

    def update_local_playlist_cache(self, playlist_cache_unit: dict):
        # The local cache is replaced, not extended: it holds one entry.
        self._local_playlist_cache = self._create_cache(playlist_cache_unit)

    def update_local_channel_cache(self, channel_cache_unit: dict):
        # The local cache is replaced, not extended: it holds one entry.
        self._local_channel_cache = self._create_cache(channel_cache_unit)

    def update_shared_channel_cache(self, channel_cache_unit: dict):
        # Unlike the local caches, the shared cache is updated in place.
        self._shared_channel_cache.update(channel_cache_unit['title'],
                                          channel_cache_unit['id'])

    @staticmethod
    def _load_into(cache, path):
        """Load ``path`` into ``cache`` when the file yields any content."""
        retrieved = DataStore.retrieve_from_file(path)
        if retrieved:
            cache.load(retrieved)

    def load(self):
        """Load all three caches from their files."""
        self.load_local_playlist_cache()
        self.load_local_channel_cache()
        self.load_shared_channel_cache()

    def load_local_playlist_cache(self):
        self._load_into(self._local_playlist_cache,
                        self.__local_playlist_cache_path)

    def load_local_channel_cache(self):
        self._load_into(self._local_channel_cache,
                        self.__local_channel_cache_path)

    def load_shared_channel_cache(self):
        self._load_into(self._shared_channel_cache,
                        self.__shared_channel_cache_path)

    @staticmethod
    def create_folder(folder):
        """Create ``folder`` (and any missing parents) if it does not exist.

        ``exist_ok=True`` avoids the race between checking for the folder
        and creating it.
        """
        os.makedirs(folder, exist_ok=True)

    def is_local_playlist_cache_available(self):
        return self._local_playlist_cache.len > 0

    def is_local_channel_cache_available(self):
        return self._local_channel_cache.len > 0

    def is_shared_channel_cache_available(self):
        return self._shared_channel_cache.len > 0

    def _dump(self, path, cache_list: list):
        """Write ``cache_list`` to ``path`` as JSON with the configured indent."""
        with open(path, 'w', encoding='utf-8') as f:
            json.dump(cache_list, f, indent=self.__indent)

    def save(self):
        """Write all three caches to their files."""
        self.save_local_playlist_cache()
        self.save_local_channel_cache()
        self.save_shared_channel_cache()

    def save_local_playlist_cache(self):
        self._dump(self.__local_playlist_cache_path,
                   self.local_playlist_cache.list())

    def save_local_channel_cache(self):
        self._dump(self.__local_channel_cache_path,
                   self.local_channel_cache.list())

    def save_shared_channel_cache(self):
        self._dump(self.__shared_channel_cache_path,
                   self.shared_channel_cache.list())

    @staticmethod
    def _remove_quietly(path):
        """Delete ``path`` on a best-effort basis.

        Filesystem errors (missing file, permissions, ...) are ignored;
        anything else, such as KeyboardInterrupt, still propagates.
        """
        try:
            os.remove(path)
        except OSError:
            pass

    def delete_local_playlist_cache(self):
        """Empty the local playlist cache and remove its file."""
        self._local_playlist_cache.purge()
        self._remove_quietly(self.__local_playlist_cache_path)

    def delete_local_channel_cache(self):
        """Empty the local channel cache and remove its file."""
        self._local_channel_cache.purge()
        self._remove_quietly(self.__local_channel_cache_path)
def _create_cache(cache_unit: dict):
    """Build a new DataStore that contains only the given cache unit.

    :param cache_unit: mapping with the keys ``'title'`` and ``'id'``
    :return: the freshly created DataStore
    """
    store = DataStore()
    title, unit_id = cache_unit['title'], cache_unit['id']
    store.update(title, unit_id)
    return store