def load_data(data_type, train_on_cv=False, cv_random_state=42, cv_fold=5, cv_index=0):
    if data_type == 'train':
        if train_on_cv:
            data = pickle_load(format_filename(PROCESSED_DATA_DIR, TRAIN_CV_DATA_TEMPLATE,
                                               random=cv_random_state, fold=cv_fold, index=cv_index))
        else:
            data = pickle_load(format_filename(PROCESSED_DATA_DIR, TRAIN_DATA_FILENAME))
    elif data_type == 'dev':
        if train_on_cv:
            data = pickle_load(format_filename(PROCESSED_DATA_DIR, DEV_CV_DATA_TEMPLATE,
                                               random=cv_random_state, fold=cv_fold, index=cv_index))
        else:
            data = pickle_load(format_filename(PROCESSED_DATA_DIR, DEV_DATA_FILENAME))
    elif data_type == 'test':
        data = pickle_load(format_filename(PROCESSED_DATA_DIR, TEST_DATA_FILENAME))
    else:
        raise ValueError('data type not understood: {}'.format(data_type))
    return data
def process_data(dataset: str, neighbor_sample_size: int, K: int):
    drug_vocab = {}
    entity_vocab = {}
    relation_vocab = {}

    read_entity2id_file(ENTITY2ID_FILE[dataset], drug_vocab, entity_vocab)

    examples_file = format_filename(PROCESSED_DATA_DIR, DRUG_EXAMPLE, dataset=dataset)
    examples = read_example_file(EXAMPLE_FILE[dataset], SEPARATOR[dataset], drug_vocab)
    print(len(examples))
    # each example contains positive and negative samples: [drug1, drug2, interaction]
    np.save(examples_file, examples)

    adj_entity, adj_relation = read_kg(KG_FILE[dataset], entity_vocab, relation_vocab,
                                       neighbor_sample_size)

    pickle_dump(format_filename(PROCESSED_DATA_DIR, DRUG_VOCAB_TEMPLATE, dataset=dataset),
                drug_vocab)
    pickle_dump(format_filename(PROCESSED_DATA_DIR, ENTITY_VOCAB_TEMPLATE, dataset=dataset),
                entity_vocab)
    pickle_dump(format_filename(PROCESSED_DATA_DIR, RELATION_VOCAB_TEMPLATE, dataset=dataset),
                relation_vocab)

    adj_entity_file = format_filename(PROCESSED_DATA_DIR, ADJ_ENTITY_TEMPLATE, dataset=dataset)
    np.save(adj_entity_file, adj_entity)
    print('Logging Info - Saved:', adj_entity_file)

    adj_relation_file = format_filename(PROCESSED_DATA_DIR, ADJ_RELATION_TEMPLATE, dataset=dataset)
    np.save(adj_relation_file, adj_relation)
    print('Logging Info - Saved:', adj_relation_file)

    cross_validation(K, examples, dataset, neighbor_sample_size)
def load_idx2cate():
    idx2cate1 = pickle_load(format_filename(PROCESSED_DATA_DIR, IDX2TOKEN_TEMPLATE, level='cate1'))
    idx2cate2 = pickle_load(format_filename(PROCESSED_DATA_DIR, IDX2TOKEN_TEMPLATE, level='cate2'))
    idx2cate3 = pickle_load(format_filename(PROCESSED_DATA_DIR, IDX2TOKEN_TEMPLATE, level='cate3'))
    return idx2cate1, idx2cate2, idx2cate3
def get(self, url):
    print_ = get_print(self.cw)
    if self._url:
        return self._url
    ydl = ytdl.YoutubeDL(cw=self.cw)
    try:
        info = ydl.extract_info(url)
    except Exception as e:
        ex = type(ytdl.get_extractor(url))(ydl)
        _download_info = getattr(ex, '_download_info', None)
        if _download_info is not None:
            vod_id = ex._match_id(url)
            info = _download_info(vod_id)
            print_(info)
        raise
    video_best = info['formats'][-1]
    video = video_best['url']
    ext = get_ext(video)
    self.title = info['title']
    id = info['display_id']
    if ext.lower() == '.m3u8':
        video = M3u8_stream(video, n_thread=4, alter=alter)
        ext = '.mp4'
    self.filename = format_filename(self.title, id, ext)
    self.url_thumb = info['thumbnail']
    self.thumb = BytesIO()
    downloader.download(self.url_thumb, buffer=self.thumb)
    self._url = video
    return self._url
def get(self, url):
    if self._url:
        return self._url
    ydl = ytdl.YoutubeDL()
    info = ydl.extract_info(url)

    # get best video
    fs = info['formats']
    fs = sorted(fs, key=lambda x: int(x['width']), reverse=True)
    f = fs[0]
    url_video = f['url']

    # thumb
    self.thumb_url = info['thumbnails'][0]['url']
    self.thumb = BytesIO()
    downloader.download(self.thumb_url, buffer=self.thumb)

    # m3u8
    print(f['protocol'])
    if 'm3u8' in f['protocol']:
        url_video = M3u8_stream(url_video, referer=url)

    # title & filename
    self.title = info['title']
    self.filename = format_filename(self.title, info['id'], '.mp4')

    self._url = url_video
    return self._url
def read_album(url, session=None):
    '''read_album'''
    soup = downloader.read_soup(url, session=session)
    id_album = re.find('/album/([0-9]+)', url, err='no album id')
    url_json = 'https://www.pornhub.com/album/show_album_json?album={}'.format(id_album)
    data = downloader.read_json(url_json, url, session=session)

    block = soup.find('div', class_='photoAlbumListBlock')
    href = block.a.attrs['href']
    id_ = re.find('/photo/([0-9]+)', href, err='no photo id')
    ids = [id_]
    while True:
        item = data[id_]
        id_ = item['next']
        if id_ in ids:
            break
        ids.append(id_)

    photos = []
    for id_ in ids:
        item = data[id_]
        img = item['img_large']
        referer = 'https://www.pornhub.com/photo/{}'.format(id_)
        photo = Photo(id_, img, referer)
        photos.append(photo)

    info = {}
    title = clean_title(soup.find('h1', class_='photoAlbumTitleV2').text)
    info['title'] = format_filename(title, 'album_{}'.format(id_album))
    info['photos'] = photos
    return info
def __init__(self, f, f_audio, info, session, referer, cw=None):
    self.f_audio = f_audio
    self.cw = cw
    self.title = title = info['title']
    self.id = info['id']
    self.url = f['url']
    self.artist = info.get('uploader')
    self.header = utils.capitalize(get_ie_key(info))
    self.session = session
    self.referer = referer

    self.url_thumb = info.get('thumbnail')
    self.thumb = BytesIO()
    if self.url_thumb:
        downloader.download(self.url_thumb, referer=referer, buffer=self.thumb, session=session)

    ext = get_ext_(self.url, session, referer)
    if not ext:
        print('empty ext')
        if f['_resolution']:
            ext = '.mp4'
        else:
            ext = '.mp3'

    if ext.lower() == '.m3u8':
        try:
            url = playlist2stream(self.url, referer, session=session, n_thread=4)
        except:
            url = M3u8_stream(self.url, referer=referer, session=session, n_thread=4)
        ext = '.mp4'
    else:
        url = self.url
    self.url = LazyUrl(referer, lambda x: url, self, pp=self.pp)
    self.filename = format_filename(title, self.id, ext, header=self.header)
def register_sensor(self):
    with open('./examples/sensor_example', 'r') as content_file:
        j_sensor = json.load(content_file)
    self.app.post('/register', data=json.dumps(j_sensor))
    sensor = models.Sensor.query.get(1)
    assert sensor.id == 1
    assert j_sensor['name'] in sensor.name
    assert j_sensor['type'] in sensor.type
    assert j_sensor['location'] == sensor.location
    assert sensor.alive == 1
    assert sensor.events.count() == 0
    assert isfile(views.schema_prefix + format_filename(sensor.type) + views.schema_suffix)
    with open(views.schema_prefix + format_filename(sensor.type) + views.schema_suffix, 'r') as content_file:
        assert json.dumps(j_sensor['event_definition']) in content_file.read()
def __init__(self, stream, referer, id, title, url_thumb):
    self.url = LazyUrl(referer, lambda x: stream, self)
    self.id = id
    self.title = title
    self.filename = format_filename(title, id, '.mp4')
    self.url_thumb = url_thumb
    self.thumb = BytesIO()
    downloader.download(url_thumb, buffer=self.thumb)
def __init__(self, url, url_thumb, referer, title, id, session):
    self.title = title
    self.filename = format_filename(title, id, '.mp4')
    self.url = LazyUrl(referer, lambda x: url, self)
    self.thumb = BytesIO()
    self.url_thumb = url_thumb
    downloader.download(url_thumb, buffer=self.thumb, session=session)
def load_data(dataset: str, data_type: str):
    if data_type == 'train':
        return np.load(format_filename(PROCESSED_DATA_DIR, TRAIN_DATA_TEMPLATE, dataset=dataset))
    elif data_type == 'dev':
        return np.load(format_filename(PROCESSED_DATA_DIR, DEV_DATA_TEMPLATE, dataset=dataset))
    elif data_type == 'test':
        return np.load(format_filename(PROCESSED_DATA_DIR, TEST_DATA_TEMPLATE, dataset=dataset))
    else:
        raise ValueError(f'`data_type` not understood: {data_type}')
def cv_split(train_data, dev_data, cate3_vocab, fold=5, balanced=True, random_state=42):
    def indexing_data(data, indices):
        part_data = {}
        for k in data.keys():
            part_data[k] = [data[k][i] for i in indices]
        return part_data

    all_data = {}
    for key in train_data.keys():
        all_data[key] = train_data[key] + dev_data[key]

    # some category in validation set is not in cate3_vocab
    cate3_id_list = [cate3_vocab.get(cate3, 0) for cate3 in all_data['cate3']]
    index_range = np.arange(len(all_data['id']))

    if balanced:
        kf = StratifiedKFold(n_splits=fold, shuffle=True, random_state=random_state)
    else:
        kf = KFold(n_splits=fold, shuffle=True, random_state=random_state)

    for idx, (train_index, dev_index) in enumerate(kf.split(index_range, cate3_id_list)):
        train_data_fold = indexing_data(all_data, train_index)
        dev_data_fold = indexing_data(all_data, dev_index)
        pickle_dump(format_filename(PROCESSED_DATA_DIR, TRAIN_CV_DATA_TEMPLATE,
                                    random=random_state, fold=fold, index=idx),
                    train_data_fold)
        pickle_dump(format_filename(PROCESSED_DATA_DIR, DEV_CV_DATA_TEMPLATE,
                                    random=random_state, fold=fold, index=idx),
                    dev_data_fold)
def get(self, url):
    print_ = get_print(self.cw)
    if self._url:
        return self._url
    ydl = ytdl.YoutubeDL(cw=self.cw)
    try:
        info = ydl.extract_info(url)
    except Exception as e:
        ex = type(ytdl.get_extractor(url))(ydl)
        _download_info = getattr(ex, '_download_info', None)
        if _download_info is not None:
            vod_id = ex._match_id(url)
            info = _download_info(vod_id)
            print_(info)
        if 'HTTPError 403' in str(e):
            raise errors.LoginRequired()
        raise

    def print_video(video):
        print_('[{}] [{}] [{}] {}'.format(video['format_id'], video.get('height'),
                                          video.get('tbr'), video['url']))

    videos = [video for video in info['formats'] if video.get('height')]
    videos = sorted(videos, key=lambda video: (video.get('height', 0), video.get('tbr', 0)),
                    reverse=True)
    for video in videos:
        print_video(video)

    for video in videos:
        if video.get('height', 0) <= get_resolution():  #3723
            video_best = video
            break
    else:
        video_best = videos[-1]
    print_video(video_best)

    video = video_best['url']
    ext = get_ext(video)
    self.title = info['title']
    id = info['display_id']

    if ext.lower() == '.m3u8':
        video = M3u8_stream(video, n_thread=4, alter=alter)
        ext = '.mp4'
    self.filename = format_filename(self.title, id, ext)

    self.url_thumb = info['thumbnail']
    self.thumb = BytesIO()
    downloader.download(self.url_thumb, buffer=self.thumb)
    self._url = video
    return self._url
def __init__(self, url, url_page, title, url_thumb):
    self._url = url
    self.url = LazyUrl(url_page, self.get, self)
    self.id = get_id(url_page)
    self.title = title
    self.filename = format_filename(title, self.id, '.mp4')
    f = IO()
    self.url_thumb = url_thumb
    downloader.download(url_thumb, buffer=f)
    self.thumb = f
def unregister_sensor(self):
    with open('./examples/sensor_id_example', 'r') as content_file:
        j_sensor = json.load(content_file)
    sensor_type = models.Sensor.query.get(j_sensor['id']).type
    self.app.post('/unregister', data=json.dumps(j_sensor))
    assert db.session.query(models.Sensor).count() == 0
    assert db.session.query(models.Event).count() > 0
    assert not isfile(views.schema_prefix + format_filename(sensor_type) + views.schema_suffix)
def print(self, file_name, file_content):
    new_file_name = format_filename(file_name)
    full_file_path = os.path.join(self.print_folder, new_file_name)

    # Save file content to a file
    with open(full_file_path, "wb") as file:
        file.write(file_content)

    # Print file (the "-r" option deletes the file after it has been submitted)
    print("Print file {}".format(file_name))
    subprocess.Popen(["/usr/bin/lpr", full_file_path, "-r"])
def get(self, url):
    if self._url:
        return self._url
    self.info = get_info(url)
    self.title = self.info['title']
    id = self.info['id']
    video_best = self.info['formats'][-1]
    self._url = video_best['url']
    ext = get_ext(self._url)
    self.filename = format_filename(self.title, id, ext)
    return self._url
def get(self, url):
    print_ = get_print(self.cw)
    if self._url:
        return self._url
    info = extract_info(url, self.cw)

    def print_video(video):
        print_(video)
        # print_('{}[{}] [{}] [{}] {}'.format('LIVE ', video['format_id'], video.get('height'), video.get('tbr'), video['url']))

    videos = [video for video in info['formats'] if video.get('height')]
    videos = sorted(videos, key=lambda video: (video.get('height', 0), video.get('tbr', 0)),
                    reverse=True)
    for video in videos:
        print_video(video)

    for video in videos:
        if video.get('height', 0) <= get_resolution():  #3723
            video_best = video
            break
    else:
        video_best = videos[-1]
    print_video(video_best)

    video = video_best['url']
    ext = get_ext(video)
    self.title = info['title']
    id = info['display_id']

    if self._live:
        video = utils.LiveStream(video, headers=video_best.get('http_headers'))
        ext = '.mp4'
    else:
        if ext.lower() == '.m3u8':
            video = M3u8_stream(video, n_thread=4, alter=alter)
            ext = '.mp4'
    self.filename = format_filename(self.title, id, ext)

    self.url_thumb = info['thumbnail']
    self.thumb = BytesIO()
    downloader.download(self.url_thumb, buffer=self.thumb)
    self._url = video
    return self._url
def get(self, url):
    if self._url_video:
        return self._url_video
    cw = self.cw
    print_ = get_print(cw)

    html = downloader.read_html(url)
    soup = Soup(html)

    embedUrl = extract('embedUrl', html, cw)
    if embedUrl:
        raise EmbedUrlError('[pandoratv] EmbedUrl: {}'.format(embedUrl))

    uid = extract('strLocalChUserId', html, cw)
    pid = extract('nLocalPrgId', html, cw)
    fid = extract('strFid', html, cw)
    resolType = extract('strResolType', html, cw)
    resolArr = extract('strResolArr', html, cw)
    vodSvr = extract('nVodSvr', html, cw)
    resols = extract('nInfo', html, cw)
    runtime = extract('runtime', html, cw)

    url_api = 'http://www.pandora.tv/external/getExternalApi/getVodUrl/'
    data = {
        'userId': uid,
        'prgId': pid,
        'fid': fid,
        'resolType': resolType,
        'resolArr': ','.join(map(str, resolArr)),
        'vodSvr': vodSvr,
        'resol': max(resols),
        'runtime': runtime,
        'tvbox': 'false',
        'defResol': 'true',
        'embed': 'false',
    }
    session = Session()
    r = session.post(url_api, headers={'Referer': url}, data=data)
    data = json.loads(r.text)
    self._url_video = data['src']

    self.title = soup.find('meta', {'property': 'og:description'})['content']
    ext = get_ext(self._url_video)
    self.filename = format_filename(self.title, pid, ext)

    self.url_thumb = soup.find('meta', {'property': 'og:image'})['content']
    self.thumb = BytesIO()
    downloader.download(self.url_thumb, buffer=self.thumb)
    return self._url_video
def __init__(self, session, info):
    self.session = session
    self.info = info
    self.url = info['url']
    self.title = info['title']
    self.ext = info['ext']
    self.id = info['id']
    self.fileName = format_filename(self.title, self.id, self.ext)
    self.url_thumb = info['thumbnail_url']
    print('thumb:', self.url_thumb)
    self.thumb = BytesIO()
    downloader.download(self.url_thumb, buffer=self.thumb)
def __init__(self, url):
    ydl = ytdl.YoutubeDL()
    info = ydl.extract_info(url)

    f = info['formats'][-1]
    url_video = f['url']
    self.url = LazyUrl(url, lambda _: url_video, self)

    self.url_thumb = info['thumbnails'][0]['url']
    self.thumb = BytesIO()
    downloader.download(self.url_thumb, buffer=self.thumb)

    self.title = info['title']
    ext = get_ext(url_video)
    self.filename = format_filename(self.title, info['id'], ext)
def cross_validation(K_fold, examples, dataset, neighbor_sample_size):
    subsets = dict()
    n_subsets = int(len(examples) / K_fold)
    remain = set(range(0, len(examples)))  # all example indices
    for i in reversed(range(0, K_fold - 1)):
        subsets[i] = random.sample(list(remain), n_subsets)
        remain = remain.difference(subsets[i])
    subsets[K_fold - 1] = remain

    aggregator_types = ['sum', 'concat', 'neigh']
    for t in aggregator_types:
        count = 1
        temp = {'dataset': dataset, 'aggregator_type': t,
                'avg_auc': 0.0, 'avg_acc': 0.0, 'avg_f1': 0.0, 'avg_aupr': 0.0}
        for i in reversed(range(0, K_fold)):
            test_d = examples[list(subsets[i])]
            val_d, test_data = train_test_split(test_d, test_size=0.5)
            train_d = []
            for j in range(0, K_fold):
                if i != j:
                    train_d.extend(examples[list(subsets[j])])
            train_data = np.array(train_d)
            train_log = train(
                kfold=count,
                dataset=dataset,
                train_d=train_data,
                dev_d=val_d,
                test_d=test_data,
                neighbor_sample_size=neighbor_sample_size,
                embed_dim=32,
                n_depth=2,
                l2_weight=1e-7,
                lr=2e-2,
                # lr=5e-3,
                optimizer_type='adam',
                batch_size=2048,
                aggregator_type=t,
                n_epoch=50,
                callbacks_to_add=['modelcheckpoint', 'earlystopping']
            )
            count += 1
            temp['avg_auc'] += train_log['test_auc']
            temp['avg_acc'] += train_log['test_acc']
            temp['avg_f1'] += train_log['test_f1']
            temp['avg_aupr'] += train_log['test_aupr']

        for key in temp:
            if key == 'aggregator_type' or key == 'dataset':
                continue
            temp[key] = temp[key] / K_fold

        write_log(format_filename(LOG_DIR, RESULT_LOG[dataset]), temp, 'a')
        print(f'Logging Info - {K_fold} fold result: avg_auc: {temp["avg_auc"]}, '
              f'avg_acc: {temp["avg_acc"]}, avg_f1: {temp["avg_f1"]}, avg_aupr: {temp["avg_aupr"]}')
def __init__(self, type, url, title, referer, p=0, multi_post=False):
    self.type = type
    self.url = LazyUrl(referer, lambda _: url, self)
    ext = get_ext(url)
    if ext.lower() == '.php':
        ext = '.mp4'
    if type == 'video':
        id_ = re.find('videos/([0-9a-zA-Z_-]+)', referer, err='no video id')
        self.filename = format_filename(title, id_, ext)  #4287
    elif type == 'image':
        name = '{}_p{}'.format(clean_title(title), p) if multi_post else p
        self.filename = '{}{}'.format(name, ext)
    else:
        raise NotImplementedError(type)
    self.title = title
def get(self, url):
    if self._url:
        return self._url
    m = re.search(PATTERN_VID, url)
    id = m.group('id')
    ext = '.mp4'
    self.title = id
    # self.filename = format_filename(self.title, id, ext)
    ydl = ytdl.YoutubeDL()
    info = ydl.extract_info(url)
    self._url = info['url']
    return self._url
def process_data(dataset: str, neighbor_sample_size: int):
    user_vocab = {}
    item_vocab = {}
    entity_vocab = {}
    relation_vocab = {}

    read_item2entity_file(ITEM2ENTITY_FILE[dataset], item_vocab, entity_vocab)
    train_data, dev_data, test_data = read_rating_file(RATING_FILE[dataset], SEPARATOR[dataset],
                                                       THRESHOLD[dataset], user_vocab, item_vocab)
    adj_entity, adj_relation = read_kg(KG_FILE[dataset], entity_vocab, relation_vocab,
                                       neighbor_sample_size)

    pickle_dump(format_filename(PROCESSED_DATA_DIR, USER_VOCAB_TEMPLATE, dataset=dataset),
                user_vocab)
    pickle_dump(format_filename(PROCESSED_DATA_DIR, ITEM_VOCAB_TEMPLATE, dataset=dataset),
                item_vocab)
    pickle_dump(format_filename(PROCESSED_DATA_DIR, ENTITY_VOCAB_TEMPLATE, dataset=dataset),
                entity_vocab)
    pickle_dump(format_filename(PROCESSED_DATA_DIR, RELATION_VOCAB_TEMPLATE, dataset=dataset),
                relation_vocab)

    train_data_file = format_filename(PROCESSED_DATA_DIR, TRAIN_DATA_TEMPLATE, dataset=dataset)
    np.save(train_data_file, train_data)
    print('Logging Info - Saved:', train_data_file)

    dev_data_file = format_filename(PROCESSED_DATA_DIR, DEV_DATA_TEMPLATE, dataset=dataset)
    np.save(dev_data_file, dev_data)
    print('Logging Info - Saved:', dev_data_file)

    test_data_file = format_filename(PROCESSED_DATA_DIR, TEST_DATA_TEMPLATE, dataset=dataset)
    np.save(test_data_file, test_data)
    print('Logging Info - Saved:', test_data_file)

    adj_entity_file = format_filename(PROCESSED_DATA_DIR, ADJ_ENTITY_TEMPLATE, dataset=dataset)
    np.save(adj_entity_file, adj_entity)
    print('Logging Info - Saved:', adj_entity_file)

    adj_relation_file = format_filename(PROCESSED_DATA_DIR, ADJ_RELATION_TEMPLATE, dataset=dataset)
    np.save(adj_relation_file, adj_relation)
    print('Logging Info - Saved:', adj_relation_file)
def get(self, url_page):
    if not self._url:
        id = get_id(url_page)
        html = downloader.read_html(url_page)
        soup = Soup(html, unescape=True)
        self.title = soup.find('title').text.replace('- XVIDEOS.COM', '').strip()
        url = re.find(r'''.setVideoHLS\(['"](.+?)['"]\)''', html)
        ext = get_ext(url)
        if ext.lower() == '.m3u8':
            url = playlist2stream(url, n_thread=5)
        url_thumb = soup.find('meta', {'property': 'og:image'}).attrs['content']
        self.thumb = BytesIO()
        downloader.download(url_thumb, buffer=self.thumb)
        self.filename = format_filename(self.title, id, '.mp4')
        self._url = url
    return self._url
def __init__(self, f, info):
    self.title = title = info['title']
    self.id = info['id']
    self.url = f['url']
    self.thumb = BytesIO()
    downloader.download(info['thumbnail'], buffer=self.thumb)
    ext = get_ext(self.url)
    if ext.lower() == '.m3u8':
        raise NotImplementedError('stream')
        # url = M3u8_stream(self.url, n_thread=4)
    else:
        url = self.url
    self.url = LazyUrl(self.url, lambda x: url, self)
    self.filename = format_filename(title, self.id, ext)
def get(self, url):
    if self._url is None:
        self.info = get_info(url)
        self.title = self.info['title']
        id = self.info['id']
        video_best = self.info['formats'][-1]
        self._url = video_best['url']
        ext = get_ext(self._url)
        self.filename = format_filename(self.title, id, ext)
        if isinstance(self._url, str) and 'referer=force' in self._url.lower():
            self._referer = self._url
        else:
            self._referer = url
    return self._url, self._referer
def get(self, url):
    if self._url:
        return self._url
    ydl = ytdl.YoutubeDL()
    info = ydl.extract_info(url)

    fs = [f for f in info['formats'] if f['ext'] == 'mp4']
    f = sorted(fs, key=lambda f: f['height'])[-1]
    self._url = f['url']

    self.thumb_url = info['thumbnails'][0]['url']
    self.thumb = IO()
    downloader.download(self.thumb_url, buffer=self.thumb)

    self.title = info['title']
    ext = get_ext(self._url)
    self.filename = format_filename(self.title, info['id'], ext)
    return self._url
def main(args) -> None:
    """
    This is the main pipeline to analyze barseq counts.
    """
    # Create the folder for the log file and barcode counts;
    # the folder name equals the experiment name.
    runner = Run(args)
    make_barseq_directories(runner)  # raises an error message if the folder already exists

    # Add file handler
    fh = logging.FileHandler(runner.log, mode="w")  # create a log file
    fh.setFormatter(logging.Formatter(
        "%(asctime)s - %(levelname)s - %(module)s - %(message)s",
        datefmt="%Y-%m-%d %H:%M"))
    logger.addHandler(fh)
    logger.info("***** Starting barseq *****")

    # Read barcodes from fasta files
    logger.info(f"Reading in barcodes from {runner.barcodes.name}")
    # barcodes = read_barcodes(runner.barcodes)  # old reader
    barcodes = read_barcodes_new(runner.barcodes)  # new reader

    # Process each sequencing file
    seq_files_list = sorted(os.listdir(runner.sequences))
    for seq_file in seq_files_list:
        if not seq_file.endswith(".DS_Store"):
            sample = format_filename(seq_file)
            logger.info(f"Counting Barcodes in {sample}")
            runner.sample_dict[sample + '_F'] = deepcopy(barcodes)
            runner.sample_dict[sample + '_R'] = deepcopy(barcodes)
            # Change cwd
            with Cd(runner.sequences):
                count_barcodes(seq_file, runner.sample_dict, [sample + '_F', sample + '_R'])

    # Write to output
    logger.info(f"Writing results to {runner.path}")
    write_output(runner.sample_dict, barcodes, runner)

    # Confirm completion of barseq
    logger.info("***** barseq is complete! *****")
def read(self):
    page = get_page(self.url)
    videos, info = get_videos(self.url, self.customWidget)
    if not videos:
        raise Exception('No videos')

    for video in videos:
        self.urls.append(video.url)

    thumb = BytesIO()
    downloader.download(info['url_thumb'], buffer=thumb)
    self.setIcon(thumb)

    title = info['title']
    if page is not None:
        title += (u'_p{}').format(page)
    title = format_filename(title, self.id_, '.mp4')[:-4]

    n = int(math.ceil(8.0 / len(videos)))
    self.customWidget.print_(('n_threads: {}').format(n))
    self.enableSegment(n_threads=n)

    self.title = title
def __init__(self, info, stream):
    self.info = info
    self.id = info['id']
    self.title = info['name']
    self.brand = info['brand']
    self.url = stream['url']
    self.url_thumb = info['poster_url']
    self.thumb = IO()
    downloader.download(self.url_thumb, buffer=self.thumb)
    ext = os.path.splitext(self.url.split('?')[0].split('#')[0])[1]
    if ext.lower() == '.m3u8':
        print('read m3u8:', self.url)
        ext = '.mp4'
        self.url = M3u8_stream(self.url, deco=decrypt, n_thread=4)
    else:
        size = downloader.get_size(self.url)
        if size <= 0:
            raise Exception('Size is 0')
    self.filename = format_filename('[{}] {}'.format(self.brand, self.title), self.id, ext)
def _process_log(self, started, current_log_file, previous_log_file, dump_file):
    """
    Read records from associated log files starting at `started` time and dump
    their statistics to `dump_file`.

    @param datetime started: timestamp for the beginning of the tracked period
    @param str current_log_file: path to access log file
    @param str previous_log_file: if in-place rotation of access logs is used,
        path to log file before current
    @param str dump_file: file to save aggregated data
    """
    file_processing_starts = datetime.datetime.now()

    # Reset all storages
    self.sm.reset(dump_file)

    # Save metadata
    self.sm.get('metadata').set(dump_file, 'daemon_invoked',
                                started.strftime('%Y-%m-%d %H:%M:%S'))
    self.sm.get('metadata').set(dump_file, 'daemon_version', 'v' + daemon_version)

    # Generate file names from a template and timestamps
    file_at_period_start, params_at_period_start = utils.format_filename(current_log_file,
                                                                         self.period_start)
    file_at_started, params_at_started = utils.format_filename(current_log_file, started)

    if not os.path.exists(file_at_started):
        logger.error('File %s is not found and will not be processed' % file_at_started)
    elif file_at_period_start != file_at_started and not os.path.exists(file_at_period_start):
        logger.error('File %s is not found and will not be processed' % file_at_period_start)
    else:
        # If the daemon has just started, it does not have an associated seek for the
        # input file and it has to be set to period_start
        if file_at_period_start not in self.seek:
            self.seek[file_at_period_start] = seek_utils.get_seek(
                file_at_period_start, self.period_start + params_at_period_start['ts'])

        if file_at_period_start == file_at_started:
            # All the records we are interested in are in the same file
            current_file_size = os.stat(file_at_started).st_size

            # Handle the situation when the log was rotated in-place between daemon
            # executions. In this situation we start reading the file from the beginning.
            cur_seek = self.seek[file_at_started]
            read_from_start = True if current_file_size < cur_seek or cur_seek == 0 else False
            if read_from_start and previous_log_file:
                replaced_file, params_at_replaced = utils.format_filename(previous_log_file, started)
                if not os.path.exists(replaced_file):
                    logger.error('File %s is not found and will not be processed' % replaced_file)
                else:
                    self.seek[replaced_file] = \
                        cur_seek if cur_seek > 0 else seek_utils.get_seek(
                            replaced_file, self.period_start + params_at_replaced['ts'])
                    self._parse_file(dump_file, replaced_file)
            self._parse_file(dump_file, file_at_started, read_from_start,
                             started + params_at_started['ts'])
        else:
            # First read the previous file to the end, then the current one from the beginning
            self._parse_file(dump_file, file_at_period_start)
            self._parse_file(dump_file, file_at_started, True, started + params_at_started['ts'])

    # Store execution time in metadata section of the report
    file_processing_ends = datetime.datetime.now()
    worked = file_processing_ends - file_processing_starts
    self.sm.get('metadata').set(dump_file, 'daemon_worked',
                                '%d.%d sec' % (worked.seconds, worked.microseconds / 10000))

    # Save report
    self.sm.dump(dump_file)