def load_data(data_type,
              train_on_cv=False,
              cv_random_state=42,
              cv_fold=5,
              cv_index=0):
    if data_type == 'train':
        if train_on_cv:
            data = pickle_load(
                format_filename(PROCESSED_DATA_DIR,
                                TRAIN_CV_DATA_TEMPLATE,
                                random=cv_random_state,
                                fold=cv_fold,
                                index=cv_index))
        else:
            data = pickle_load(
                format_filename(PROCESSED_DATA_DIR, TRAIN_DATA_FILENAME))
    elif data_type == 'dev':
        if train_on_cv:
            data = pickle_load(
                format_filename(PROCESSED_DATA_DIR,
                                DEV_CV_DATA_TEMPLATE,
                                random=cv_random_state,
                                fold=cv_fold,
                                index=cv_index))
        else:
            data = pickle_load(
                format_filename(PROCESSED_DATA_DIR, DEV_DATA_FILENAME))
    elif data_type == 'test':
        data = pickle_load(
            format_filename(PROCESSED_DATA_DIR, TEST_DATA_FILENAME))
    else:
        raise ValueError('data type not understood: {}'.format(data_type))
    return data
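A minimal usage sketch (hypothetical call sites; it assumes the corresponding pickles were already written, e.g. by cv_split in Example 12 below):

# Hypothetical calls; the processed pickles must already exist on disk.
train_data = load_data('train')                # full training split
dev_fold = load_data('dev', train_on_cv=True,
                     cv_random_state=42,
                     cv_fold=5, cv_index=0)    # dev half of the first CV fold
test_data = load_data('test')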
Example 2
def process_data(dataset: str, neighbor_sample_size: int, K: int):
    drug_vocab = {}
    entity_vocab = {}
    relation_vocab = {}

    read_entity2id_file(ENTITY2ID_FILE[dataset], drug_vocab, entity_vocab)

    pickle_dump(
        format_filename(PROCESSED_DATA_DIR,
                        DRUG_VOCAB_TEMPLATE,
                        dataset=dataset), drug_vocab)
    pickle_dump(
        format_filename(PROCESSED_DATA_DIR,
                        ENTITY_VOCAB_TEMPLATE,
                        dataset=dataset), entity_vocab)

    examples_file = format_filename(PROCESSED_DATA_DIR,
                                    DRUG_EXAMPLE,
                                    dataset=dataset)
    examples = read_example_file(EXAMPLE_FILE[dataset], SEPARATOR[dataset],
                                 drug_vocab)
    print(len(examples))
    # examples contain positive and negative samples
    # each example is [drug1, drug2, interaction]
    np.save(examples_file, examples)

    adj_entity_file = format_filename(PROCESSED_DATA_DIR,
                                      ADJ_ENTITY_TEMPLATE,
                                      dataset=dataset)
    adj_relation_file = format_filename(PROCESSED_DATA_DIR,
                                        ADJ_RELATION_TEMPLATE,
                                        dataset=dataset)

    adj_entity, adj_relation = read_kg(KG_FILE[dataset], entity_vocab,
                                       relation_vocab, neighbor_sample_size)

    pickle_dump(
        format_filename(PROCESSED_DATA_DIR,
                        DRUG_VOCAB_TEMPLATE,
                        dataset=dataset), drug_vocab)
    pickle_dump(
        format_filename(PROCESSED_DATA_DIR,
                        ENTITY_VOCAB_TEMPLATE,
                        dataset=dataset), entity_vocab)
    pickle_dump(
        format_filename(PROCESSED_DATA_DIR,
                        RELATION_VOCAB_TEMPLATE,
                        dataset=dataset), relation_vocab)
    np.save(adj_entity_file, adj_entity)
    print('Logging Info - Saved:', adj_entity_file)

    np.save(adj_relation_file, adj_relation)
    print('Logging Info - Saved:', adj_relation_file)
    cross_validation(K, examples, dataset, neighbor_sample_size)
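A hedged driver sketch; the dataset key 'kegg' is an assumption, and ENTITY2ID_FILE, EXAMPLE_FILE and KG_FILE must be configured for it:

# Hypothetical invocation: builds vocabs, examples and KG adjacency,
# then runs 5-fold cross-validation over the three aggregator types.
process_data('kegg', neighbor_sample_size=4, K=5)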
Example 3
def load_idx2cate():
    idx2cate1 = pickle_load(
        format_filename(PROCESSED_DATA_DIR, IDX2TOKEN_TEMPLATE, level='cate1'))
    idx2cate2 = pickle_load(
        format_filename(PROCESSED_DATA_DIR, IDX2TOKEN_TEMPLATE, level='cate2'))
    idx2cate3 = pickle_load(
        format_filename(PROCESSED_DATA_DIR, IDX2TOKEN_TEMPLATE, level='cate3'))
    return idx2cate1, idx2cate2, idx2cate3
Example 4
    def get(self, url):
        print_ = get_print(self.cw)
        if self._url:
            return self._url
        ydl = ytdl.YoutubeDL(cw=self.cw)
        try:
            info = ydl.extract_info(url)
        except Exception as e:
            ex = type(ytdl.get_extractor(url))(ydl)
            _download_info = getattr(ex, '_download_info', None)
            if _download_info is not None:
                vod_id = ex._match_id(url)
                info = _download_info(vod_id)
                print_(info)
            raise
        video_best = info['formats'][-1]
        video = video_best['url']

        ext = get_ext(video)
        self.title = info['title']
        id = info['display_id']

        if ext.lower() == '.m3u8':
            video = M3u8_stream(video, n_thread=4, alter=alter)
            ext = '.mp4'
        self.filename = format_filename(self.title, id, ext)
        self.url_thumb = info['thumbnail']
        self.thumb = BytesIO()
        downloader.download(self.url_thumb, buffer=self.thumb)
        self._url = video
        return self._url
Example 5
    def get(self, url):
        if self._url:
            return self._url

        ydl = ytdl.YoutubeDL()
        info = ydl.extract_info(url)

        # get best video
        fs = info['formats']
        fs = sorted(fs, key=lambda x: int(x['width']), reverse=True)
        f = fs[0]
        url_video = f['url']

        # thumb
        self.thumb_url = info['thumbnails'][0]['url']
        self.thumb = BytesIO()
        downloader.download(self.thumb_url, buffer=self.thumb)

        # m3u8
        print(f['protocol'])
        if 'm3u8' in f['protocol']:
            url_video = M3u8_stream(url_video, referer=url)

        # title & filename
        self.title = info['title']
        self.filename = format_filename(self.title, info['id'], '.mp4')

        self._url = url_video

        return self._url
Example 6
def read_album(url, session=None):
    '''
    Read an album page: walk the album JSON to collect photo ids, then
    return a dict with a formatted title and the list of Photo objects.
    '''
    soup = downloader.read_soup(url, session=session)
    id_album = re.find('/album/([0-9]+)', url, err='no album id')
    url_json = 'https://www.pornhub.com/album/show_album_json?album={}'.format(id_album)
    data = downloader.read_json(url_json, url, session=session)
    block = soup.find('div', class_='photoAlbumListBlock')
    href = block.a.attrs['href']
    id_ = re.find('/photo/([0-9]+)', href, err='no photo id')
    ids = [id_]
    while True:
        item = data[id_]
        id_ = item['next']
        if id_ in ids:
            break
        ids.append(id_)

    photos = []
    for id_ in ids:
        item = data[id_]
        img = item['img_large']
        referer = 'https://www.pornhub.com/photo/{}'.format(id_)
        photo = Photo(id_, img, referer)
        photos.append(photo)

    info = {}
    title = clean_title(soup.find('h1', class_='photoAlbumTitleV2').text)
    info['title'] = format_filename(title, 'album_{}'.format(id_album))
    info['photos'] = photos
    return info
Example 7
    def __init__(self, f, f_audio, info, session, referer, cw=None):
        self.f_audio = f_audio
        self.cw = cw
        self.title = title = info['title']
        self.id = info['id']
        self.url = f['url']
        self.artist = info.get('uploader')
        self.header = utils.capitalize(get_ie_key(info))
        self.session = session
        self.referer = referer

        self.url_thumb = info.get('thumbnail')
        self.thumb = BytesIO()
        if self.url_thumb:
            downloader.download(self.url_thumb, referer=referer, buffer=self.thumb, session=session)

        ext = get_ext_(self.url, session, referer)

        if not ext:
            print('empty ext')
            if f['_resolution']:
                ext = '.mp4'
            else:
                ext = '.mp3'

        if ext.lower() == '.m3u8':
            try:
                url = playlist2stream(self.url, referer, session=session, n_thread=4)
            except Exception:
                url = M3u8_stream(self.url, referer=referer, session=session, n_thread=4)
            ext = '.mp4'
        else:
            url = self.url
        self.url = LazyUrl(referer, lambda x: url, self, pp=self.pp)
        self.filename = format_filename(title, self.id, ext, header=self.header)
Example 8
    def register_sensor(self):
        with open('./examples/sensor_example', 'r') as content_file:
            j_sensor = json.load(content_file)

        self.app.post('/register', data=json.dumps(j_sensor))
        sensor = models.Sensor.query.get(1)

        assert sensor.id == 1
        assert j_sensor['name'] in sensor.name
        assert j_sensor['type'] in sensor.type
        assert j_sensor['location'] == sensor.location
        assert sensor.alive == 1
        assert sensor.events.count() == 0

        assert isfile(views.schema_prefix + format_filename(sensor.type) + views.schema_suffix)
        with open(views.schema_prefix + format_filename(sensor.type) + views.schema_suffix, 'r') as content_file:
            assert json.dumps(j_sensor['event_definition']) in content_file.read()
Example 9
    def __init__(self, stream, referer, id, title, url_thumb):
        self.url = LazyUrl(referer, lambda x: stream, self)
        self.id = id
        self.title = title
        self.filename = format_filename(title, id, '.mp4')
        self.url_thumb = url_thumb
        self.thumb = BytesIO()
        downloader.download(url_thumb, buffer=self.thumb)
Example 10
    def __init__(self, url, url_thumb, referer, title, id, session):
        self.title = title
        self.filename = format_filename(title, id, '.mp4')
        self.url = LazyUrl(referer, lambda x: url, self)

        self.thumb = BytesIO()
        self.url_thumb = url_thumb
        downloader.download(url_thumb, buffer=self.thumb, session=session)
Example 11
def load_data(dataset: str, data_type: str):
    if data_type == 'train':
        return np.load(
            format_filename(PROCESSED_DATA_DIR,
                            TRAIN_DATA_TEMPLATE,
                            dataset=dataset))
    elif data_type == 'dev':
        return np.load(
            format_filename(PROCESSED_DATA_DIR,
                            DEV_DATA_TEMPLATE,
                            dataset=dataset))
    elif data_type == 'test':
        return np.load(
            format_filename(PROCESSED_DATA_DIR,
                            TEST_DATA_TEMPLATE,
                            dataset=dataset))
    else:
        raise ValueError(f'`data_type` not understood: {data_type}')
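A short sketch tying this loader to Example 25 below, which writes the .npy splits it reads back; the dataset key 'music' is an assumption:

# Hypothetical calls; process_data(dataset, ...) must have run first.
train = load_data('music', 'train')
dev = load_data('music', 'dev')
test = load_data('music', 'test')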
Example 12
def cv_split(train_data,
             dev_data,
             cate3_vocab,
             fold=5,
             balanced=True,
             random_state=42):
    def indexing_data(data, indices):
        part_data = {}
        for k in data.keys():
            part_data[k] = [data[k][i] for i in indices]
        return part_data

    all_data = {}
    for key in train_data.keys():
        all_data[key] = train_data[key] + dev_data[key]

    # some categories in the validation set are not in cate3_vocab; default to index 0
    cate3_id_list = [cate3_vocab.get(cate3, 0) for cate3 in all_data['cate3']]
    index_range = np.arange(len(all_data['id']))

    if balanced:
        kf = StratifiedKFold(n_splits=fold,
                             shuffle=True,
                             random_state=random_state)
    else:
        kf = KFold(n_splits=fold, shuffle=True, random_state=random_state)

    for idx, (train_index,
              dev_index) in enumerate(kf.split(index_range, cate3_id_list)):
        train_data_fold = indexing_data(all_data, train_index)
        dev_data_fold = indexing_data(all_data, dev_index)

        pickle_dump(
            format_filename(PROCESSED_DATA_DIR,
                            TRAIN_CV_DATA_TEMPLATE,
                            random=random_state,
                            fold=fold,
                            index=idx), train_data_fold)
        pickle_dump(
            format_filename(PROCESSED_DATA_DIR,
                            DEV_CV_DATA_TEMPLATE,
                            random=random_state,
                            fold=fold,
                            index=idx), dev_data_fold)
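A minimal usage sketch; the inputs are assumed to be dicts of parallel lists sharing keys such as 'id' and 'cate3':

# Hypothetical call; writes fold-indexed train/dev pickles that the
# load_data(..., train_on_cv=True) example at the top of this page can read back.
cv_split(train_data, dev_data, cate3_vocab, fold=5, balanced=True, random_state=42)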
Example 13
    def get(self, url):
        print_ = get_print(self.cw)
        if self._url:
            return self._url
        ydl = ytdl.YoutubeDL(cw=self.cw)
        try:
            info = ydl.extract_info(url)
        except Exception as e:
            ex = type(ytdl.get_extractor(url))(ydl)
            _download_info = getattr(ex, '_download_info', None)
            if _download_info is not None:
                vod_id = ex._match_id(url)
                info = _download_info(vod_id)
                print_(info)
            if 'HTTPError 403' in str(e):
                raise errors.LoginRequired()
            raise

        def print_video(video):
            print_('[{}] [{}] [{}] {}'.format(video['format_id'],
                                              video.get('height'),
                                              video.get('tbr'), video['url']))

        videos = [video for video in info['formats'] if video.get('height')]

        videos = sorted(videos,
                        key=lambda video:
                        (video.get('height', 0), video.get('tbr', 0)),
                        reverse=True)

        for video in videos:
            print_video(video)

        for video in videos:
            if video.get('height', 0) <= get_resolution():  #3723
                video_best = video
                break
        else:
            video_best = videos[-1]
        print_video(video_best)

        video = video_best['url']

        ext = get_ext(video)
        self.title = info['title']
        id = info['display_id']

        if ext.lower() == '.m3u8':
            video = M3u8_stream(video, n_thread=4, alter=alter)
            ext = '.mp4'
        self.filename = format_filename(self.title, id, ext)
        self.url_thumb = info['thumbnail']
        self.thumb = BytesIO()
        downloader.download(self.url_thumb, buffer=self.thumb)
        self._url = video
        return self._url
Example 14
    def __init__(self, url, url_page, title, url_thumb):
        self._url = url
        self.url = LazyUrl(url_page, self.get, self)
        self.id = get_id(url_page)
        self.title = title
        self.filename = format_filename(title, self.id, '.mp4')
        f = IO()
        self.url_thumb = url_thumb
        downloader.download(url_thumb, buffer=f)
        self.thumb = f
Example 15
    def unregister_sensor(self):
        with open('./examples/sensor_id_example', 'r') as content_file:
            j_sensor = json.load(content_file)

        sensor_type = models.Sensor.query.get(j_sensor['id']).type
        self.app.post('/unregister', data=json.dumps(j_sensor))

        assert db.session.query(models.Sensor).count() == 0
        assert db.session.query(models.Event).count() > 0

        assert not isfile(views.schema_prefix + format_filename(sensor_type) + views.schema_suffix)
Example 16
    def print(self, file_name, file_content):
        new_file_name = format_filename(file_name)
        full_file_path = os.path.join(self.print_folder, new_file_name)

        # Save file content to a file
        with open(full_file_path, "wb") as file:
            file.write(file_content)

        # Print the file ("-r" tells lpr to delete it after it is submitted)
        print("Print file {}".format(file_name))
        subprocess.Popen(["/usr/bin/lpr", full_file_path, "-r"])
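A hedged usage sketch; the owner instance and payload names are assumptions:

# Hypothetical call; file_content must be bytes, since the file is opened in "wb" mode.
printer.print('report.pdf', report_bytes)  # saved under print_folder, queued via lpr, removed by -r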
Example 17
    def get(self, url):
        if self._url:
            return self._url
        self.info = get_info(url)

        self.title = self.info['title']
        id = self.info['id']

        video_best = self.info['formats'][-1]
        self._url = video_best['url']
        ext = get_ext(self._url)
        self.filename = format_filename(self.title, id, ext)
        return self._url
Example 18
    def get(self, url):
        print_ = get_print(self.cw)
        if self._url:
            return self._url
        info = extract_info(url, self.cw)

        def print_video(video):
            print_(video)  #
            print_('{}[{}] [{}] [{}] {}'.format('LIVE ', video['format_id'],
                                                video.get('height'),
                                                video.get('tbr'),
                                                video['url']))

        videos = [video for video in info['formats'] if video.get('height')]

        videos = sorted(videos,
                        key=lambda video:
                        (video.get('height', 0), video.get('tbr', 0)),
                        reverse=True)

        for video in videos:
            print_video(video)

        for video in videos:
            if video.get('height', 0) <= get_resolution():  #3723
                video_best = video
                break
        else:
            video_best = videos[-1]
        print_video(video_best)

        video = video_best['url']

        ext = get_ext(video)
        self.title = info['title']
        id = info['display_id']

        if self._live:
            video = utils.LiveStream(video,
                                     headers=video_best.get('http_headers'))
            ext = '.mp4'
        else:
            if ext.lower() == '.m3u8':
                video = M3u8_stream(video, n_thread=4, alter=alter)
                ext = '.mp4'
        self.filename = format_filename(self.title, id, ext)
        self.url_thumb = info['thumbnail']
        self.thumb = BytesIO()
        downloader.download(self.url_thumb, buffer=self.thumb)
        self._url = video
        return self._url
Example 19
    def get(self, url):
        if self._url_video:
            return self._url_video
        cw = self.cw
        print_ = get_print(cw)
        html = downloader.read_html(url)
        soup = Soup(html)

        embedUrl = extract('embedUrl', html, cw)
        if embedUrl:
            raise EmbedUrlError('[pandoratv] EmbedUrl: {}'.format(embedUrl))
        
        uid = extract('strLocalChUserId', html, cw)
        pid = extract('nLocalPrgId', html, cw)
        fid = extract('strFid', html, cw)
        resolType = extract('strResolType', html, cw)
        resolArr = extract('strResolArr', html, cw)
        vodSvr = extract('nVodSvr', html, cw)
        resols = extract('nInfo', html, cw)
        runtime = extract('runtime', html, cw)

        url_api = 'http://www.pandora.tv/external/getExternalApi/getVodUrl/'
        data = {
            'userId': uid,
            'prgId': pid,
            'fid': fid,
            'resolType': resolType,
            'resolArr': ','.join(map(str, resolArr)),
            'vodSvr': vodSvr,
            'resol': max(resols),
            'runtime': runtime,
            'tvbox': 'false',
            'defResol': 'true',
            'embed': 'false',
            }
        session = Session()
        r = session.post(url_api, headers={'Referer': url}, data=data)
        data = json.loads(r.text)
        self._url_video = data['src']

        self.title = soup.find('meta', {'property': 'og:description'})['content']
        
        ext = get_ext(self._url_video)
        self.filename = format_filename(self.title, pid, ext)

        self.url_thumb = soup.find('meta', {'property': 'og:image'})['content']
        self.thumb = BytesIO()
        downloader.download(self.url_thumb, buffer=self.thumb)
        
        return self._url_video
Example 20
    def __init__(self, session, info):
        self.session = session
        self.info = info
        self.url = info['url']
        self.title = info['title']
        self.ext = info['ext']
        self.id = info['id']

        self.fileName = format_filename(self.title, self.id, self.ext)

        self.url_thumb = info['thumbnail_url']
        print('thumb:', self.url_thumb)
        self.thumb = BytesIO()
        downloader.download(self.url_thumb, buffer=self.thumb)
Example 21
    def __init__(self, url):
        ydl = ytdl.YoutubeDL()
        info = ydl.extract_info(url)

        f = info['formats'][-1]
        url_video = f['url']
        self.url = LazyUrl(url, lambda _: url_video, self)

        self.url_thumb = info['thumbnails'][0]['url']
        self.thumb = BytesIO()
        downloader.download(self.url_thumb, buffer=self.thumb)
        self.title = info['title']
        ext = get_ext(url_video)
        self.filename = format_filename(self.title, info['id'], ext)
Example 22
def cross_validation(K_fold, examples, dataset, neighbor_sample_size):
    subsets = dict()
    n_subsets = int(len(examples) / K_fold)
    remain = set(range(len(examples)))  # include the last index (the original range(0, len-1) dropped it)
    for i in reversed(range(0, K_fold - 1)):
        # sample from a sorted list: random.sample() on a set fails on Python 3.11+
        subsets[i] = random.sample(sorted(remain), n_subsets)
        remain = remain.difference(subsets[i])
    subsets[K_fold - 1] = remain
    aggregator_types = ['sum', 'concat', 'neigh']
    for t in aggregator_types:
        count = 1
        temp = {'dataset': dataset, 'aggregator_type': t,
                'avg_auc': 0.0, 'avg_acc': 0.0, 'avg_f1': 0.0, 'avg_aupr': 0.0}
        for i in reversed(range(0, K_fold)):
            test_d = examples[list(subsets[i])]
            val_d, test_data = train_test_split(test_d, test_size=0.5)
            train_d = []
            for j in range(0, K_fold):
                if i != j:
                    train_d.extend(examples[list(subsets[j])])
            train_data = np.array(train_d)
            train_log = train(kfold=count,
                              dataset=dataset,
                              train_d=train_data,
                              dev_d=val_d,
                              test_d=test_data,
                              neighbor_sample_size=neighbor_sample_size,
                              embed_dim=32,
                              n_depth=2,
                              l2_weight=1e-7,
                              lr=2e-2,  # alternatively lr=5e-3
                              optimizer_type='adam',
                              batch_size=2048,
                              aggregator_type=t,
                              n_epoch=50,
                              callbacks_to_add=['modelcheckpoint', 'earlystopping'])
            count += 1
            temp['avg_auc'] += train_log['test_auc']
            temp['avg_acc'] += train_log['test_acc']
            temp['avg_f1'] += train_log['test_f1']
            temp['avg_aupr'] += train_log['test_aupr']
        for key in temp:
            if key == 'aggregator_type' or key == 'dataset':
                continue
            temp[key] = temp[key] / K_fold
        write_log(format_filename(LOG_DIR, RESULT_LOG[dataset]), temp, 'a')
        print(f'Logging Info - {K_fold} fold result: avg_auc: {temp["avg_auc"]}, '
              f'avg_acc: {temp["avg_acc"]}, avg_f1: {temp["avg_f1"]}, avg_aupr: {temp["avg_aupr"]}')
Example 23
    def __init__(self, type, url, title, referer, p=0, multi_post=False):
        self.type = type
        self.url = LazyUrl(referer, lambda _: url, self)
        ext = get_ext(url)
        if ext.lower() == '.php':
            ext = '.mp4'
        if type == 'video':
            id_ = re.find('videos/([0-9a-zA-Z_-]+)', referer, err='no video id')
            self.filename = format_filename(title, id_, ext)  #4287
        elif type == 'image':
            name = '{}_p{}'.format(clean_title(title), p) if multi_post else p
            self.filename = '{}{}'.format(name, ext)
        else:
            raise NotImplementedError(type)
        self.title = title
Example 24
    def get(self, url):
        if self._url:
            return self._url
        m = re.search(PATTERN_VID, url)
        id = m.group('id')
        ext = '.mp4'
        self.title = id  #
        self.filename = format_filename(self.title, id, ext)

        ydl = ytdl.YoutubeDL()
        info = ydl.extract_info(url)

        self._url = info['url']

        return self._url
Example 25
def process_data(dataset: str, neighbor_sample_size: int):
    user_vocab = {}
    item_vocab = {}
    entity_vocab = {}
    relation_vocab = {}

    read_item2entity_file(ITEM2ENTITY_FILE[dataset], item_vocab, entity_vocab)
    train_data, dev_data, test_data = read_rating_file(RATING_FILE[dataset], SEPARATOR[dataset],
                                                       THRESHOLD[dataset], user_vocab, item_vocab)
    adj_entity, adj_relation = read_kg(KG_FILE[dataset], entity_vocab, relation_vocab,
                                       neighbor_sample_size)

    pickle_dump(format_filename(PROCESSED_DATA_DIR, USER_VOCAB_TEMPLATE, dataset=dataset),
                user_vocab)
    pickle_dump(format_filename(PROCESSED_DATA_DIR, ITEM_VOCAB_TEMPLATE, dataset=dataset),
                item_vocab)
    pickle_dump(format_filename(PROCESSED_DATA_DIR, ENTITY_VOCAB_TEMPLATE, dataset=dataset),
                entity_vocab)
    pickle_dump(format_filename(PROCESSED_DATA_DIR, RELATION_VOCAB_TEMPLATE, dataset=dataset),
                relation_vocab)

    train_data_file = format_filename(PROCESSED_DATA_DIR, TRAIN_DATA_TEMPLATE, dataset=dataset)
    np.save(train_data_file, train_data)
    print('Logging Info - Saved:', train_data_file)

    dev_data_file = format_filename(PROCESSED_DATA_DIR, DEV_DATA_TEMPLATE, dataset=dataset)
    np.save(dev_data_file, dev_data)
    print('Logging Info - Saved:', dev_data_file)

    test_data_file = format_filename(PROCESSED_DATA_DIR, TEST_DATA_TEMPLATE, dataset=dataset)
    np.save(test_data_file, test_data)
    print('Logging Info - Saved:', test_data_file)

    adj_entity_file = format_filename(PROCESSED_DATA_DIR, ADJ_ENTITY_TEMPLATE, dataset=dataset)
    np.save(adj_entity_file, adj_entity)
    print('Logging Info - Saved:', adj_entity_file)

    adj_relation_file = format_filename(PROCESSED_DATA_DIR, ADJ_RELATION_TEMPLATE, dataset=dataset)
    np.save(adj_relation_file, adj_relation)
    print('Logging Info - Saved:', adj_relation_file)
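A hedged end-to-end sketch; 'music' is an assumed dataset key with KG_FILE, RATING_FILE and ITEM2ENTITY_FILE configured:

# Hypothetical driver: preprocess once, then load the saved splits (see Example 11).
process_data('music', neighbor_sample_size=8)
train_data = load_data('music', 'train')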
Example 26
    def get(self, url_page):
        if not self._url:
            id = get_id(url_page)
            html = downloader.read_html(url_page)
            soup = Soup(html, unescape=True)
            self.title = soup.find('title').text.replace('- XVIDEOS.COM', '').strip()
            url = re.find(r'''.setVideoHLS\(['"](.+?)['"]\)''', html)
            ext = get_ext(url)
            if ext.lower() == '.m3u8':
                url = playlist2stream(url, n_thread=5)
            url_thumb = soup.find('meta', {'property': 'og:image'}).attrs['content']
            self.thumb = BytesIO()
            downloader.download(url_thumb, buffer=self.thumb)
            self.filename = format_filename(self.title, id, '.mp4')
            self._url = url
        return self._url
Example 27
    def __init__(self, f, info):
        self.title = title = info['title']
        self.id = info['id']
        self.url = f['url']

        self.thumb = BytesIO()
        downloader.download(info['thumbnail'], buffer=self.thumb)

        ext = get_ext(self.url)
        if ext.lower() == '.m3u8':
            raise NotImplementedError('stream')  #
            url = M3u8_stream(self.url, n_thread=4)
        else:
            url = self.url
        self.url = LazyUrl(self.url, lambda x: url, self)
        self.filename = format_filename(title, self.id, ext)
Example 28
    def get(self, url):
        if self._url is None:
            self.info = get_info(url)

            self.title = self.info['title']
            id = self.info['id']
            
            video_best = self.info['formats'][-1]
            self._url = video_best['url']
            ext = get_ext(self._url)
            self.filename = format_filename(self.title, id, ext)
            
            if isinstance(self._url, str) and 'referer=force' in self._url.lower():
                self._referer = self._url
            else:
                self._referer = url
        return self._url, self._referer
Example 29
    def get(self, url):
        if self._url:
            return self._url

        ydl = ytdl.YoutubeDL()
        info = ydl.extract_info(url)
        fs = [f for f in info['formats'] if f['ext'] == 'mp4']
        f = sorted(fs, key=lambda f: f['height'])[-1]
        self._url = f['url']

        self.thumb_url = info['thumbnails'][0]['url']
        self.thumb = IO()
        downloader.download(self.thumb_url, buffer=self.thumb)
        self.title = info['title']
        ext = get_ext(self._url)
        self.filename = format_filename(self.title, info['id'], ext)
        return self._url
Example 30
def main(args) -> None:
    """
    This is the main pipe line to analyze barseq counts.
    """
    # Create the folder that will hold the log file and barcode counts
    runner = Run(args)  # the folder name matches the experiment name
    make_barseq_directories(runner)  # errors out if the folder already exists
    # Add file handler
    fh = logging.FileHandler(runner.log, mode="w")  # creating a log file
    fh.setFormatter(logging.Formatter(
        "%(asctime)s - %(levelname)s - %(module)s - %(message)s",
        datefmt="%Y-%m-%d %H:%M"))

    logger.addHandler(fh)
    logger.info("***** Starting barseq *****")

    # read barcode from fasta files
    logger.info(f"Reading in barcodes from {runner.barcodes.name}")

    # read barcodes
    # barcodes = read_barcodes(runner.barcodes)  # old implementation
    barcodes = read_barcodes_new(runner.barcodes)

    # Process each sequencing file
    seq_files_list = sorted(os.listdir(runner.sequences))
    for seq_file in seq_files_list:
        if not seq_file.endswith(".DS_Store"):
            sample = format_filename(seq_file)
            logger.info(f"Counting Barcodes in {sample}")
            runner.sample_dict[sample+'_F'] = deepcopy(barcodes)
            runner.sample_dict[sample+'_R'] = deepcopy(barcodes)
            # Change cwd
            with Cd(runner.sequences):
                count_barcodes(seq_file, runner.sample_dict, [sample + '_F', sample + '_R'])

    # Write to output
    logger.info(f"Writing results to {runner.path}")
    write_output(runner.sample_dict, barcodes, runner)

    # Confirm completion of barseq
    logger.info("***** barseq is complete! *****")
Example 31
    def read(self):
        page = get_page(self.url)
        videos, info = get_videos(self.url, self.customWidget)
        if not videos:
            raise Exception('No videos')
        for video in videos:
            self.urls.append(video.url)

        thumb = BytesIO()
        downloader.download(info['url_thumb'], buffer=thumb)
        self.setIcon(thumb)
        title = info['title']
        if page is not None:
            title += '_p{}'.format(page)
        title = format_filename(title, self.id_, '.mp4')[:-4]
        n = int(math.ceil(8.0 / len(videos)))
        self.customWidget.print_('n_threads: {}'.format(n))
        self.enableSegment(n_threads=n)
        self.title = title
Example 32
    def __init__(self, info, stream):
        self.info = info
        self.id = info['id']
        self.title = info['name']
        self.brand = info['brand']
        self.url = stream['url']
        self.url_thumb = info['poster_url']
        self.thumb = IO()
        downloader.download(self.url_thumb, buffer=self.thumb)
        ext = os.path.splitext(self.url.split('?')[0].split('#')[0])[1]
        if ext.lower() == '.m3u8':
            print('read m3u8:', self.url)
            ext = '.mp4'
            self.url = M3u8_stream(self.url, deco=decrypt, n_thread=4)
        else:
            size = downloader.get_size(self.url)
            if size <= 0:
                raise Exception('Size is 0')
        self.filename = format_filename('[{}] {}'.format(self.brand, self.title), self.id, ext)
Example 33
    def _process_log(self, started, current_log_file, previous_log_file, dump_file):
        """
        Read records from associated log files starting at `started` time and dump their statistics to `dump_file`.

        @param datetime started: timestamp for the beginning of the tracked period
        @param str current_log_file: path to access log file
        @param str previous_log_file: if in-place rotation of access logs is used, path to log file before current
        @param str dump_file: file to save aggregated data
        """
        file_processing_starts = datetime.datetime.now()

        #Reset all storages
        self.sm.reset(dump_file)

        #Save metadata
        self.sm.get('metadata').set(dump_file, 'daemon_invoked', started.strftime('%Y-%m-%d %H:%M:%S'))
        self.sm.get('metadata').set(dump_file, 'daemon_version', 'v'+daemon_version)

        #Generate file names from a template and timestamps
        file_at_period_start, params_at_period_start = utils.format_filename(current_log_file, self.period_start)
        file_at_started, params_at_started = utils.format_filename(current_log_file, started)

        if not os.path.exists(file_at_started):
            logger.error('File %s is not found and will not be processed' % file_at_started)

        elif file_at_period_start != file_at_started and not os.path.exists(file_at_period_start):
            logger.error('File %s is not found and will not be processed' % file_at_period_start)

        else:
            #If the daemon has just started, it does not have associated seek for the input file
            #and it has to be set to period_start
            if file_at_period_start not in self.seek:
                self.seek[file_at_period_start] = seek_utils.get_seek(
                    file_at_period_start, self.period_start + params_at_period_start['ts'])

            if file_at_period_start == file_at_started:
                #All the records we are interested in are in the same file
                current_file_size = os.stat(file_at_started).st_size

                #Processing the situation when the log was rotated in-place between daemon executions.
                #In this situation we start reading the file from the beginning.
                cur_seek = self.seek[file_at_started]
                read_from_start = current_file_size < cur_seek or cur_seek == 0

                if read_from_start and previous_log_file:
                    replaced_file, params_at_replaced = utils.format_filename(previous_log_file, started)

                    if not os.path.exists(replaced_file):
                        logger.error('File %s is not found and will not be processed' % replaced_file)
                    else:
                        self.seek[replaced_file] = \
                            cur_seek if cur_seek > 0 else seek_utils.get_seek(
                                replaced_file, self.period_start + params_at_replaced['ts'])
                        self._parse_file(dump_file, replaced_file)

                self._parse_file(dump_file, file_at_started, read_from_start, started + params_at_started['ts'])
            else:
                #First read previous file to the end, then current from beginning
                self._parse_file(dump_file, file_at_period_start)
                self._parse_file(dump_file, file_at_started, True, started + params_at_started['ts'])

        #Store execution time in metadata section of the report
        file_processing_ends = datetime.datetime.now()
        worked = file_processing_ends - file_processing_starts
        self.sm.get('metadata').set(dump_file, 'daemon_worked', '%d.%d sec'
                                                                % (worked.seconds, worked.microseconds/10000))

        #Save report
        self.sm.dump(dump_file)
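A hedged invocation sketch; the paths are assumptions, and utils.format_filename is assumed to expand time placeholders in the log-file templates:

# Hypothetical call from the daemon loop; current_log_file may contain
# strftime-style placeholders that utils.format_filename resolves per timestamp.
self._process_log(started=datetime.datetime.now(),
                  current_log_file='/var/log/nginx/access.log',
                  previous_log_file='/var/log/nginx/access.log.1',
                  dump_file='/var/tmp/report.dump')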