Example 1
def encode(parallelize=True, hdf5_path=c.RECORDING_DIR, experiment=None):
    # TODO(post v3): Get a couple separate hdf5 files from different
    #  situations / view modes for eval
    hdf5_to_convert_path = get_hdf5_path_to_convert(hdf5_path)
    train_dataset = get_dataset(hdf5_to_convert_path, train=True)
    eval_dataset = get_dataset(hdf5_to_convert_path, train=False)
    buffer_size = 1000
    utils.assert_disk_space(hdf5_path)
    out_path = None
    if experiment is not None:
        # A name was passed in, so skip the interactive prompt below
        out_path = os.path.join(hdf5_path, experiment + c.TFRECORD_DIR_SUFFIX)
    while experiment is None:
        experiment = utils.get_valid_filename(
            input('Enter a name for your dataset: '))
        out_path = os.path.join(hdf5_path, experiment + c.TFRECORD_DIR_SUFFIX)
        if os.path.exists(out_path):
            print('The path %s exists, please choose a new name.' % out_path)
            experiment = None
    if out_path is None:
        raise RuntimeError('tfrecord output path not set')
    os.makedirs(out_path)
    save_dataset(train_dataset, buffer_size,
                 filename=os.path.join(out_path, 'deepdrive_train'),
                 parallelize=parallelize, out_path=out_path)
    save_dataset(eval_dataset, buffer_size,
                 filename=os.path.join(out_path, 'deepdrive_eval'),
                 parallelize=parallelize, out_path=out_path)
Example 2
def main(file_path,
         output_folder,
         file_type,
         download_n_files,
         max_size=None,
         n_jobs=1):
    df = pds.read_csv(file_path, sep=";").sample(frac=1)

    # naively filter the df to get only the desired file_type
    df = df[df.format == file_type]
    if download_n_files:
        df = df.iloc[:download_n_files]
    print(f"There are {len(df)} resources of type {file_type}")
    urls = df["url"].values
    resource_ids = df["id"].values
    dataset_ids = df["dataset.id"].values
    new_ids = dataset_ids + "--" + resource_ids
    organizations = df["dataset.organization"].fillna("NA").apply(
        lambda x: unidecode.unidecode(get_valid_filename(x))).values
    assert len(urls) == len(new_ids)

    if n_jobs > 1:
        success_downloaded = Parallel(n_jobs=n_jobs)(
            delayed(downloader)(url, id, org, output_folder, file_type,
                                max_size)
            for url, id, org in tqdm(list(zip(urls, new_ids, organizations))))
    else:
        success_downloaded = []
        for url, id, org in tqdm(list(zip(urls, new_ids, organizations))):
            success_downloaded.append(
                downloader(url, id, org, output_folder, file_type, max_size))
    print(
        f"I successfully downloaded {sum(success_downloaded)} of {len(success_downloaded)} files"
    )
Example 3
    def watch(self):
        curr_time = datetime.datetime.now().strftime("%Y-%m-%d %H.%M.%S")
        file_name = curr_time + " - " + self.streamer + " - " + get_valid_filename(self.stream_title) + ".ts"
        directory = os.getcwd() + os.path.sep + self.streamer + os.path.sep
        if not os.path.exists(directory):
            os.makedirs(directory)
        output_filepath = directory + file_name
        self.streamer_dict.update({'output_filepath': output_filepath})

        streams = streamlink.streams(self.streamer_dict['stream_info']['channel']['url'])
        try:
            stream = streams[self.stream_quality]
        except KeyError:
            temp_quality = self.stream_quality
            if len(streams) > 0:  # False => stream is probably offline
                # the requested quality is unavailable, so prefer 'best' if offered
                if StreamQualities.BEST.value in streams.keys():
                    self.stream_quality = StreamQualities.BEST.value
                else:
                    self.stream_quality = list(streams.keys())[-1]  # best not in streams? choose best effort quality
            else:
                self.cleanup = True

            if not self.cleanup:
                print('Invalid stream quality: ' + '\'' + temp_quality + '\'')
                print('Falling back to default case: ' + self.stream_quality)
                self.streamer_dict['preferred_quality'] = self.stream_quality
                stream = streams[self.stream_quality]
            else:
                stream = None

        if not self.kill and not self.cleanup and stream:
            print(self.streamer + ' is live. Saving stream in ' +
                  self.stream_quality + ' quality to ' + output_filepath + '.')

            try:
                with open(output_filepath, "ab") as out_file:  # open for [a]ppending as [b]inary
                    fd = stream.open()

                    while not self.kill and not self.cleanup:
                        data = fd.read(1024)

                        # If data is empty the stream has ended
                        if not data:
                            fd.close()
                            out_file.close()
                            break

                        out_file.write(data)
            except streamlink.StreamError as err:
                print('StreamError: {0}'.format(err))  # TODO: test when this happens
            except IOError as err:
                # If file validation fails this error gets triggered.
                print('Failed to write data to file: {0}'.format(err))
            self.streamer_dict.update({'kill': self.kill})
            self.streamer_dict.update({'cleanup': self.cleanup})
            return self.streamer_dict
Example 4
def save_product(product):
    os.makedirs(jumbo_products_path, exist_ok=True)
    try:
        with open(
                os.path.join(
                    jumbo_products_path,
                    get_valid_filename(product['title']) + '.json')
                , 'w') as f:
            json.dump(product, f)
    except KeyError:
        print('Could not save product with id: {}'.format(product['id']))
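
For reference, a hypothetical call to save_product; the shape of the product dict is assumed from the only two fields the function reads, 'title' and 'id':

# Hypothetical usage: 'title' is used to build the filename and 'id' only
# appears in the error message when 'title' is missing.
save_product({'id': '12345', 'title': 'Example Jumbo product'})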
Example 5
def main():
    colorama.init()
    parser = argparse.ArgumentParser(
        description=
        'Fetch top level comments from a reddit post (eg: python3 fetch_comments.py -l https://www.reddit.com/r/AskReddit/comments/75goki/whats_a_movie_to_watch_when_you_want_a_good_cry/)'
    )
    parser.add_argument('-l',
                        '--link',
                        type=str,
                        metavar='',
                        required=True,
                        help='Link of the post')
    parser.add_argument(
        '-s',
        '--sort',
        type=str,
        metavar='',
        choices=['best', 'top', 'new'],
        default='confidence',
        help='Optionally specify order of sort [best or top or new]')
    parser.add_argument(
        '-loc',
        '--location',
        type=str,
        metavar='',
        default=os.getcwd() + '/comments',
        help='Optionally specify the directory/location to be downloaded')
    args = parser.parse_args()

    # initializing userAgent
    ua = get_userAgent()

    print('Connecting to reddit..')
    url = args.link + '.json?sort='
    if args.sort == 'best':
        args.sort = 'confidence'

    top_level_comments = get_comments(url + args.sort, ua)

    filename = get_valid_filename(top_level_comments[0]) + "_comments.txt"

    if not os.path.exists(args.location):
        os.makedirs(args.location)

    location = os.path.join(args.location, filename)
    output_text = '\n'.join(top_level_comments)
    with open(location, mode='w', encoding='utf8') as output_filehandle:
        output_filehandle.write(output_text)
        output_filehandle.write('\n')
    erase_previous_line()
    print('Downloaded comments')
Example 6
def scrape_products():
    for category_url in category_urls:
        print(category_url)
        r = requests.get(category_url)
        category_json = r.json()

        # json_data = None
        # with open('category.json', 'r') as f:
        #     json_data = json.load(f)

        jsonpath_expr = parse('$..navItem.link.href')

        products = [match.value for match in jsonpath_expr.find(category_json)]

        products = [x for x in products if x.startswith('/producten/product')]

        all_products = {}
        for product in products:
            r = requests.get(base_url + product)
            tree = html.fromstring(r.content)
            # json_ld = tree.xpath('string(/html/head/script[7])')

            javascript = tree.xpath('string(/html/body/script/text())')[54:-14]
            javascript = ast.literal_eval(javascript)
            javascript = json.loads(javascript)

            if javascript['product']['state'] == 'ERROR':
                print('ERROR state detected:')
                print(javascript)
                continue

            product_info = javascript['product']['card']
            title = product_info['products'][0]['title']

            os.makedirs('products/ah', exist_ok=True)

            with open('products/ah/{}.json'.format(get_valid_filename(title)), 'w') as f:
                json.dump(product_info, f)

            all_products[title] = product_info
            print(product_info['products'][0]['title'])

    print(products)
Example 7
def _plot_precision_recall_curve(df, test_set_id):
    results_df = df[df.test_set_id == test_set_id]
    results_df = _sort_models_by_list(results_df, default_models_sorter)

    plt.figure(figsize=(4, 4))
    count = 0
    color_list = [
        "#640f79", "#66d9cf", "#d032a3", "#8138fc", "#8fda59", 'black'
    ]
    linestyle_list = ['dotted', 'dashdot', 'solid']
    for _, row in results_df.iterrows():
        color = color_list[count]
        linestyle = linestyle_list[count // 2]
        plt.plot(row.precision_recall_curve[1],
                 row.precision_recall_curve[0],
                 label=row.fancy_model_name,
                 linestyle=linestyle,
                 color=color,
                 linewidth=1.2)
        count += 1

    if FLAGS.show_random_guess:
        no_skill = _calculate_no_skill_model(df, test_set_id)
        plt.plot([0, 1], [no_skill, no_skill],
                 linestyle='--',
                 color='gainsboro',
                 linewidth=1.2,
                 label='Random guess')
        legend = plt.legend(loc='upper left', bbox_to_anchor=(1.05, 1))
    else:
        legend = plt.legend(loc='lower left')
    plt.xlim(0.0, 1.00)
    plt.ylim(0.0, 1.05)
    plt.xlabel('Recall')
    plt.ylabel('Precision')

    for file_format in FLAGS.chart_format_list:
        chart_file_name = test_set_id + '.' + file_format
        chart_file_name = utils.get_valid_filename(chart_file_name)
        chart_file_name = os.path.join(FLAGS.charts_path, chart_file_name)

        plt.savefig(chart_file_name,
                    dpi=600,
                    bbox_extra_artists=(legend, ),
                    bbox_inches='tight')
        print('Saved chart to %s' % chart_file_name)
Example 8
def get_pictures_from_subreddit(data, subreddit, location, nsfw):
    for i in range(len(data)):
        if data[i]['data']['over_18']:
            # if nsfw post and you only want sfw
            if nsfw == 'n':
                continue
        else:
            # if sfw post and you only want nsfw
            if nsfw == 'x':
                continue

        current_post = data[i]['data']
        image_url = current_post['url']
        if '.png' in image_url:
            extension = '.png'
        elif '.jpg' in image_url or '.jpeg' in image_url:
            extension = '.jpeg'
        elif 'imgur' in image_url:
            image_url += '.jpeg'
            extension = '.jpeg'
        else:
            continue

        erase_previous_line()
        print('downloading pictures from r/' + subreddit + '.. ' +
              str((i * 100) // len(data)) + '%')

        # redirects = False prevents thumbnails denoting removed images from getting in
        image = requests.get(image_url, allow_redirects=False)
        if image.status_code == 200:
            try:
                # mode='bx' creates the file for binary writing and fails if it
                # already exists, so posts downloaded earlier are skipped
                with open(location + '/' +
                          get_valid_filename(current_post['title']) + extension,
                          mode='bx') as output_filehandle:
                    output_filehandle.write(image.content)
            except OSError:
                pass
Example 9
    def get_audio_filename(self, text):
        return '/tmp/{}_{}.mp3'.format(
            get_valid_filename(text),
            hashlib.md5(text.encode('utf8')).hexdigest())
Example 10
    def watch(self):
        curr_time = datetime.datetime.now().strftime("%Y-%m-%d %H.%M.%S")
        file_name = curr_time + " - " + self.streamer + " - " + get_valid_filename(
            self.stream_title) + ".ts"
        directory = self._formatted_download_folder(
            self.streamer_login) + os.path.sep
        if not os.path.exists(directory):
            os.makedirs(directory)
        output_filepath = directory + file_name
        self.streamer_dict.update({'output_filepath': output_filepath})

        streams = streamlink.streams('https://www.twitch.tv/' +
                                     self.streamer_login)
        # Occurs when already recording another stream and new streamer (that is already live) is added
        # not sure why this error is thrown..
        # Traceback (most recent call last):
        #   File "C:\Program Files\Python36\lib\threading.py", line 916, in _bootstrap_inner
        #     self.run()
        #   File "E:\Downloads\automatic-twitch-recorder\venv\lib\site-packages\streamlink\stream\segmented.py", line 59, in run
        #     for segment in self.iter_segments():
        #   File "E:\Downloads\automatic-twitch-recorder\venv\lib\site-packages\streamlink\stream\hls.py", line 307, in iter_segments
        #     self.reload_playlist()
        #   File "E:\Downloads\automatic-twitch-recorder\venv\lib\site-packages\streamlink\stream\hls.py", line 235, in reload_playlist
        #     self.process_sequences(playlist, sequences)
        #   File "E:\Downloads\automatic-twitch-recorder\venv\lib\site-packages\streamlink\plugins\twitch.py", line 210, in process_sequences
        #     return super(TwitchHLSStreamWorker, self).process_sequences(playlist, sequences)
        # TypeError: super(type, obj): obj must be an instance or subtype of type
        try:
            stream = streams[self.stream_quality]
        except KeyError:
            temp_quality = self.stream_quality
            if len(streams) > 0:  # False => stream is probably offline
                # the requested quality is unavailable, so prefer 'best' if offered
                if StreamQualities.BEST.value in streams.keys():
                    self.stream_quality = StreamQualities.BEST.value
                else:
                    self.stream_quality = list(streams.keys())[
                        -1]  # best not in streams? choose best effort quality
            else:
                self.cleanup = True

            if not self.cleanup:
                print('Invalid stream quality: ' + '\'' + temp_quality + '\'')
                print('Falling back to default case: ' + self.stream_quality)
                self.streamer_dict['preferred_quality'] = self.stream_quality
                stream = streams[self.stream_quality]
            else:
                stream = None

        if not self.kill and not self.cleanup and stream:
            print(self.streamer + ' is live. Saving stream in ' +
                  self.stream_quality + ' quality to ' + output_filepath + '.')

            try:
                with open(
                        output_filepath,
                        "ab") as out_file:  # open for [a]ppending as [b]inary
                    fd = stream.open()

                    while not self.kill and not self.cleanup:
                        data = fd.read(1024)

                        # If data is empty the stream has ended
                        if not data:
                            fd.close()
                            out_file.close()
                            break

                        out_file.write(data)
            except streamlink.StreamError as err:
                print('StreamError: {0}'.format(
                    err))  # TODO: test when this happens
            except IOError as err:
                # If file validation fails this error gets triggered.
                print('Failed to write data to file: {0}'.format(err))
            self.streamer_dict.update({'kill': self.kill})
            self.streamer_dict.update({'cleanup': self.cleanup})
            return self.streamer_dict
Example 11
    def perform(self):
        if not self.check_if_needed():
            return

        dest_file_name = base_date.strftime("%Y-%m-%d_%H%M%S_") + utils.get_valid_filename(self.name)

        self.process_vars([
            ['DEST_FILENAME', dest_file_name],
            ['SRC_DIR', self.src_dir],
            ['DEST_DIR', self.dest_dir],
        ])

        dest_path = self.dest_dir + dest_file_name

        if self.do_compress in ['gzip', 'store']:
            ext = {'gzip': 'tgz', 'store': 'tar'}[self.do_compress]

            dest_path_compressed = self.dest_dir + dest_file_name + "." + ext
            if os.path.exists(dest_path_compressed):
                report.log_warn("Destination file exists {0}".format(self.cc(dest_path_compressed)))
                return

            dest_path_compressed_tmp = dest_path_compressed + ".tmp"

            report.log_state("Changing directory to {0}".format(self.cc(self.src_dir)))
            os.chdir(self.src_dir)

            report.log_state("Creating archive {0} to {1}...".format(self.cc(self.src_dir), self.cc(dest_path_compressed_tmp)))

            if self.do_compress == 'gzip':
                cmd = "tar --create . | pigz > {0}".format(shlex.quote(dest_path_compressed_tmp))
            else:
                cmd = "tar --create --file {0} .".format(shlex.quote(dest_path_compressed_tmp))

            report.log_command(cmd)
            process = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)

            processLimit = None
            if self.cpu_limit:
                limit_cmd = "cpulimit --lazy --include-children --pid={0} --limit={1}".format(process.pid, self.cpu_limit)
                report.log_command(limit_cmd)
                processLimit = subprocess.Popen(limit_cmd, shell=True)

            r = process.wait()
            if processLimit is not None:
                processLimit.wait()

            tar_output = process.stdout.read()

            if r == 0:
                report.log_state("Moving {0} to {1}".format(self.cc(dest_path_compressed_tmp), self.cc(dest_path_compressed)))
                os.rename(dest_path_compressed_tmp, dest_path_compressed)
            else:
                raise BackupException("tar failed", tar_output)
        else:
            if os.path.exists(dest_path):
                report.log_warn("Destination folder exists {0}".format(self.cc(dest_path)))
                return

            dest_path_tmp = dest_path + "_tmp"
            report.log_state("Copying {0} to {1}...".format(self.cc(self.src_dir), self.cc(dest_path_tmp)))
            try:
                if os.path.exists(dest_path_tmp):
                    shutil.rmtree(dest_path_tmp)
                shutil.copytree(self.src_dir, dest_path_tmp)
                os.rename(dest_path_tmp, dest_path)
            except Exception as err:
                report.log_warn(err)
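
All of the examples above delegate filename sanitising to a get_valid_filename helper that the excerpts never define. A minimal sketch of such a helper, modeled on Django's django.utils.text.get_valid_filename (an assumption; each project may ship its own variant), could look like this:

import re


def get_valid_filename(name):
    # Collapse surrounding whitespace, turn spaces into underscores, and strip
    # every character that is not alphanumeric, a dash, an underscore, or a dot.
    name = str(name).strip().replace(' ', '_')
    cleaned = re.sub(r'(?u)[^-\w.]', '', name)
    if cleaned in ('', '.', '..'):
        raise ValueError('Could not derive a valid filename from %r' % name)
    return cleaned

For example, get_valid_filename("What's a movie to watch?") would return 'Whats_a_movie_to_watch', which is why the snippets can use the result directly as a path component.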