def collect():
    headers = {'Authorization': 'Bearer ' + os.getenv('BITLY_TOKEN')}
    for link in links:
        utils.download_to_file(
            bitly_clicks_api.replace('{{LINK}}', link),
            utils.basedir() + "bitly/raw/" + utils.today() + '-' + link + ".json",
            headers)
    for link_cs in links_for_country_stats:
        utils.download_to_file(
            bitly_countries_api.replace('{{LINK}}', link_cs),
            utils.basedir() + "bitly/raw-countries/" + utils.today() + '-' + link_cs + ".json",
            headers)
def load_hans(n_samples=None, filter_label=None, filter_subset=None) -> List[TextPairExample]:
    out = []

    if filter_label is not None and filter_subset is not None:
        logging.info("Loading hans subset: {}-{}...".format(filter_label, filter_subset))
    else:
        logging.info("Loading hans all...")

    src = join(config.HANS_SOURCE, "heuristics_evaluation_set.txt")
    if not exists(src):
        logging.info("Downloading source to %s..." % config.HANS_SOURCE)
        utils.download_to_file(HANS_URL, src)

    with open(src, "r") as f:
        f.readline()  # skip the TSV header line
        lines = f.readlines()

    if n_samples is not None:
        lines = np.random.RandomState(16349 + n_samples).choice(
            lines, n_samples, replace=False)

    for line in lines:
        parts = line.split("\t")
        label = parts[0]
        if filter_label is not None and filter_subset is not None:
            if label != filter_label or parts[-3] != filter_subset:
                continue
        if label == "non-entailment":
            label = 0
        elif label == "entailment":
            label = 1
        else:
            raise RuntimeError("Unexpected label: %s" % label)
        s1, s2, pair_id = parts[5:8]
        out.append(TextPairExample(pair_id, s1, s2, label))
    return out
def load_hans_subsets():
    src = join(config.HANS_SOURCE, "heuristics_evaluation_set.txt")
    if not exists(src):
        logging.info("Downloading source to %s..." % config.HANS_SOURCE)
        utils.download_to_file(HANS_URL, src)

    hans_datasets = []
    labels = ["entailment", "non-entailment"]
    subsets = set()
    with open(src, "r") as f:
        for line in f.readlines()[1:]:
            line = line.split("\t")
            subsets.add(line[-3])
    subsets = sorted(subsets)  # sort for a deterministic subset order across runs

    for label in labels:
        for subset in subsets:
            name = "hans_{}_{}".format(label, subset)
            examples = load_hans(filter_label=label, filter_subset=subset)
            hans_datasets.append((name, examples))
    return hans_datasets
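# Example usage (a hedged sketch, not part of the original module): computing
# per-subset accuracy over the HANS splits built above. `predict` is a
# hypothetical callable taking (premise, hypothesis) and returning 0 or 1, and
# TextPairExample is assumed to be tuple-like in the (pair_id, s1, s2, label)
# order used by load_hans.
def report_hans_accuracy(predict):
    for name, examples in load_hans_subsets():
        correct = 0
        for pair_id, s1, s2, label in examples:
            correct += int(predict(s1, s2) == label)
        print("{}: {}/{} = {:.3f}".format(
            name, correct, len(examples), correct / len(examples)))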
def dummy():
    skipped = 0
    for raw_url in args.urls:
        for url in (yield youku.Youku.get_videos(raw_url)):
            print(url)
            skipped += 1
            if skipped <= args.skip:
                continue
            data = yield youku.Youku.get_video_name_and_download_urls(url)
            directory = data[0].replace('/', '_')
            output_basename = directory
            # skip videos that were already downloaded and merged
            if os.path.exists(output_basename + '.flv') or os.path.exists(output_basename + '.mp4'):
                continue
            print('Downloading %s' % directory)
            urls = data[1]
            if not os.path.exists(directory):
                os.mkdir(directory)
            process = tqdm.tqdm(range(len(urls)), leave=True, mininterval=0)
            # zero-padded segment filename template, wide enough for the segment count
            template = '%%0%dd.%%s' % math.ceil(decimal.Decimal(len(urls)).log10())
            video_files = []
            for i, durl in enumerate(urls):
                file_suffix = re.search(r'st/(\w+)/fileid', durl).group(1)
                try:
                    next(process)
                except StopIteration:
                    pass
                path = os.path.join(directory, template % ((i + 1), file_suffix))
                video_files.append(path)
                yield utils.download_to_file(path, durl)
            else:
                # for/else: advance the progress bar one final step after all segments finish
                try:
                    next(process)
                except StopIteration:
                    pass
            utils.merge_videos(video_files, output_basename)
            shutil.rmtree(directory)
            sys.stderr.write('\n')
def collect():
    for tag in tags:
        utils.download_to_file(
            github_api + tag,
            utils.basedir() + "downloads/raw/" + utils.today() + '-' + tag + ".json")
def new_beta_job(bot, _):
    if not config.jobs.beta.enabled:
        logger.info('android beta job is disabled, exiting job')
        return

    logger.info('starting android beta job...')

    try:
        with open(config.jobs.beta.build_number_file, 'r') as f:
            latest_build_number = f.read().strip()
    except FileNotFoundError:
        latest_build_number = '-1'

    latest_build_number = int(latest_build_number)
    logger.info('last posted build: %d', latest_build_number)

    logger.info('executing request...')
    page_content = requests.get(config.jobs.beta.url)
    tree = html.fromstring(page_content.content)

    version_string = tree.xpath(
        '/html/body/div[1]/div[2]/div/div[1]/div/div[3]/div[6]/h3')[0].text
    logger.info('scraped site version: %s', version_string)

    version_match = re.search(r'Version\s([0-9 .]+)\s\(([0-9]+)\)$', version_string, re.I)
    app_version, build_number = version_match.group(1), version_match.group(2)
    logger.info('scraped app version: %s; scraped build number: %s', app_version, build_number)

    build_number = int(build_number)
    if build_number == latest_build_number:
        logger.info('build_number == latest_build_number (%d == %d)', build_number,
                    latest_build_number)
        return
    else:
        logger.info('scraped build number is different from the last posted one')

    apk_name = 'beta_{}_{}.apk'.format(app_version, build_number)
    logger.info('apk_name: %s', apk_name)

    soup = BeautifulSoup(page_content.text, 'html.parser')
    download_url = u.bs_find_first(soup, 'a')

    apk_path = u.download_to_file(download_url, apk_name)
    logger.info('apk_path: %s', apk_path)

    logger.info('getting md5/sha1...')
    md5, sha1 = None, None
    try:
        md5, sha1 = u.get_md5_sha1(apk_path)
    except Exception as e:
        error_string = str(e)
        logger.error('error while getting md5/sha1: %s', error_string, exc_info=True)
        bot.send_message(config.telegram.admins[0],
                         'Error while generating md5/sha1: ' + error_string)

    caption = NEW_BETA_CAPTION.format(app_version=app_version, build_number=build_number)

    logger.info('sending apk file')
    try:
        with open(apk_path, 'rb') as f:
            logger.info('reading and sending the APK...')
            sent_document = bot.send_document(config.jobs.beta.channel_id, f, caption=caption,
                                              parse_mode=ParseMode.HTML, timeout=300)

        logger.info('apk sent, removing file...')
        os.remove(apk_path)

        logger.info('saving last posted build number...')
        with open(config.jobs.beta.build_number_file, 'w+') as f:
            f.write(str(build_number))
    except Exception as e:
        error_string = str(e)
        logger.error('error while sending the apk: %s', error_string, exc_info=True)
        bot.send_message(config.telegram.admins[0], 'Error while sending apk: ' + error_string)
        return

    if md5 or sha1:
        # send them in a separate message
        text = NEW_BETA_HASHES.format(md5=md5 or 'error', sha1=sha1 or 'error')
        bot.send_message(config.jobs.beta.channel_id, text, parse_mode=ParseMode.HTML,
                         disable_web_page_preview=True)

    if config.jobs.beta.notify_channel_id:
        # notify in the main channel that a new beta has been released
        bot.send_message(
            config.jobs.beta.notify_channel_id,
            'New Android Beta released: https://t.me/{}/{}'.format(
                sent_document.chat.username, sent_document.message_id),
            disable_web_page_preview=True)

    logger.info('job finished')
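# For reference, a minimal sketch of what a helper like u.get_md5_sha1 could
# look like (an assumption: the real implementation lives in the project's
# utils module and may differ). It streams the file in chunks so large APKs
# never have to fit in memory at once.
import hashlib

def get_md5_sha1_sketch(file_path, chunk_size=1024 * 1024):
    md5 = hashlib.md5()
    sha1 = hashlib.sha1()
    with open(file_path, 'rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b''):
            md5.update(chunk)
            sha1.update(chunk)
    return md5.hexdigest(), sha1.hexdigest()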
def assets_job(bot, _):
    logger.info('running assets job at %s...', datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

    if config.jobs.github.disable_assets:
        logger.info('assets job is disabled, exiting job')
        return

    # assets job: don't send messages to Matrix
    sender = Sender(bot, matrix_client=None)

    for repo_desc, repo_data in repos.repos.items():
        if not repo_data.releases or not repo_data.assets or not repo_data.chat_id:
            continue

        repo_name = repo_data.path

        logger.info('extracting latest release record for %s...', repo_desc)
        query = (Release.select()
                 .where(Release.repository == repo_name, Release.added_on.is_null(False))
                 .order_by(Release.added_on.desc())
                 .limit(1))
        if not query:
            logger.info('no release found for repo %s, continuing to next repo', repo_name)
            continue

        release = query[0]
        logger.info('repo %s latest release: %d, added on: %s', repo_name,
                    release.release_id, str(release.added_on))

        if release.checked:
            logger.info('we already checked release %d of repo %s, continuing to next repo',
                        release.release_id, repo_name)
            continue

        # not all the repos have that attribute
        assets_timedelta = repo_data.get('assets_timedelta', config.jobs.github.assets_timedelta)

        # wait at least an hour before checking the assets
        tdelta = datetime.now() - release.added_on
        seconds_since_release = tdelta.total_seconds()
        if seconds_since_release < assets_timedelta:
            logger.info('time check: too soon to check assets, elapsed seconds: %d of %d',
                        seconds_since_release, assets_timedelta)
            continue

        logger.info('time check: time to check assets, elapsed seconds: %d of %d',
                    seconds_since_release, assets_timedelta)

        # mark the release as checked. We will check later whether to send
        # download urls/files according to config
        logger.info('marking release as checked...')
        release.checked = True
        release.save()

        logger.info('getting github repo object...')
        try:
            repo = g.get_repo(repo_name)
        except UnknownObjectException as e:
            logger.error('error while getting repo %s: %s', repo_name, str(e))
            continue

        logger.info('getting github release object...')
        gh_release = repo.get_release(release.release_id)

        logger.info('getting release assets...')
        assets = gh_release.get_assets()
        logger.info('%d assets found', len(list(assets)))

        assets_urls_list = []
        for asset in assets:
            assets_urls_list.append(ASSET_STRING.format(
                asset_download=asset.browser_download_url,
                asset_label=asset.label or 'no label'))

        if not assets_urls_list:
            logger.info('no asset to send, continuing to new repo...')
            continue

        assets_list_text = '<b>Assets for release</b> <code>{}</code> <b>of {}</b>:\n\n{}'.format(
            gh_release.tag_name, repo_data.path, '\n'.join(assets_urls_list))
        assets_list_text = append_hashtag(assets_list_text, repo_data.hashtag)

        assets_message, _ = sender.send_message(repo_data, assets_list_text)

        if not repo_data.asset_files:
            logger.info('skipping assets sending as per configuration '
                        '(release has been marked as checked)')
            continue

        for asset in assets:
            logger.info('downloading asset %s...', asset.name)
            try:
                file_path = u.download_to_file(asset.browser_download_url, asset.name)
            except Exception as e:
                logger.error('error while downloading asset %s: %s', asset.name, str(e),
                             exc_info=True)
                continue

            try:
                md5, sha1 = u.get_md5_sha1(file_path)
            except Exception as e:
                logger.error('error while generating md5/sha1 for asset %s: %s', asset.name,
                             str(e), exc_info=True)
                continue

            caption = CAPTION.format(md5=md5, sha1=sha1,
                                     asset_label=asset.label or 'non-labeled asset')

            logger.info('sending asset %s...', asset.name)
            try:
                with open(file_path, 'rb') as f:
                    assets_message.reply_document(f, caption=caption,
                                                  parse_mode=ParseMode.HTML, timeout=300)
            except Exception as e:
                logger.error('error while sending the asset %s: %s', asset.name, str(e),
                             exc_info=True)
                continue

            logger.info('removing file %s...', file_path)
            os.remove(file_path)

        release.sent = True
        release.save()

    logger.info('job finished')
def collect():
    for key, url in urls.items():
        utils.download_to_file(
            url,
            utils.basedir() + "docker/raw/" + utils.today() + '-' + key + ".json")
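# All of the collectors above funnel through utils.download_to_file. Below is a
# minimal sketch of such a helper, assuming a requests-based implementation with
# the (url, path, headers) argument order the collectors use (an assumption; the
# real helper's signature and behavior may differ):
import requests

def download_to_file_sketch(url, path, headers=None):
    response = requests.get(url, headers=headers, timeout=60)
    response.raise_for_status()  # fail loudly on HTTP errors
    with open(path, 'wb') as f:
        f.write(response.content)
    return path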