Example #1
def collect():
    headers = {'Authorization': 'Bearer ' + os.getenv('BITLY_TOKEN')}
    for link in links:
        utils.download_to_file(
            bitly_clicks_api.replace('{{LINK}}', link),
            utils.basedir() + "bitly/raw/" + utils.today() + '-' + link +
            ".json", headers)

    for link_cs in links_for_country_stats:
        utils.download_to_file(
            bitly_countries_api.replace('{{LINK}}', link_cs),
            utils.basedir() + "bitly/raw-countries/" + utils.today() + '-' +
            link_cs + ".json", headers)
Example #2
def load_hans(n_samples=None,
              filter_label=None,
              filter_subset=None) -> List[TextPairExample]:
    out = []

    if filter_label is not None and filter_subset is not None:
        logging.info("Loading hans subset: {}-{}...".format(
            filter_label, filter_subset))
    else:
        logging.info("Loading hans all...")

    src = join(config.HANS_SOURCE, "heuristics_evaluation_set.txt")
    if not exists(src):
        logging.info("Downloading source to %s..." % config.HANS_SOURCE)
        utils.download_to_file(HANS_URL, src)

    with open(src, "r") as f:
        f.readline()  # skip the header row
        lines = f.readlines()

    if n_samples is not None:
        lines = np.random.RandomState(16349 + n_samples).choice(lines,
                                                                n_samples,
                                                                replace=False)

    for line in lines:
        parts = line.split("\t")
        label = parts[0]

        if filter_label is not None and filter_subset is not None:
            # parts[-3] holds the heuristic name for this example
            if label != filter_label or parts[-3] != filter_subset:
                continue

        if label == "non-entailment":
            label = 0
        elif label == "entailment":
            label = 1
        else:
            raise RuntimeError("unexpected label: " + label)
        s1, s2, pair_id = parts[5:8]  # sentence1, sentence2, pairID columns
        out.append(TextPairExample(pair_id, s1, s2, label))
    return out
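
load_hans() assumes several module-level names; a hedged sketch of what they plausibly look like (the URL and the TextPairExample fields are assumptions inferred from the calls above; config and utils are project modules and are not sketched):

import logging
from os.path import join, exists
from typing import List, NamedTuple

import numpy as np

# assumed download location of the HANS evaluation set
HANS_URL = ('https://raw.githubusercontent.com/tommccoy1/hans/master/'
            'heuristics_evaluation_set.txt')

class TextPairExample(NamedTuple):
    id: str
    premise: str
    hypothesis: str
    label: int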
Example #3
def load_hans_subsets():
    src = join(config.HANS_SOURCE, "heuristics_evaluation_set.txt")
    if not exists(src):
        logging.info("Downloading source to %s..." % config.HANS_SOURCE)
        utils.download_to_file(HANS_URL, src)

    hans_datasets = []
    labels = ["entailment", "non-entailment"]
    subsets = set()
    with open(src, "r") as f:
        for line in f.readlines()[1:]:  # skip the header row
            line = line.split("\t")
            subsets.add(line[-3])
    subsets = sorted(subsets)  # deterministic subset order across runs

    for label in labels:
        for subset in subsets:
            name = "hans_{}_{}".format(label, subset)
            examples = load_hans(filter_label=label, filter_subset=subset)
            hans_datasets.append((name, examples))

    return hans_datasets
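
A possible invocation, iterating the (name, examples) pairs returned above:

for name, examples in load_hans_subsets():
    print(name, len(examples))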
Example #4
def dummy():
    skipped = 0
    for raw_url in args.urls:
        for url in (yield youku.Youku.get_videos(raw_url)):
            print(url)
            skipped += 1
            if skipped <= args.skip:
                continue
            data = yield youku.Youku.get_video_name_and_download_urls(url)
            directory = data[0].replace('/', '_')
            output_basename = directory
            if os.path.exists(output_basename + '.flv') or os.path.exists(output_basename + '.mp4'):
                continue
            print('Downloading %s' % directory)
            urls = data[1]
            if not os.path.exists(directory):
                os.mkdir(directory)
            process = tqdm.tqdm(range(len(urls)), leave=True, mininterval=0)
            # zero-padded filename template, wide enough for len(urls) segments
            template = '%%0%dd.%%s' % math.ceil(decimal.Decimal(len(urls)).log10())
            video_files = []
            for i, durl in enumerate(urls):
                file_suffix = re.search(r'st/(\w+)/fileid', durl).group(1)
                try:
                    next(process)  # advance the progress bar by one segment
                except StopIteration:
                    pass
                path = os.path.join(directory, template % ((i + 1), file_suffix))
                video_files.append(path)
                yield utils.download_to_file(path, durl)
            else:
                # for/else: runs once the segment loop finishes without a break
                try:
                    next(process)
                except StopIteration:
                    pass
                utils.merge_videos(video_files, output_basename)
                shutil.rmtree(directory)
                sys.stderr.write('\n')
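
dummy() is written as a generator-style coroutine: each yield hands an awaitable back to whatever event loop drives it. It also assumes an argparse-style args object; a sketch of plausible arguments (names inferred from args.urls and args.skip, defaults are assumptions):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('urls', nargs='+', help='Youku page URLs to fetch videos from')
parser.add_argument('--skip', type=int, default=0, help='number of videos to skip before downloading')
args = parser.parse_args()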
Example #6
def collect():
    for tag in tags:
        utils.download_to_file(
            github_api + tag,
            utils.basedir() + "downloads/raw/" + utils.today() + '-' + tag + ".json")
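
As in Example #1, the globals are defined elsewhere in the module; an illustrative sketch (the base URL and tag values are assumptions):

import utils

github_api = 'https://api.github.com/repos/'  # assumed base URL; a tag is appended per request
tags = []  # path fragments appended to github_api, one JSON snapshot saved per tag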
Example #7
def new_beta_job(bot, _):
    if not config.jobs.beta.enabled:
        logger.info('android beta job is disabled, exiting job')
        return

    logger.info('starting android beta job...')

    try:
        with open(config.jobs.beta.build_number_file, 'r') as f:
            latest_build_number = f.read().strip()
    except FileNotFoundError:
        latest_build_number = '-1'

    latest_build_number = int(latest_build_number)

    logger.info('last posted build: %d', latest_build_number)

    logger.info('executing request...')
    response = requests.get(config.jobs.beta.url)
    tree = html.fromstring(response.content)

    # absolute XPath to the version heading; brittle if the page layout changes
    version_string = tree.xpath(
        '/html/body/div[1]/div[2]/div/div[1]/div/div[3]/div[6]/h3')[0].text
    logger.info('scraped site version: %s', version_string)

    version_match = re.search(r'Version\s([0-9 .]+)\s\(([0-9]+)\)$',
                              version_string, re.I)
    app_version, build_number = version_match.group(1), version_match.group(2)
    logger.info('scraped app version: %s; scraped build number: %s',
                app_version, build_number)

    build_number = int(build_number)

    if build_number == latest_build_number:
        logger.info('build_number == latest_build_number (%d == %d)',
                    build_number, latest_build_number)
        return
    else:
        logger.info(
            'scraped build number is different from the last posted one')

    apk_name = 'beta_{}_{}.apk'.format(app_version, build_number)
    logger.info('apk_name: %s', apk_name)

    soup = BeautifulSoup(response.text, 'html.parser')
    download_url = u.bs_find_first(soup, 'a')  # the first <a> tag is assumed to be the APK link

    apk_path = u.download_to_file(download_url, apk_name)
    logger.info('apk_path: %s', apk_path)

    logger.info('getting md5/sha1...')
    md5, sha1 = None, None
    try:
        md5, sha1 = u.get_md5_sha1(apk_path)
    except Exception as e:
        error_string = str(e)
        logger.error('error while getting md5/sha1: %s',
                     error_string,
                     exc_info=True)
        bot.send_message(config.telegram.admins[0],
                         'Error while generating md5/sha1: ' + error_string)

    caption = NEW_BETA_CAPTION.format(app_version=app_version,
                                      build_number=build_number)

    logger.info('sending apk file')
    try:
        with open(apk_path, 'rb') as f:
            logger.info('reading and sending the APK...')
            sent_document = bot.send_document(config.jobs.beta.channel_id,
                                              f,
                                              caption=caption,
                                              parse_mode=ParseMode.HTML,
                                              timeout=300)
        logger.info('apk sent, removing file...')
        os.remove(apk_path)

        logger.info('saving last posted build number...')
        with open(config.jobs.beta.build_number_file, 'w+') as f:
            f.write(str(build_number))
    except Exception as e:
        error_string = str(e)
        logger.error('error while sending the apk: %s',
                     error_string,
                     exc_info=True)
        bot.send_message(config.telegram.admins[0],
                         'Error while sending apk: ' + error_string)

        return

    if md5 or sha1:
        # send them in a separate message
        text = NEW_BETA_HASHES.format(md5=md5 or 'error', sha1=sha1 or 'error')
        bot.send_message(config.jobs.beta.channel_id,
                         text,
                         parse_mode=ParseMode.HTML,
                         disable_web_page_preview=True)

    if config.jobs.beta.notify_channel_id:
        # notify in the main channel that a new beta has been released
        bot.send_message(
            config.jobs.beta.notify_channel_id,
            'New Android Beta released: https://t.me/{}/{}'.format(
                sent_document.chat.username, sent_document.message_id),
            disable_web_page_preview=True)

    logger.info('job finished')
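
new_beta_job has the legacy (bot, job) callback signature used by python-telegram-bot's JobQueue; a sketch of how it might be scheduled, plus plausible shapes for the two message templates (interval and wording are assumptions; only the placeholder names are taken from the format() calls above):

# assumed templates; the placeholder names match the code above
NEW_BETA_CAPTION = '<b>New Android beta</b>: {app_version} ({build_number})'
NEW_BETA_HASHES = '<code>md5: {md5}</code>\n<code>sha1: {sha1}</code>'

updater.job_queue.run_repeating(new_beta_job, interval=30 * 60, first=0)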
Example #8
def assets_job(bot, _):
    logger.info('running assets job at %s...',
                datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
    if config.jobs.github.disable_assets:
        logger.info('assets job is disabled, exiting job')
        return

    # assets job: don't send messages to Matrix
    sender = Sender(bot, matrix_client=None)

    for repo_desc, repo_data in repos.repos.items():
        if not repo_data.releases or not repo_data.assets or not repo_data.chat_id:
            continue

        repo_name = repo_data.path
        logger.info('extracting latest release record for %s...', repo_desc)
        query = (Release.select().where(
            Release.repository == repo_name,
            Release.added_on.is_null(False)).order_by(
                Release.added_on.desc()).limit(1))

        if not query:
            logger.info(
                'no release found for repo %s, continuing to next repo',
                repo_name)
            continue

        release = query[0]

        logger.info('repo %s latest release: %d, added on: %s', repo_name,
                    release.release_id, str(release.added_on))
        if release.checked:
            logger.info(
                'we already checked release %d of repo %s, continuing to next repo',
                release.release_id, repo_name)
            continue

        # not all repos define this attribute
        assets_timedelta = repo_data.get('assets_timedelta',
                                         config.jobs.github.assets_timedelta)

        # wait at least assets_timedelta seconds after the release before checking its assets
        tdelta = datetime.now() - release.added_on
        seconds_since_release = tdelta.total_seconds()
        if seconds_since_release < assets_timedelta:
            logger.info(
                'time check: too soon to check assets, elapsed seconds: %d of %d',
                seconds_since_release, assets_timedelta)
            continue

        logger.info(
            'time check: time to check assets, elapsed seconds: %d of %d',
            seconds_since_release, assets_timedelta)

        # mark the release as checked. We will check later whether to send download urls/files according to config
        logger.info('marking release as checked...')
        release.checked = True
        release.save()

        logger.info('getting github repo object...')
        try:
            repo = g.get_repo(repo_name)
        except UnknownObjectException as e:
            logger.error('error while getting repo %s: %s', repo_name, str(e))
            continue

        logger.info('getting github release object...')
        gh_release = repo.get_release(release.release_id)

        logger.info('getting release assets...')
        assets = gh_release.get_assets()
        logger.info('%d assets found', len(list(assets)))

        assets_urls_list = []
        for asset in assets:
            assets_urls_list.append(
                ASSET_STRING.format(asset_download=asset.browser_download_url,
                                    asset_label=asset.label or 'no label'))

        if not assets_urls_list:
            logger.info('no asset to send, continuing to new repo...')
            continue

        assets_list_text = '<b>Assets for release</b> <code>{}</code> <b>of {}</b>:\n\n{}'.format(
            gh_release.tag_name, repo_data.path, '\n'.join(assets_urls_list))
        assets_list_text = append_hashtag(assets_list_text, repo_data.hashtag)
        assets_message, _ = sender.send_message(repo_data, assets_list_text)

        if not repo_data.asset_files:
            logger.info(
                'skipping assets sending as per configuration (release has been marked as checked)'
            )
            continue

        for asset in assets:
            logger.info('downloading asset %s...', asset.name)
            try:
                file_path = u.download_to_file(asset.browser_download_url,
                                               asset.name)
            except Exception as e:
                logger.error('error while downloading asset %s: %s',
                             asset.name,
                             str(e),
                             exc_info=True)
                continue

            try:
                md5, sha1 = u.get_md5_sha1(file_path)
            except Exception as e:
                logger.error(
                    'error while generating md5/sha1 for asset %s: %s',
                    asset.name,
                    str(e),
                    exc_info=True)
                continue

            caption = CAPTION.format(md5=md5,
                                     sha1=sha1,
                                     asset_label=asset.label
                                     or 'non-labeled asset')
            logger.info('sending asset %s...', asset.name)
            try:
                with open(file_path, 'rb') as f:
                    assets_message.reply_document(f,
                                                  caption=caption,
                                                  parse_mode=ParseMode.HTML,
                                                  timeout=300)
            except Exception as e:
                logger.error('error while sending the asset %s: %s',
                             asset.name,
                             str(e),
                             exc_info=True)
                continue

            logger.info('removing file %s...', file_path)
            os.remove(file_path)

        release.sent = True
        release.save()

    logger.info('job finished')
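
The two templates referenced in the loop are module constants; plausible definitions (wording assumed, placeholder names taken from the format() calls above):

# assumed templates matching the placeholders used above
ASSET_STRING = '• <a href="{asset_download}">{asset_label}</a>'
CAPTION = '<code>md5: {md5}</code>\n<code>sha1: {sha1}</code>\n{asset_label}'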
Example #9
def collect():
    for key, url in urls.items():
        utils.download_to_file(
            url,
            utils.basedir() + "docker/raw/" + utils.today() + '-' + key +
            ".json")