def main(config: dict, output_dir: Path):
    session = create_session(config['db']['url'])
    context = config['print_export']['context']
    default_tz = dateutil.tz.gettz(config['print_export']['default_tz'])
    if not default_tz:
        raise Exception('Invalid time zone')
    exported_tweets = session.query(ExportedTweet).all()
    tweets_per_page = config['print_export']['tweets_per_page']
    output_filename_template = string.Template(
        config['print_export']['output_filename_template'])
    for i in range(len(exported_tweets) // tweets_per_page + 1):
        start = i * tweets_per_page
        end = start + tweets_per_page
        tweets = exported_tweets[start:end]
        if not tweets:
            return
        dict_tweets = [t.to_dict(default_tz) for t in tweets]
        output_filename = output_filename_template.substitute(
            i=f'{i:03}', **dict_tweets[0])
        output_path = output_dir / output_filename
        output_path.parent.mkdir(parents=True, exist_ok=True)
        with output_path.open('w') as f:
            render_template(
                f,
                tweets=dict_tweets,
                context=context,
                package=config['print_export'].get('template_package'),
                path=config['print_export'].get('template_path'),
            )
    session.close()
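# Note: create_session() is not defined in this section, but every main() here
# obtains its SQLAlchemy session through it. The sketch below shows what it is
# assumed to do (build an engine from the URL, ensure the tables exist, return
# a session); the real helper may differ.
from sqlalchemy import create_engine
from sqlalchemy.orm import Session, declarative_base

Base = declarative_base()  # assumption: models such as ExportedTweet derive from this


def create_session(db_url: str) -> Session:
    # Create an engine for the configured database URL, make sure the tables
    # exist, and hand back a session bound to that engine.
    engine = create_engine(db_url)
    Base.metadata.create_all(engine)  # assumption: tables are created on demand
    return Session(engine)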
def main(config: dict):
    if not config['db'].get('host'):
        logger.error('Old database is not configured. Nothing to do')
        return
    conn = psycopg2.connect(
        host=config['db']['host'],
        database=config['db']['database'],
        user=config['db']['user'],
        password=config['db']['password'],
    )
    cur = conn.cursor()
    session = create_session(config['db']['url'])
    session.configure(autoflush=False, expire_on_commit=False)
    migrate_page_urls(session, cur, config['db']['table_urls'])
    migrate_archived_page_urls(session, cur, config['db']['table_archives'])
    migrate_pages(session, cur, config['db']['table_pages'])
    migrate_page_lines(session, cur, config['db']['table_lines'])
    migrate_parsed_page_lines(session, cur, config['db']['table_parsed'])
    migrate_tweets(session, cur, config['db']['table_reviewed'])
    migrate_posted_tweets(session, cur, config['db']['table_posted'])
    migrate_exported_tweets(session, cur, config['db']['table_print_export'])
    session.close()
    conn.close()
def main(config: dict):
    feeds = config['feeds']
    dates = [
        datetime.datetime.fromisoformat(d) for d in config['archive_dates']
    ]
    session = create_session(config['db']['url'])
    for feed in feeds:
        if not feed.get('name') or not feed.get('url'):
            continue
        for date in dates:
            if count(
                    session.query(ArchivedPageURL).filter(
                        ArchivedPageURL.feed_url == feed['url'],
                        ArchivedPageURL.date == date,
                    )):
                continue
            archived_url = find_closest_snapshot_url(feed['url'], date)
            archived_page_url = ArchivedPageURL(
                feed_url=feed['url'],
                archived_url=archived_url,
                date=date,
            )
            session.add(archived_page_url)
            session.commit()
    session.close()
def main(config: dict, secrets: dict, interactive: bool, dry_run: bool):
    session = create_session(config['db']['url'])
    approved_tweets = session.query(Tweet).filter(
        Tweet.status == TweetReviewStatus.approved)
    posted_tweets = session.query(PostedTweet).all()
    posted_tweets_parsed = [t.parsed for t in posted_tweets]
    pending_tweets = [
        t for t in approved_tweets if t.parsed not in posted_tweets_parsed
    ]
    total_approved_tweets = count(approved_tweets)
    total_posted_tweets = len(posted_tweets)
    total_pending_tweets = len(pending_tweets)
    logger.info('Number of approved tweets: %d', total_approved_tweets)
    logger.info('Number of posted tweets: %d', total_posted_tweets)
    logger.info('Number of tweets to post: %d', total_pending_tweets)
    if not total_pending_tweets:
        logger.warning('Nothing to do, all tweets have already been posted')
        return
    i = random.randint(0, total_pending_tweets - 1)
    tweet = pending_tweets[i]
    template_str = deep_get(config, ['post_tweet', 'tweet_template'],
                            default='${text} ${url}')
    text = Template(template_str).substitute(text=tweet.text, url=tweet.url)
    logger.warning(
        '%d/%d/%d posting tweet "%s"',
        i,
        total_pending_tweets,
        total_approved_tweets,
        text,
    )
    if interactive:
        inp = input('Are you sure you want to post this tweet? [y/N] ')
        if inp != 'y':
            print('Bailing out!')
            return
    status_id = post_tweet(text, secrets, dry_run)
    if not status_id:
        return
    posted_tweet = PostedTweet.from_tweet(tweet, text, status_id)
    session.add(posted_tweet)
    session.commit()
    name = config['post_tweet']['profile_name']
    description = Template(
        config['post_tweet']['profile_description_template']).substitute(
            n_posted=total_posted_tweets + 1, n_approved=total_approved_tweets)
    logger.warning(
        'Updating profile, name: "%s", description: "%s"',
        name,
        description,
    )
    update_profile(name, description, secrets, dry_run)
    session.close()
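# Note: deep_get() is used here and in several functions below to read nested
# configuration keys with a default and an optional conversion. Its
# implementation is not shown in this section; the sketch below is inferred
# from the call sites, and its exact signature is an assumption.
from typing import Any, Callable, Optional, Sequence


def deep_get(
    d: dict,
    keys: Sequence[str],
    default: Any = None,
    process: Optional[Callable[[Any], Any]] = None,
) -> Any:
    # Walk down the nested dictionaries; fall back to the default as soon as a
    # key is missing, and apply the optional conversion to the found value.
    for key in keys:
        if not isinstance(d, dict) or key not in d:
            return default
        d = d[key]
    return process(d) if process else d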
def main(
    config: dict,
    cache_path: str,
    dry_run: bool,
):
    since = deep_get(
        config,
        ['download_pages', 'since'],
        default=None,
        process=datetime.datetime.fromisoformat,
    )
    wait_interval = (
        deep_get(config, ['download_pages', 'wait_min'], default=0,
                 process=int),
        deep_get(config, ['download_pages', 'wait_max'], default=0,
                 process=int),
    )
    timeout = deep_get(config, ['download_pages', 'timeout'], default=10,
                       process=int)
    session = create_session(config['db']['url'])
    logger.info(
        'Selecting pages to download since %s',
        since or 'the beginning of time',
    )
    page_urls_by_feeds = select_page_urls_to_download(session, since)
    print_stats(page_urls_by_feeds)
    page_urls_with_feed_names = {}
    for feed_name, page_urls in page_urls_by_feeds.items():
        for page_url in page_urls:
            if page_url not in page_urls_with_feed_names:
                page_urls_with_feed_names[page_url] = feed_name
    total = len(page_urls_with_feed_names)
    logger.info('Pages to download: %d', total)
    if dry_run:
        logger.warning('This is just a dry run, not downloading any pages')
        return
    page_urls_with_feed_names_list = list(page_urls_with_feed_names.items())
    random.shuffle(page_urls_with_feed_names_list)
    for i, (page_url, feed_name) in enumerate(page_urls_with_feed_names_list):
        logger.info('%d/%d Downloading %s', i + 1, total, page_url)
        text = download_page_text(
            cache_path=cache_path,
            feed_name=feed_name,
            page_url=page_url,
            wait_interval=wait_interval,
            timeout=timeout,
        )
        if text is not None:
            page = Page(url=page_url, text=text)
            session.add(page)
            session.commit()
    session.close()
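# Illustrative configuration fragment for the page-download step above. The
# key names come from the code; the values are made up and only show the
# expected shape.
example_config = {
    'db': {'url': 'sqlite:///covid_chance.db'},
    'download_pages': {
        'since': '2020-03-01T00:00:00',
        'wait_min': 1,
        'wait_max': 5,
        'timeout': 10,
    },
}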
def main(config: dict, cache_path: Path, approved: bool = False):
    session = create_session(config['db']['url'])
    if approved:
        tweets = session.query(Tweet).filter(
            Tweet.status == TweetReviewStatus.approved)
    else:
        tweets = session.query(PostedTweet).all()
    for tweet in tweets:
        exported_tweet = print_export_tweet(cache_path, tweet)
        if exported_tweet and not count(
                session.query(ExportedTweet).filter(
                    ExportedTweet.text == exported_tweet.text)):
            session.add(exported_tweet)
            session.flush()
    session.commit()
    session.close()
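# Note: count() is applied to SQLAlchemy queries throughout this section
# (here, in the archive lookup, and in the review loop). It is assumed to be a
# thin wrapper around Query.count(), as sketched below.
from sqlalchemy.orm import Query


def count(query: Query) -> int:
    # Run a SELECT COUNT(*) for the given query and return the number of rows.
    return query.count()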
def main(config: dict, cache_path: str):
    feeds = config['feeds']
    timeout = deep_get(config, ['download_feeds', 'timeout'], default=30,
                       process=int)
    session = create_session(config['db']['url'])
    for feed in feeds:
        if not feed.get('name') or not feed.get('url'):
            continue
        for archived_page_url in session.query(ArchivedPageURL).filter(
                ArchivedPageURL.feed_url == feed['url'],
                ArchivedPageURL.archived_url.isnot(None),
        ):
            logger.info(
                'Found archived feed %s %s',
                archived_page_url.date,
                archived_page_url.archived_url,
            )
            cache_file_path = (
                Path(cache_path)
                / 'feeds'
                / safe_filename(archived_page_url.archived_url)
                / 'feed_pages.csv'
            )
            if cache_file_path.is_file():
                logger.info('Reading from cache')
                with cache_file_path.open('r') as f:
                    page_urls = [
                        clean_url(page_url) for (page_url,) in csv.reader(f)
                    ]
            else:
                try:
                    page_urls = download_feed(
                        archived_page_url.archived_url, timeout=timeout
                    )
                except Exception:
                    logger.error(
                        'Failed to download %s', archived_page_url.archived_url
                    )
                    continue
                cache_file_path.parent.mkdir(parents=True, exist_ok=True)
                with cache_file_path.open('w') as f:
                    writer = csv.writer(f, lineterminator='\n')
                    writer.writerows((page_url,) for page_url in page_urls)
            save_page_urls(
                session, feed['name'], page_urls, archived_page_url.date
            )
    session.close()
def main(config: dict, secrets: dict, dry_run: bool):
    session = create_session(config['db']['url'])
    api = twitter.Api(
        consumer_key=secrets['consumer_key'],
        consumer_secret=secrets['consumer_secret'],
        access_token_key=secrets['access_token_key'],
        access_token_secret=secrets['access_token_secret'],
        tweet_mode='extended',
    )
    screen_name = 'covid_chance'
    max_pages = 100
    max_id = None
    for _ in range(max_pages):
        last_id = check_posted_tweets(session, api, screen_name, max_id)
        if not last_id:
            break
        max_id = last_id - 1
    session.close()
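# Note: check_posted_tweets() is only visible here through the pagination
# loop: it processes one timeline page starting at max_id and returns the
# lowest status id it saw, or a falsy value when there is nothing left to
# fetch. The sketch below captures that contract using python-twitter's
# GetUserTimeline; what the real helper does with each status (presumably
# reconciling it with the PostedTweet table) is an assumption.
from typing import Optional

import twitter


def check_posted_tweets(session, api: twitter.Api, screen_name: str,
                        max_id: Optional[int]) -> Optional[int]:
    # Fetch one page of the account's timeline, starting at max_id.
    statuses = api.GetUserTimeline(screen_name=screen_name, max_id=max_id,
                                   count=200)
    if not statuses:
        return None
    # Assumption: each fetched status would be checked against the PostedTweet
    # table here.
    return min(status.id for status in statuses)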
def main(config: dict):
    session = create_session(config['db']['url'])
    feeds = config['feeds']
    writer = csv.DictWriter(
        sys.stdout,
        fieldnames=(
            'feed_name',
            'n_pages',
            'n_lines',
            'n_parsed',
            'n_approved',
        ),
        quoting=csv.QUOTE_NONNUMERIC,
        lineterminator='\n',
    )
    writer.writeheader()
    for feed in feeds:
        if feed['name']:
            feed_stats = calc_feed_stats(session, feed['name'])
            writer.writerow(feed_stats)
    session.close()
def main(config: dict):
    feeds = config['feeds']
    timeout = deep_get(config, ['download_feeds', 'timeout'], default=30,
                       process=int)
    session = create_session(config['db']['url'])
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [
            executor.submit(
                download_and_save_feed,
                session,
                feed_name=feed['name'],
                feed_url=feed['url'],
                timeout=timeout,
            )
            for feed in feeds
            if feed.get('name') and feed.get('url')
        ]
        for future in concurrent.futures.as_completed(futures):
            try:
                future.result()
            except Exception as e:
                logger.error('Exception: %s', e)
    session.close()
def main(config, review_all: bool, incl_approved: bool):
    session = create_session(config['db']['url'])
    parsed_page_lines = session.query(ParsedPageLine).filter(
        ParsedPageLine.parsed != '')
    reviewed_tweets = session.query(Tweet).filter(
        Tweet.status != TweetReviewStatus.none)
    approved_tweets = [
        t for t in reviewed_tweets if t.status == TweetReviewStatus.approved
    ]
    rejected_tweets = [
        t for t in reviewed_tweets if t.status == TweetReviewStatus.rejected
    ]
    if review_all:
        pending_parsed_page_lines = parsed_page_lines
    else:
        reviewed_tweets_parsed = [
            tweet.parsed for tweet in session.query(Tweet).filter(
                Tweet.status != TweetReviewStatus.none)
        ]
        pending_parsed_page_lines = parsed_page_lines.filter(
            ParsedPageLine.parsed.notin_(reviewed_tweets_parsed))
    pending_tweets = [
        Tweet.from_parsed_page_line(parsed_page_line)
        for parsed_page_line in pending_parsed_page_lines
    ]
    if not review_all:
        if incl_approved:
            pending_tweets += approved_tweets
        else:
            invalid_approved_tweets = [t for t in approved_tweets if t.invalid]
            pending_tweets += invalid_approved_tweets
    total_pending_tweets = len(pending_tweets)
    logger.info('Number of matching lines: %d',
                session.query(PageLine).count())
    logger.info('Number of parsed tweets: %d', count(parsed_page_lines))
    logger.info('Number of approved tweets: %d', len(approved_tweets))
    logger.info('Number of rejected tweets: %d', len(rejected_tweets))
    logger.info('Number of tweets to review: %d', total_pending_tweets)
    i = 0
    while i < len(pending_tweets):
        tweet = pending_tweets[i]
        print_tweet(
            tweet,
            i=i + 1,
            total=total_pending_tweets,
            highlight=True,
        )
        inp = None
        while inp is None or (inp not in ('y', 'n', 'e', 'q', 's', 'p', '')):
            inp = rlinput('Do you like this tweet? '
                          '"y" or Enter = yes, '
                          '"n" = no, '
                          '"e" = edit, '
                          '"s" = skip (ask next time again), '
                          '"p" = show previous tweet, '
                          '"q" = quit \n'
                          '> ')
        if inp == 'q':
            break
        if inp == 's':
            i = i + 1
            continue
        if inp == 'p':
            i = max(i - 1, 0)
            continue
        if inp in ('y', ''):
            tweet.status = TweetReviewStatus.approved
        elif inp == 'n':
            tweet.status = TweetReviewStatus.rejected
        elif inp == 'e':
            edited_text = None
            while edited_text is None:
                edited_text = rlinput(
                    'Enter new text or delete it to reject the tweet.\n> ',
                    tweet.edited or tweet.parsed,
                )
            tweet.edited = edited_text
            if edited_text == '':
                tweet.status = TweetReviewStatus.rejected
            else:
                tweet.status = TweetReviewStatus.approved
        else:
            raise NotImplementedError('Invalid input')
        if inspect(tweet).transient:
            session.add(tweet)
        session.commit()
        i = i + 1
    session.close()
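# Note: rlinput() is called above both with and without a prefill argument,
# which points at the common readline pattern of pre-seeding the input buffer.
# A minimal sketch, assuming GNU readline is available:
import readline


def rlinput(prompt: str, prefill: str = '') -> str:
    # Insert the prefill text into the line buffer before prompting, then
    # reset the startup hook so later input() calls start empty.
    readline.set_startup_hook(lambda: readline.insert_text(prefill))
    try:
        return input(prompt)
    finally:
        readline.set_startup_hook()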