def run_test(test_repo: Dolt, test: str) -> str:
    # ensure table is removed
    if TEST_TABLE in [t.name for t in test_repo.ls()]:
        test_repo.table_rm(TEST_TABLE)

    sysbench_args = [
        'sysbench',
        test,
        '--table-size=1000000',
        '--db-driver=mysql',
        '--mysql-db={}'.format(test_repo.repo_name),
        '--mysql-user=root',
        '--mysql-host=127.0.0.1',
    ]

    # Prepare the test
    prepare_exitcode, prepare_output = _execute(sysbench_args + ['prepare'], os.getcwd())
    if prepare_exitcode != 0:
        logger.error(prepare_output)
        raise SysbenchFailureException(test, 'prepare', prepare_output)

    # Run the test
    run_exitcode, run_output = _execute(sysbench_args + ['run'], os.getcwd())
    if run_exitcode != 0:
        logger.error(run_output)
        raise SysbenchFailureException(test, 'run', run_output)

    return run_output
def commitData(repo: Dolt, table: str, message: str) -> bool:
    # Check to ensure changes need to be added first
    if not repo.status().is_clean:
        repo.add(table)
        repo.commit(message)
        return True

    return False
def addTweetToDatabase(repo: Dolt, table: str, data: dict):
    # TODO: Figure Out If Tweet Still Accessible Despite Some Error Messages
    if 'errors' in data and data['errors'][0]['parameter'] == 'id':
        errorMessage = archiveErrorMessage(data)

        create_table = '''
            UPDATE {table} set isDeleted="{isDeleted}", json="{json}" where id={id}
        '''.format(table=table, id=errorMessage['id'], isDeleted=errorMessage['isDeleted'], json=errorMessage['json'])

        repo.sql(create_table, result_format='csv')
        return

    # Tweet Data
    tweet = extractTweet(data)
    df = getDataFrame(tweet)

    # Use `python3 this-script.py --log=VERBOSE` in order to see this output
    logger.log(VERBOSE, json.dumps(tweet, indent=4))

    writeData(repo=repo, table=table, dataFrame=df, requiredKeys=['id'])
def __init__(self, run=None, database: str = ".", branch: str = 'master'):
    """
    Initialize a new context for Dolt operations with Metaflow.

    run: this is either
        - a FlowSpec when initialized with a running Flow
        - a Flow when looking across for data read/written across runs of a Flow
        - a Run when looking for data read/written by a specific run
    database: this is a path to a location on the filesystem with a Dolt database
    """
    self.run = run
    self.database = database
    self.branch = branch
    self.meta_database = "."
    self.doltdb = Dolt(self.database)

    try:
        self.meta_doltdb = Dolt(os.getcwd())
    except Exception:
        self.meta_doltdb = Dolt.init(os.getcwd())

    current_branch, _ = self.doltdb.branch()
    self.entry_branch = None
    if current_branch.name != branch:
        self.entry_branch = current_branch.name
        self.doltdb.checkout(branch, checkout_branch=False)

    self.table_reads = []
    self.table_writes = []
def setDeletedStatus(repo: Dolt, table: str, tweet_id: str, deleted: bool):
    tweets: Table = Table(table)
    query: QueryBuilder = Query.update(tweets) \
        .set(tweets.isDeleted, int(deleted)) \
        .where(tweets.id == tweet_id)

    repo.sql(query=query.get_sql(quote_char=None), result_format='csv')
def initRepo(path: str, create: bool, url: str = None) -> Dolt:
    # Prepare Repo For Data
    if create:
        repo = Dolt.init(path)
        repo.remote(add=True, name='origin', url=url)
        return repo

    return Dolt(path)
def initRepo(self, path: str, create: bool, url: str = None):
    # Prepare Repo For Data
    if create:
        repo = Dolt.init(path)
        repo.remote(add=True, name='origin', url=url)
        self.repo: Dolt = repo
        return

    self.repo: Dolt = Dolt(path)
def setStreamJSON(repo: Dolt, table: str, tweet_id: str, data: dict):
    sql_converter: conversion.MySQLConverter = conversion.MySQLConverter()
    escaped_json: str = sql_converter.escape(value=json.dumps(data))

    tweets: Table = Table(table)
    query: QueryBuilder = Query.update(tweets) \
        .set(tweets.stream_json, escaped_json) \
        .where(tweets.id == tweet_id)

    repo.sql(query=query.get_sql(quote_char=None), result_format="csv")
def load_to_dolthub(loader_or_loaders: Union[DoltLoader, List[DoltLoader]],
                    clone: bool,
                    push: bool,
                    remote_name: str,
                    remote_url: str,
                    dolt_dir: str = None,
                    dry_run: bool = False):
    """
    This function takes a `DoltLoader` or a list of `DoltLoader` objects, along with repo and remote settings,
    and attempts to execute the loaders.
    :param loader_or_loaders:
    :param dolt_dir:
    :param clone:
    :param push:
    :param remote_name:
    :param dry_run:
    :param remote_url:
    :return:
    """
    if isinstance(loader_or_loaders, list):
        loaders = loader_or_loaders
    else:
        loaders = [loader_or_loaders]

    if clone:
        assert remote_url, 'If clone is True then remote must be passed'
        temp_dir = tempfile.mkdtemp()
        logger.info('Clone is set to true, so ignoring dolt_dir')

    if clone:
        logger.info('Clone set to True, cloning remote {}'.format(remote_url))
        repo = Dolt.clone(remote_url, temp_dir)
    else:
        assert os.path.exists(os.path.join(dolt_dir, '.dolt')), 'Repo must exist locally if not cloned'
        repo = Dolt(dolt_dir)

    logger.info('''Commencing to load to DoltHub with the following options:
        - dolt_dir  {dolt_dir}
        - clone     {clone}
        - remote    {remote}
        - push      {push}
    '''.format(dolt_dir=repo.repo_dir, push=push, clone=clone, remote=remote_name))

    if not dry_run:
        for dolt_loader in loaders:
            branch = dolt_loader(repo)
            if push:
                logger.info('Pushing changes to remote {} on branch {}'.format(remote_name, branch))
                repo.push(remote_name, branch)
def addMediaFiles(repo: Dolt, table: str, tweet_id: str, data: List[str]):
    sql_converter: conversion.MySQLConverter = conversion.MySQLConverter()
    escaped_json: str = sql_converter.escape(value=json.dumps(data))

    media: Table = Table(table)
    query: QueryBuilder = Query.into(media) \
        .insert(tweet_id, escaped_json)

    # query: QueryBuilder = Query.update(media) \
    #     .set(media.file, escaped_json) \
    #     .where(media.id == tweet_id)

    repo.sql(query=query.get_sql(quote_char=None), result_format="csv")
def clone_or_pull_latest(remote_name, checkout_dir):
    """
    Clones the remote to the specified location if checkout_dir/.dolt does not exist, pulls the latest otherwise
    :param remote_name:
    :param checkout_dir:
    :return:
    """
    if os.path.exists(checkout_dir):
        repo = Dolt(checkout_dir)
        repo.pull()
        return repo
    else:
        return clone_repo(remote_name, checkout_dir)
def loader(loader_builder: DoltLoaderBuilder,
           dolt_dir: str,
           clone: bool,
           push: bool,
           remote_name: str,
           dry_run: bool,
           remote_url: str):
    if clone:
        assert remote_url, 'If clone is True then remote must be passed'
        temp_dir = tempfile.mkdtemp()
        logger.info('Clone is set to true, so ignoring dolt_dir')
        repo = Dolt(temp_dir)

    if clone:
        logger.info('Clone set to True, cloning remote {}'.format(remote_url))
        repo.clone(remote_url)
    else:
        assert os.path.exists(os.path.join(dolt_dir, '.dolt')), 'Repo must exist locally if not cloned'
        repo = Dolt(dolt_dir)

    logger.info('''Commencing to load to DoltHub with the following options:
        - dolt_dir  {dolt_dir}
        - clone     {clone}
        - remote    {remote}
        - push      {push}
    '''.format(dolt_dir=repo.repo_dir, push=push, clone=clone, remote=remote_name))

    if not dry_run:
        loaders = loader_builder()
        for dolt_loader in loaders:
            branch = dolt_loader(repo)
            if push:
                logger.info('Pushing changes to remote {} on branch {}'.format(remote_name, branch))
                repo.push(remote_name, branch)
def _import_and_commit(dolt: Dolt,
                       table: str,
                       data: pd.DataFrame,
                       primary_keys: Optional[List[str]],
                       import_mode: str):
    dolt_write.import_df(dolt, table, pd.DataFrame(data), primary_keys, import_mode)
    dolt.add(table)
    dolt.commit('Executed import on table {} in import mode "{}"'.format(table, import_mode))

    commit = dolt.log()[0]
    return {
        'commit_hash': commit.hash,
        'timestamp': commit.ts,
        'author': commit.author,
        'message': commit.message
    }
def lookupCurrentPresident(repo: Dolt) -> dict:
    current_time_query = '''
        SELECT CURRENT_TIMESTAMP;
    '''

    # I probably shouldn't be hardcoding the value of the query
    current_time = repo.sql(current_time_query, result_format='csv')[0]['CURRENT_TIMESTAMP()']
    logger.debug("Current SQL Time: {}".format(current_time))

    current_president_query = '''
        select `Twitter User ID`, `Database Name` from presidents
        where `Start Term`<'{current_date}' and (`End Term`>'{current_date}' or `End Term` is null)
        limit 1;
    '''.format(current_date=current_time)

    return repo.sql(current_president_query, result_format='csv')
def createTableIfNotExists(repo: Dolt, table: str) -> str:
    columns = '''
        `id` bigint unsigned NOT NULL,
        `date` datetime NOT NULL,
        `text` longtext NOT NULL,
        `device` longtext NOT NULL,
        `favorites` bigint unsigned NOT NULL,
        `retweets` bigint unsigned NOT NULL,
        `quoteTweets` bigint unsigned,
        `replies` bigint unsigned,
        `isRetweet` tinyint NOT NULL,
        `isDeleted` tinyint NOT NULL,
        `repliedToTweetId` bigint unsigned,
        `repliedToUserId` bigint unsigned,
        `repliedToTweetDate` datetime,
        `retweetedTweetId` bigint unsigned,
        `retweetedUserId` bigint unsigned,
        `retweetedTweetDate` datetime,
        `expandedUrls` longtext,
        `json` longtext,
        PRIMARY KEY (`id`)
    '''

    settings = '''
        ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
    '''

    create_table = '''
        CREATE TABLE IF NOT EXISTS {table} ({columns}) {settings}
    '''.format(table=table, columns=columns, settings=settings)

    return repo.sql(create_table, result_format='csv')
def handle_api(self):
    # Repo
    repo: Dolt = Dolt(config.ARCHIVE_TWEETS_REPO_PATH)
    table: str = config.ARCHIVE_TWEETS_TABLE

    # Determine Reply and Send It
    send_reply(self=self, repo=repo, table=table)
def analyze_tweet(api: twitter.Api, status: twitter.models.Status,
                  INFO_QUIET: int = logging.INFO + 1,
                  VERBOSE: int = logging.DEBUG - 1):
    # This Variable Is Useful For Debugging Search Queries And Exploits
    status_text = "12:00 A.M. on the Great Election Fraud of 2020!"  # status.full_text

    original_phrase = get_search_keywords(text=status_text, search_word_query='analyze')

    repo: Dolt = Dolt(config.ARCHIVE_TWEETS_REPO_PATH)
    phrase = convert_search_to_query(phrase=original_phrase)
    search_results = database.search_tweets(search_phrase=phrase,
                                            repo=repo,
                                            table=config.ARCHIVE_TWEETS_TABLE)

    # Instantiate Text Processor
    analyzer: HostilityAnalysis = HostilityAnalysis(logger_param=logger, verbose_level=VERBOSE)

    # Load Tweets To Analyze
    for result in search_results:
        logger.log(VERBOSE, "Adding Tweet For Processing: {tweet_id} - {tweet_text}".format(tweet_id=result["id"], tweet_text=result["text"]))
        analyzer.add_tweet_to_process(result)

    analyzer.preprocess_tweets()
    analyzer.process_tweets()
def load_to_dolt(repo: Dolt,
                 dolt_table: str,
                 dolt_pks: List[str],
                 state_metadata: StateMetadata,
                 vote_file_builder: VoteFileBuilder,
                 table_data_builder: TableDataBuilder):
    """
    Load to the dolt dir/table specified using given columns for primary keys.
    :param repo:
    :param dolt_table:
    :param dolt_pks:
    :param state_metadata:
    :param vote_file_builder:
    :param table_data_builder:
    :return:
    """
    logger.info('''Loading data for state {}:
        - dolt_dir   : {}
        - dolt_table : {}
        - dolt_pks   : {}
    '''.format(state_metadata.state, repo.repo_dir(), dolt_table, dolt_pks))

    table_data = files_to_table_data(state_metadata, vote_file_builder, table_data_builder)
    import_list(repo, dolt_table, table_data, dolt_pks, import_mode='update', batch_size=100000)
def count_tweets(search_phrase: str, repo: Dolt, table: str,
                 account_id: Optional[int] = None,
                 hide_deleted_tweets: bool = False,
                 only_deleted_tweets: bool = False) -> int:
    tweets: Table = Table(table)
    query: QueryBuilder = Query.from_(tweets) \
        .select(Count(tweets.id)) \
        .orderby(tweets.id, order=Order.desc) \
        .where(
            Lower(tweets.text).like(search_phrase.lower())
            # TODO: lower(text) COLLATE utf8mb4_unicode_ci like lower('{search_phrase}')
        )

    if account_id is not None:
        # Show Results For Specific Account
        query: QueryBuilder = query.where(tweets.twitter_user_id == account_id)

    if hide_deleted_tweets:
        # Filter Out Deleted Tweets
        query: QueryBuilder = query.where(tweets.isDeleted == 0)
    elif only_deleted_tweets:
        # Only Show Deleted Tweets
        query: QueryBuilder = query.where(tweets.isDeleted == 1)

    # Perform Count Query
    count_result = repo.sql(query=query.get_sql(quote_char=None), result_format="json")["rows"]

    # Retrieve Count of Tweets From Search
    for header in count_result[0]:
        return count_result[0][header]

    return -1
def write_vote(election_year: int, stage: str, precinct: str, county: str, state: str, jurisdiction: str,
               candidate: str, party: str, writein: int, office: str, vote_mode: str, votes: int):
    logging.debug("-------------------------")
    logging.debug(f"Election Year: {election_year}")
    logging.debug(f"Stage: {stage.lower()}")
    logging.debug(f"Precinct: {precinct.upper()}")
    logging.debug(f"County: {county.upper()}")
    logging.debug(f"State: {state.upper()}")
    logging.debug(f"Jurisdiction: {jurisdiction.upper()}")
    logging.debug(f"Candidate: {candidate.upper()}")
    logging.debug(f"Party: {party.upper()}")
    logging.debug(f"Write-In: {writein}")
    logging.debug(f"Office: {office.upper()}")
    logging.debug(f"Vote Mode: {vote_mode.upper()}")
    logging.debug(f"Votes: {votes}")
    logging.debug("-------------------------")

    # election_year,stage,precinct,county,state,jurisdiction,candidate,party,writein,office,vote_mode,votes
    insert_statement = f'''
        replace into vote_tallies (election_year,stage,precinct,county,state,jurisdiction,candidate,party,writein,office,vote_mode,votes)
        values ("{election_year}", "{stage.lower()}", "{precinct.upper()}", "{county.upper()}", "{state.upper()}", "{jurisdiction.upper()}", "{candidate.upper()}", "{party.upper()}", "{writein}", "{office.upper()}", "{vote_mode.upper()}", "{votes}");
    '''

    repo: Dolt = Dolt('working/us-president-precinct-results')

    try:
        # repo.sql(insert_statement)
        sql_statements_file.write(f"{insert_statement}\n")
    except DoltException as e:
        logging.error(f"Insertion Exception: {e}")
def read_table(self, table_name: str, commit: str = None, flow_name: str = None, run_id: str = None) -> pd.DataFrame:
    """
    Returns the specified table as a DataFrame.
    """
    if not current.is_running_flow:
        raise ValueError("read_table is only supported in a running Flow")

    read_meta = self._get_table_read(table_name)
    if commit:
        table = self._get_dolt_table_asof(self.doltdb, table_name, commit)
        read_meta.commit = commit
    elif flow_name and run_id:
        df = read_table_sql(self.meta_doltdb, _get_actions_query(flow_name, run_id, 'read'))
        database = df.database.values[0]
        commit = df.commit.values[0]

        # checkout database and get table ASOF commit
        db = Dolt(database)
        table = self._get_dolt_table_asof(db, table_name, commit)
        read_meta.commit = commit
    else:
        table = read_table(self.doltdb, table_name)
        read_meta.commit = self._get_latest_commit_hash()

    self.table_reads.append(read_meta)
    return table
def retrieveAccountInfo(repo: Dolt, account_id: int) -> dict:
    government: Table = Table("government")
    query: QueryBuilder = Query.from_(government) \
        .select(Star()) \
        .where(government.twitter_user_id == account_id)

    return repo.sql(query=query.get_sql(quote_char=None), result_format='json')["rows"]
def lookupActiveAccounts(repo: Dolt) -> dict:
    government: Table = Table("government")
    query: QueryBuilder = Query.from_(government) \
        .select(government.twitter_user_id, government.first_name, government.last_name) \
        .where(government.archived == 0)

    return repo.sql(query=query.get_sql(quote_char=None), result_format='json')["rows"]
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--dolt-path', help='The location of the Dolt database on the server machine')
    args = parser.parse_args()

    global DOLT
    DOLT = Dolt(args.dolt_path)

    app.run()
def retrieveMissingBroadcastInfo(repo: Dolt, table: str) -> dict:
    tweets: Table = Table(table)
    query: QueryBuilder = Query.from_(tweets) \
        .select(tweets.id, tweets.expandedUrls) \
        .where(tweets.expandedUrls.like("https://twitter.com/i/broadcasts/%")) \
        .where(tweets.broadcast_json.isnull())

    return repo.sql(query=query.get_sql(quote_char=None), result_format='json')["rows"]
def _server_helper(repo: Dolt, request):
    repo.sql_server()

    @retry(delay=2, tries=10, exceptions=(
        sqlalchemy.exc.OperationalError,
        sqlalchemy.exc.DatabaseError,
        sqlalchemy.exc.InterfaceError,
    ))
    def verify_connection():
        conn = repo.get_engine().connect()
        conn.close()
        return repo.get_engine()

    def finalize():
        if repo.server:
            repo.sql_server_stop()

    verify_connection()
    request.addfinalizer(finalize)
def createTableIfNotExists(repo: Dolt, table: str):
    query: CreateQueryBuilder = Query.create_table(table=table) \
        .columns(
            Column("id", "bigint unsigned", nullable=False),
            Column("twitter_user_id", "bigint unsigned", nullable=False),
            Column("date", "datetime", nullable=False),
            Column("text", "longtext", nullable=False),
            Column("device", "longtext", nullable=False),
            Column("favorites", "bigint unsigned", nullable=False),
            Column("retweets", "bigint unsigned", nullable=False),
            Column("quoteTweets", "bigint unsigned"),
            Column("replies", "bigint unsigned"),
            Column("isRetweet", "tinyint", nullable=False),
            Column("isDeleted", "tinyint", nullable=False),
            Column("repliedToTweetId", "bigint unsigned"),
            Column("repliedToUserId", "bigint unsigned"),
            Column("repliedToTweetDate", "datetime"),
            Column("retweetedTweetId", "bigint unsigned"),
            Column("retweetedUserId", "bigint unsigned"),
            Column("retweetedTweetDate", "datetime"),
            Column("expandedUrls", "longtext"),
            Column("json", "longtext"),
            Column("json_v1", "longtext"),
            Column("notes", "longtext")
        ).primary_key("id")

    # TODO: Figure Out How To Add The Below Parameters
    # --------------------------------------------------------------------------------------------------------------
    # KEY `twitter_user_id_idx` (`twitter_user_id`),
    # CONSTRAINT `twitter_user_id_ref` FOREIGN KEY (`twitter_user_id`) REFERENCES `government` (`twitter_user_id`)
    # --------------------------------------------------------------------------------------------------------------
    # ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
    # --------------------------------------------------------------------------------------------------------------

    repo.sql(query=query.get_sql(quote_char=None), result_format="csv")
def loader(loader_builder: DoltLoaderBuilder, dolt_dir: str, dry_run: bool):
    logger.info(
        '''Commencing load to Dolt with the following options:
            - dolt_dir  {dolt_dir}
        '''.format(dolt_dir=dolt_dir)
    )

    if not dry_run:
        loaders = loader_builder()
        for dolt_loader in loaders:
            dolt_loader(Dolt(dolt_dir))
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--state', type=str, help='State to load data for', required=True)
    parser.add_argument('--load-data', )
    parser.add_argument('--dolt-dir', type=str, help='Dolt repo directory')
    parser.add_argument('--start-dolt-server', action='store_true')
    args = parser.parse_args()

    repo = Dolt(args.dolt_dir)
    if args.start_dolt_server:
        logger.info('start-dolt-server detected, starting server sub process')
        repo.sql_server(loglevel='trace')

    state_metadata = build_metadata_helper(args.state)

    load_to_dolt(repo,
                 'national_voting_data',
                 VOTING_DATA_PKS,
                 state_metadata,
                 filepath_to_precinct_file,
                 extract_precinct_voting_data)
def write_candidate(candidate: str):
    insert_statement = f'''
        insert into candidates (name, fec, fec_name) values ("{candidate.upper()}", null, null);
    '''

    repo: Dolt = Dolt('working/us-president-precinct-results')

    try:
        # repo.sql(insert_statement)
        logging.info(f"Added Candidate: {candidate.upper()}")
    except DoltException as e:
        pass  # Silently Fail As Candidate Already In Database