def prime_raw_dataset_from_sql(topic_name="abortion"):
    """Build an encoded, de-duplicated text corpus for *topic_name* from SQL.

    Loads the ``imdb_reviews/subwords8k`` TFDS dataset (for its subword
    encoder and as baseline train/test splits), pulls Republican- and
    Democrat-weighted bill rows for the topic from the BillInfoDB, drops
    duplicate content via an MD5 digest, and encodes each surviving entry.

    Parameters
    ----------
    topic_name : str
        Topic substituted into the module-level raw SQL weight queries.

    Returns
    -------
    tuple
        ``(train_dataset, test_dataset, encoded_entry_list, label_list,
        encoder)`` where the encoded entries and labels are parallel lists.
    """
    global ml_sql_retrieve

    dataset, info = tfds.load('imdb_reviews/subwords8k',
                              with_info=True, as_supervised=True)
    encoder = info.features['text'].encoder

    # Lazily create the shared module-level SQL connection on first use.
    if ml_sql_retrieve is None:
        ml_sql_retrieve = SqlStorer(db_name="BillInfoDB",
                                    table_name="ml_bill_info_table")
        ml_sql_retrieve.set_up_connection()

    # NOTE(review): %-interpolation builds raw SQL from topic_name; this is
    # only safe for trusted callers — prefer parameterized queries upstream.
    republican_results = ml_sql_retrieve.execute_raw_query(
        raw_query_str=(sql_republican_weight_request % topic_name))
    democrat_results = ml_sql_retrieve.execute_raw_query(
        raw_query_str=(sql_democrat_weight_request % topic_name))

    primed_results = prime_sql_results(rep_sql_res=republican_results,
                                       dem_sql_res=democrat_results)

    train_dataset, test_dataset = dataset['train'], dataset['test']

    entry_list = []
    label_list = []
    encoded_entry_list = []
    hash_set = set()
    for entry in primed_results:
        # Validate both fields BEFORE appending anything. The original code
        # wrapped the whole body in ``try/except KeyError: continue``, so an
        # entry with content but no "label" was appended to entry_list and
        # encoded_entry_list and then skipped for label_list, leaving the
        # parallel lists misaligned.
        content = entry.get("content")
        if not content or "label" not in entry:
            continue
        # MD5 of the UTF-8 bytes is used purely as a duplicate-content check,
        # not for anything security-sensitive.
        hash_digest = hashlib.md5(content.encode('utf-8')).hexdigest()
        if hash_digest in hash_set:
            continue
        hash_set.add(hash_digest)
        entry_list.append(content)
        encoded_entry_list.append(encoder.encode(content))
        label_list.append(entry["label"])

    return train_dataset, test_dataset, encoded_entry_list, label_list, encoder
def request_candidates_bills():
    """Flask endpoint: return a candidate's bills and votes as a JSON string.

    Reads ``VoteSmartCandID`` and ``VoteSmartPrimaryCategoryId`` (a
    comma-separated list of category ids) from the request query string,
    joins the bill-info and voting-record tables on ``VoteSmartBillId``,
    and returns the matching rows ordered by ``DateIntroduced``.

    Returns
    -------
    str
        JSON array of detailed bill dicts; ``default=str`` handles
        non-serializable values such as datetimes
        (https://stackoverflow.com/a/36142844). Empty array when the
        query matches nothing.
    """
    poly_voting_record_sql_retriever = SqlStorer(
        db_name=cand_bills_db_name, table_name=cand_bills_table_name)
    poly_voting_record_sql_retriever.set_up_connection()

    bills_list = []
    cand_id = request.args.get("VoteSmartCandID")
    category_id = request.args.get("VoteSmartPrimaryCategoryId")
    print("Candidate ID: %s " % cand_id)
    categories = category_id.strip(',').split(',')
    print("Category ID: %s " % categories)

    # SECURITY NOTE(review): request parameters are interpolated directly
    # into raw SQL — an injection risk. SqlStorer.execute_raw_query takes a
    # prebuilt string, so switching to parameterized queries needs an
    # upstream API change; flagged rather than silently altered here.
    category_clause = " OR ".join(
        "B.VoteSmartPrimaryCategoryId='%s'" % category
        for category in categories)
    raw_query = (
        " SELECT B.*,V.* FROM PoliticianInfo.bill_info_table B "
        "inner join PoliticianInfo.politician_voting_record_table V ON "
        "B.VoteSmartBillId = V.VoteSmartBillId "
        "where V.VoteSmartCandID='%s' and (%s) "
        "ORDER BY `DateIntroduced`" % (cand_id, category_clause))

    response = poly_voting_record_sql_retriever.execute_raw_query(
        raw_query, verbose=False)
    if response:
        bills_list = convert_bills_response_2_detailed_dict(response)

    # default=str fixes "datetime.datetime not JSON serializable".
    return json.dumps(bills_list, indent=4, sort_keys=True, default=str)