def main():
    lang = "deu"
    docs_path = "data/nel_" + lang + ".json"
    events = open_json(docs_path).keys()
    # save_sim_entity_overlap(articles, lang)

    # ************************* save similarities
    # whole_featurs = np.load("features/" + lang + "_features.npy", allow_pickle=True)  # last saved file
    # save_sims_articles(whole_featurs, lang, is_bert=True, is_entity=False)
    # total_sims_file_name = compute_sims_combinations(whole_featurs, lang, name_postfix='')

    # *************** compute average precision of retrievals,
    # e.g. 'Barack'{ '2323223', 'similarity': {'Barack','8734645', 0.8 } }
    total_sims_file_name = lang + "_total_similarity_file.json"
    dic_sims_articles = open_json("similarity/" + total_sims_file_name)
    keys = dic_sims_articles.keys()
    avg_event = {}
    ls_features = [
        'sim_bert', 'sim_entity', 'sim_obj', 'sim_loc', 'sim_scene',
        'sim_avg_text', 'sim_avg_visual', 'sim_avg_total'
        # 'sim_max_text', 'sim_max_visual', 'sim_max_total'
    ]
    for selected_feature in ls_features:
        print("********* " + selected_feature)
        for event in events:
            print(event)
            queries_by_event = []
            for k0 in keys:
                k = k0.split("/")[0]
                if event == k:
                    # collect all queries of this event
                    queries_by_event.append(dic_sims_articles[k0])
            avg = []
            for i, query in enumerate(queries_by_event):
                y_true0, scores0 = convert_to_class(event, query, selected_feature)
                # leave-one-out: drop the query itself from its own ranking
                y_true = np.zeros([len(y_true0) - 1])
                y_true[:i] = y_true0[:i]
                y_true[i:] = y_true0[i + 1:]
                scores = np.zeros([len(y_true0) - 1])
                scores[:i] = scores0[:i]
                scores[i:] = scores0[i + 1:]
                avg.append(average_precision_score(y_true, scores))
            avg_event[event] = np.mean(avg)
        save_file("similarity/avgs/" + lang + "/" + selected_feature, avg_event)
        print('')
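# `convert_to_class` is called above but not defined in this snippet. A minimal
# sketch of the assumed contract: for one query it returns, over all candidate
# articles, binary relevance labels (1 if the candidate belongs to the same
# event) and the similarity scores for the selected feature. The 'id' field and
# the key layout of query['similarity'] are assumptions, not the actual schema.
def convert_to_class(event, query, selected_feature):
    y_true = [1 if s['id'].split("/")[0] == event else 0
              for s in query['similarity']]  # hypothetical 'id' field
    scores = [s[selected_feature] for s in query['similarity']]
    return np.array(y_true), np.array(scores)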
def create_tagging_schema(directory_source, source, attributes_chosed,
                          raf=False, raf_attribute="", name=""):
    files = os.listdir(directory_source)
    source_sentences = []
    source_example_counts = dict()
    print("Tagging all the sentences of source: " + source + " ...")
    for filename in [file for file in files if file.endswith(".json")]:
        js = utils.open_json(filename, source)
        for attribute in js:
            if raf:
                if (isinstance(js[attribute], str) and attribute != "<page title>"
                        and attribute == raf_attribute):
                    sentence = []
                    for token in utils.tokenizer(attribute):
                        sentence.append((token, "O"))
                    sentence.append(("ENDNAME", "O"))
                    sentence = sentence + tag_sentence(attribute, js[attribute],
                                                       attributes_chosed, source,
                                                       raf, raf_attribute, name)
                    sentence.append(("ENDVALUE", "O"))
                    if useful(sentence) > 0:
                        source_sentences.append((raf_attribute, sentence))
                        source_example_counts.setdefault(raf_attribute, 0)
                        source_example_counts[raf_attribute] += 1
            else:
                if (isinstance(js[attribute], str) and attribute != "<page title>"
                        and [t for t in d.get_predicate_name(attribute, source, True)
                             if t in attributes_chosed]):
                    sentence = []
                    for token in utils.tokenizer(attribute):
                        sentence.append((token, "O"))
                    sentence.append(("ENDNAME", "O"))
                    sentence = sentence + tag_sentence(attribute, js[attribute],
                                                       attributes_chosed, source)
                    sentence.append(("ENDVALUE", "O"))
                    if useful(sentence) > 0:
                        p_name = d.get_predicate_name(attribute, source, True)[0]
                        source_sentences.append((p_name, sentence))
                        source_example_counts.setdefault(p_name, 0)
                        source_example_counts[p_name] += 1
    return (source_example_counts, source_sentences)
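# `useful` is not defined in these snippets. A minimal sketch, assuming it
# counts tokens carrying a non-"O" tag (a sentence only contributes a training
# example if at least one of its tokens was actually tagged):
def useful(sentence):
    """Return the number of tagged (non-"O") tokens in a (token, tag) list."""
    return sum(1 for _token, tag in sentence if tag != "O")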
def difference_in_shares():
    """Finds the difference in shares. Creates a dict to be used by Output"""
    company_dict = utils.open_json(MONITOR)
    share_difference_dict = {}
    for company in company_dict:
        share_change = 1.0 - (company_dict[company]["Initial-share-price"] /
                              company_dict[company]["Current-share-price"])
        maximum = 1 - (company_dict[company]["Initial-share-price"] /
                       max(company_dict[company]["Share-price-list"]))
        share_difference_dict[company] = {}
        share_difference_dict[company]["Change"] = share_change
        share_difference_dict[company]["Max"] = max(
            company_dict[company]["Share-price-list"])
        share_difference_dict[company]["Max-change"] = maximum
        share_difference_dict[company]["Initial"] = company_dict[company][
            "Initial-share-price"]
        share_difference_dict[company]["Current"] = company_dict[company][
            "Current-share-price"]
    return share_difference_dict
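# Quick sanity check of the "Change" formula above (pure arithmetic, independent
# of monitor.json): a rise from 100.0 to 110.0 gives 1 - 100/110, roughly a
# +9.09% change; a current price below the initial yields a negative value.
initial, current = 100.0, 110.0
assert abs((1.0 - initial / current) - 0.090909) < 1e-5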
def get_initial_company_info():
    """Gets the initial information for each company"""
    company_dict = utils.open_json(MONITOR)
    for company in company_dict:
        # Gets symbol for company
        if company_dict[company]["Symbol"] == "unknown":
            try:
                with urllib.request.urlopen(
                        f'https://finance.yahoo.com/_finance_doubledown/'
                        f'api/resource/searchassist;searchTerm={company}'
                ) as response:
                    html = response.read().decode()
                    d = json.loads(html)
                    company_dict[company]["Symbol"] = d['items'][0]['symbol']
            except urllib.error.HTTPError as error:
                utils.write_to_log(f'Error opening URL: {error}')
        # Gets initial share price
        if company_dict[company]["Initial-share-price"] == 1:
            yahoo = Share(company_dict[company]["Symbol"])
            share = yahoo.get_price()
            company_dict[company]["Initial-share-price"] = float(share)
            company_dict[company]["Current-share-price"] = float(share)
    utils.write_to_json(MONITOR, company_dict)
def add_question_ids(infile, subject_metadata):
    question_data = open_json(infile)
    max_q = 0
    for q_id in question_data:
        subjects = question_data[q_id]['subjects']
        new_subject_map = [subject_metadata[d]['new_id'] for d in subjects]
        # keep only leaf subjects: drop any subject that is an ancestor of
        # another subject attached to the same question
        child_subjects = []
        for d1 in subjects:
            is_ok = True
            for d2 in subjects:
                if d1 == d2:
                    continue
                if d1 in subject_metadata[d2]['parents']:
                    is_ok = False
                    break
            if is_ok:
                child_subjects.append(d1)
        question_data[q_id]['new_sub_map'] = new_subject_map
        child_subject_map = [
            subject_metadata[d]['new_id'] for d in child_subjects
        ]
        question_data[q_id]['child_map'] = child_subject_map
        question_data[q_id]['childs'] = child_subjects
        child_whole_map = []
        for child in child_subjects:
            parent = subject_metadata[child]['parents']
            parent = [d for d in parent if d]
            parent = [subject_metadata[d]['new_id'] for d in parent]
            child_whole_map.append(parent)
        question_data[q_id]['child_whole_map'] = child_whole_map
        max_q = max(len(child_whole_map), max_q)
    print(max_q)
    dump_json(infile, question_data)
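# `subject_metadata` is assumed (not shown in this snippet) to map each original
# subject id to a remapped id plus its ancestor chain, e.g.:
# subject_metadata = {
#     3:   {'new_id': 0, 'parents': []},        # root subject
#     57:  {'new_id': 1, 'parents': [3]},       # child of 3
#     212: {'new_id': 2, 'parents': [3, 57]},   # leaf under 57
# }
# With subjects == [3, 57, 212], only 212 survives as a "child" subject above,
# because 3 and 57 each appear in another subject's parent list.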
def add_false_examples(set, target):
    # `set` here is the split name ("train"/"test"), shadowing the builtin
    sentences = []
    for s in [x[1] for x in os.walk(config["DIRECTORY_DATASET"])][0]:
        if s != "www.ebay.com" and s != "www.alibaba.com":
            directory_source = config["DIRECTORY_DATASET"] + s
            files = os.listdir(directory_source)
            print(directory_source + "...")
            for filename in [file for file in files if file.endswith(".json")]:
                js = utils.open_json(filename, s)
                for attribute in js:
                    if (d.get_predicate_name(attribute, s, False)[0] == target
                            and isinstance(js[attribute], str)
                            and js[attribute] != "Black"):
                        # one sentence per matching attribute (reset here so
                        # repeated matches in a file don't accumulate)
                        sentence = []
                        for token in utils.tokenizer(attribute):
                            sentence.append((token, "O"))
                        sentence.append(("ENDNAME", "O"))
                        for token in utils.tokenizer(js[attribute]):
                            sentence.append((token, "O"))
                        sentence.append(("ENDVALUE", "O"))
                        sentences.append(sentence)
    with open("dataset/" + set + "_set.txt", "a") as f:
        for sentence in sentences:
            for (token, tag) in sentence:
                f.write(token + "\t" + tag + "\n")
            f.write("\n")
def main():
    # Get the contents of the config JSON
    jcontents = utils.open_json(JSONPATH)
    auth = (USERNAME, PASSWORD)

    # Make an empty list to store the client objects
    client_objects = []

    # Iterate through the JSON, and make an object for each client
    for json_config in jcontents["technician"]["clients"]:
        new_client = Client(json_config)
        client_objects.append(new_client)

    # Connect to the Office 365 inbox, get a list of emails
    inbox = get_inbox()
    emails_list = inbox.from_folder('Monthly Reports').fetch_next(50)  # magic number

    # Download all the attachments locally
    # This needs a serious rework, it is very unoptimized!
    for client_obj in client_objects:
        for email in emails_list:
            for report_email in client_obj.client_emails:
                if email.getSubject() == report_email["subject"]:
                    email.fetchAttachments()
                    if email.attachments[0].save("./pdfs/"):
                        client_obj.downloaded_report_count += 1
                    else:
                        raise ValueError("File save error!")
        if client_obj.report_count != client_obj.downloaded_report_count:
            raise ValueError("Missing File: {}".format(client_obj.client_name))

    # Parse each document and make a dict of scores
    for client in client_objects:
        sd, ns = soupy.get_score_dict("./pdfs/" + client.client_emails[0]["filename"])
        client.set_scores_dict(sd)
        client.set_network_health_score(ns)

    # For each client, generate a report based on the scores.
    for count, client in enumerate(client_objects):
        # if count == 1:  # Comment in and set a breakpoint for testing.
        #     break
        if client.network_score == 0:
            print("NO MANAGED SERVICES FOR {}!".format(client.client_name))
        else:
            e_email = client.build_email()
            print("Sending email for: {}... Status: ".format(client.client_name), end="")
            print(client.send_email(e_email, auth))
            sleep(10)
    return 0
def clean():
    args = get_args_clean()
    import os
    full_path = os.path.abspath(args["json_file"])
    folder = os.path.dirname(full_path)
    all_imgs = utils.open_json(args["json_file"])
    filtered = utils.filter_by_type(os.listdir(folder), "_c.jpg")
    clean_json = []
    for img in all_imgs:
        for one in filtered:
            if utils.equal(img["file_name"], one):
                clean_json.append(img)
                break
    utils.save_json(clean_json, args["outfile"])
def source_dictionary(source):
    d = dict()
    print("Creating values dictionary for: " + source)
    for filename in os.listdir(config["DIRECTORY_DATASET"] + source):
        js = utils.open_json(filename, source)
        if js != "Not json":
            for attribute in js:
                predicate_name = get_predicate_name(attribute, source, False)[0]
                if isinstance(js[attribute], str) and predicate_name != "Not found":
                    for value in re.split(r'[\(\);,]', js[attribute]):
                        if value.lstrip() and len(value.lstrip()) > 1:
                            d.setdefault(predicate_name, set()).add(value.lstrip())
    return d
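# `get_predicate_name` is used here and in the tagging functions above but is
# not defined in these snippets. A minimal sketch of the assumed contract,
# resolving a raw source attribute to canonical predicate names via a
# hypothetical per-source mapping `PREDICATE_MAP`; returns ["Not found"] when
# nothing matches and `exact` is False:
def get_predicate_name(attribute, source, exact):
    names = PREDICATE_MAP.get(source, {}).get(attribute, [])
    if exact:
        return names
    return names if names else ["Not found"]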
def minus_days():
    """Takes a day away from "Days-left"; removes the company from monitor.json when it reaches 0"""
    company_dict = utils.open_json(MONITOR)
    remove = []
    for company in company_dict:
        if company_dict[company]["Days-left"] > 0:
            company_dict[company]["Days-left"] -= 1
        elif company_dict[company]["Days-left"] == 0:
            remove.append(company)
    for company in remove:
        # Do I want to keep a record of all the companies that have been
        # mentioned and their prices??? Goes here
        del company_dict[company]
    utils.write_to_json(MONITOR, company_dict)
def get_current_shares():
    """Gets current shares, compares them to the initial price, and finds the
    difference. Returns for Output to handle"""
    company_dict = utils.open_json(MONITOR)
    for company in company_dict:
        try:
            yahoo = Share(company_dict[company]["Symbol"])
            yahoo.refresh()
            share = yahoo.get_price()
            company_dict[company]["Current-share-price"] = float(share)
            company_dict[company]["Share-price-list"].append(float(share))
        except (TypeError, ValueError):
            # yahoo.get_price() returns None on error, which makes float()
            # raise TypeError; a non-numeric string raises ValueError
            print("Could not add to the Current share/Share price list")
    utils.write_to_json(MONITOR, company_dict)
def raf_dict(ta, atomic):
    values_dict = dict()
    for s in atomic:
        source = s.split("__")[0]
        raf_attribute = (s.split("__")[1]).split("/")[1]
        print("Creating values dictionary for {0} and attribute {1}".format(
            source, raf_attribute))
        if source in values_dict.keys():
            source_dic = values_dict[source]
        else:
            values_dict.setdefault(source, dict())
            source_dic = dict()
        for filename in os.listdir(config["DIRECTORY_DATASET"] + source):
            js = utils.open_json(filename, source)
            if js != "Not json":
                for attribute in js:
                    if attribute == raf_attribute:
                        if isinstance(js[attribute], list):
                            for val in js[attribute]:
                                for value in re.split(r'(?<!\d)[.](?!\d)|[\(\)\/;,]', val):
                                    value = value.lstrip()
                                    if value and len(value) > 1 and "nikon" not in value.lower():
                                        source_dic.setdefault(ta, set()).add(value)
                        if isinstance(js[attribute], str):
                            for value in re.split(r'(?<!\d)[.](?!\d)|[\(\)\/;,]', js[attribute]):
                                value = value.lstrip()
                                if value and len(value) > 1 and "nikon" not in value.lower():
                                    source_dic.setdefault(ta, set()).add(value)
        values_dict[source] = source_dic
    with open("persistent_files/dizionario.pkl", "wb") as f:
        pickle.dump(values_dict, f)
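# Usage sketch: the pickled dictionary written above can be loaded back with
# the standard library, mirroring the dump.
import pickle

with open("persistent_files/dizionario.pkl", "rb") as f:
    values_dict = pickle.load(f)  # {source: {target_attribute: set(values)}}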
def check_for_companies(self):
    """Checks the list of companies against Trump's tweet to see if any
    companies are mentioned in it. Writes matches into monitor.json"""
    matches = []
    punc = ("!", ",", ".", ":", ";", "@", "?", "(", ")")
    self.tweet = ''.join(
        [letter for letter in self.tweet if letter not in punc]).lower()
    with open(COMPANIES) as f:
        companies = [line.strip() for line in f]
    for word in self.tweet.split():
        # Binary search for the word
        if utils.find(companies, word):
            matches.append(word)
    company_dict = utils.open_json(MONITOR)
    comp_d = {}  # Information that is needed by get_initial/current
    for company in matches:
        comp_d[company] = {}
        comp_d[company]["Date-mentioned"] = "{:%d-%m-%Y %H:%M:%S}".format(
            datetime.datetime.now())
        comp_d[company]["Mentioned by"] = self.handle
        comp_d[company]["Tweet"] = self.original_tweet
        comp_d[company]["Days-left"] = 7
        comp_d[company]["Symbol"] = "unknown"
        comp_d[company]["Initial-share-price"] = 1
        comp_d[company]["Current-share-price"] = 1
        comp_d[company]["Share-price-list"] = []
    company_dict.update(comp_d)
    utils.write_to_json(MONITOR, company_dict)
    return matches
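# `utils.find` is referenced above as a binary search but not shown. A minimal
# sketch assuming `companies` is kept sorted (as a sorted word list on disk):
import bisect

def find(sorted_list, word):
    """Return True if `word` occurs in `sorted_list` (binary search)."""
    i = bisect.bisect_left(sorted_list, word)
    return i < len(sorted_list) and sorted_list[i] == word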
def convert_zero_similarities_to_random_retrieval(lang):
    ls_features = ['sim_bert', 'sim_entity', 'sim_obj', 'sim_loc', 'sim_scene',
                   'sim_avg_text', 'sim_avg_visual', 'sim_avg_total']
    similarity_measures = open_json('similarity/' + lang + '.json')
    for (key, sims) in similarity_measures.items():
        for feat in ls_features:
            # find the smallest non-zero similarity for this feature
            min_val_sim = 1
            for s in sims['similarity']:
                if s[feat] < min_val_sim and s[feat] != 0:
                    min_val_sim = s[feat]
            if min_val_sim == 1:  # every value was zero; fall back to a small cap
                min_val_sim = 0.01
            # replace exact zeros with random values below the smallest non-zero score
            for s in sims['similarity']:
                if s[feat] == 0.0:
                    s[feat] = np.random.random_sample() * min_val_sim
    save_file('similarity/' + lang + '_random_retrieval_replaced_by_zero.json',
              similarity_measures)
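# Tiny demo of the replacement rule above: zeros become uniform draws strictly
# below the smallest non-zero similarity, so candidates tied at 0 get a random
# ordering while still ranking below every genuinely scored candidate.
scores = [0.0, 0.35, 0.0, 0.6]
min_nonzero = min(s for s in scores if s != 0)  # 0.35
scores = [np.random.random_sample() * min_nonzero if s == 0.0 else s
          for s in scores]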
def featurize():
    global student_metadata, df, question_metadata
    TRAIN_DATA = 'public_data/train_data/train_task_1_2.csv'
    TEST_DATA = 'starter_kit/submission_templates/submission_task_1_2.csv'
    ANSWER_DATA = 'public_data/metadata/answer_metadata_task_1_2.csv'
    QUESTION_SUBJECTS = 'public_data/personal_data/question_metadata_task_1_2.json'
    STUDENT_FEATURES = 'public_data/personal_data/student_metadata_task_1_2.json'
    question_metadata = open_json(QUESTION_SUBJECTS)  # child map
    student_metadata = open_json(STUDENT_FEATURES)

    # AnswerId,DateAnswered,Confidence,GroupId,QuizId,SchemeOfWorkId
    answer_df = pd.read_csv(ANSWER_DATA)[
        ['AnswerId', 'DateAnswered', 'Confidence', 'GroupId', 'QuizId']]
    answer_df['Confidence'].fillna(answer_df['Confidence'].mean(), inplace=True)
    answer_df['DateAnswered'] = pd.to_datetime(answer_df['DateAnswered'], errors='coerce')
    print(answer_df.shape)

    # QuestionId,UserId,AnswerId,IsCorrect,CorrectAnswer,AnswerValue
    train_df = pd.read_csv(TRAIN_DATA)
    train_df['TestMask'] = 1
    print('train_df shape: ', train_df.shape)
    # print(train_df.isnull().values.any())
    correct_df = train_df[['QuestionId', 'CorrectAnswer']].drop_duplicates('QuestionId')
    print('correct qs shape: ', correct_df.shape)

    # ,QuestionId,UserId,AnswerId
    test_df = pd.read_csv(TEST_DATA)[['QuestionId', 'UserId', 'AnswerId']]
    test_df = pd.merge(test_df, correct_df, on='QuestionId')
    test_df['IsCorrect'] = 0
    test_df['TestMask'] = 0
    test_df['AnswerValue'] = 1
    print(test_df.shape)
    # print(test_df.isnull().values.any())

    # get answer id info for train
    train_merged_df = pd.merge(train_df, answer_df, on='AnswerId')
    print(train_merged_df.shape)
    print(train_merged_df.isnull().values.any())

    # get answer id info for test
    test_merged_df = pd.merge(test_df, answer_df, on='AnswerId')
    print(test_merged_df.shape)
    print(test_merged_df.isnull().values.any())

    df = pd.concat([train_merged_df, test_merged_df], ignore_index=True, sort=False)
    print(df.shape)

    user_ids = df['UserId'].unique()
    user_data = []
    start_time = time.time()
    with Pool(30) as p:
        user_data = p.map(f, user_ids)
    end_time = time.time()
    print(end_time - start_time)
    print('no of users:', len(user_data))
    dump_json('public_data/converted_datasets/test_1_2.json', user_data)
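# The worker `f` passed to Pool.map above is not shown. A minimal sketch of
# what it plausibly does under these assumptions: it reads the module-level
# `df`, orders one user's answers chronologically, and emits the per-user
# sequences the downstream model expects (the output field names here are
# illustrative, not the project's actual schema).
def f(user_id):
    rows = df[df['UserId'] == user_id].sort_values('DateAnswered')
    return {
        'user_id': int(user_id),
        'q_ids': rows['QuestionId'].tolist(),
        'answers': rows['AnswerValue'].tolist(),
        'labels': rows['IsCorrect'].tolist(),
        'test_mask': rows['TestMask'].tolist(),
    }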
#!/usr/bin/env python3
import json
import tweepy
import datetime
import smtplib
import time
import utils
import schedule
import urllib.request
import urllib.error

from yahoo_finance import Share

config = utils.open_json("./Files/config.json")

# File names
LOG = config["Files"]["Log"]
EMAILS = config["Files"]["Emails"]
TWITTER_NAMES = config["Files"]["Twitter"]
COMPANIES = config["Files"]["Companies"]
GENERIC = config["Files"]["Generic"]
MONITOR = config["Files"]["CompaniesToMonitor"]

# Boolean value
INITIAL_START = config["Files"]["InitialStart"]

# Email/Password info
EMAIL = config["Email-Info"]["Email"]
PASSWORD = config["Email-Info"]["Password"]
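# `open_json` / `utils.open_json` is shared by most of the snippets in this
# collection but its body is not included. A minimal sketch, assuming it simply
# deserializes a JSON file; note that the dataset-tagging snippets above use a
# two-argument variant that returns the string "Not json" on parse failure.
def open_json(path):
    """Read a JSON file and return the parsed object."""
    with open(path, encoding="utf-8") as f:
        return json.load(f)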
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
torch.manual_seed(seedNum)
np.random.seed(seedNum)

file_name = [
    params.task, params.model, params.hidden_dim, params.question_dim,
    params.lr, params.dropout, params.default_dim, params.valid_prob
]
if params.dash:
    file_name.append(params.dash)
file_name.append(params.bidirectional)
if params.model == 'attn':
    file_name.append(params.head)

if params.dash:
    answer_filename = 'data_task_1_2/answer_dash_metadata_task_1_2_extra.json'
    answer_meta = open_json(answer_filename)
else:
    answer_meta = None

train_data = open_json('data_task_1_2/data_1_2.json')
for d in train_data:
    # hold out a random subset of the labelled (test_mask == 1) steps for validation
    d['valid_mask'] = [
        0 if np.random.rand() < params.valid_prob and ds else 1
        for ds in d['test_mask']
    ]
train_dataset = LSTMDataset(train_data, answer_meta=answer_meta)
collate_fn = lstm_collate(is_dash=params.dash == 1)
num_workers = 2
bs = params.batch_size
train_loader = torch.utils.data.DataLoader(train_dataset,
                    type=str,
                    default=random_flower,
                    help='Path to image')
parser.add_argument('--checkpoint',
                    type=str,
                    default='checkpoint.pth',
                    help='Path to checkpoint')
parser.add_argument('--topk',
                    type=int,
                    default=5,
                    help='Top N Classes and Probabilities')
parser.add_argument('--json',
                    type=str,
                    default='cat_to_name.json',
                    help='class_to_name json file')
parser.add_argument('--gpu',
                    type=str,
                    default='cuda',
                    help='GPU or CPU')

arg, unknown = parser.parse_known_args()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
class_name = open_json(arg.json)
model = load_checkpoint(arg.checkpoint)
checkpoint = torch.load(arg.checkpoint)
image = process_image(arg.image_dir)
probs, classes = predict(random_flower, model)
prediction_test(class_name, classes, probs, random_folder)
import discord
import random

from utils import get_value_from_json, open_json

# Read secrets
client_secret = get_value_from_json('ClientSecret', './secrets.json')
client = discord.Client()

# Read image dictionary
image_dictionary = open_json('./image_dictionary.json')


# Events
@client.event
async def on_ready():
    print('Bot successfully logged in as: {0.user}'.format(client))


@client.event
async def on_message(message):
    # Checks if the user typed one of the keywords in the image dictionary
    for key in image_dictionary:
        if message.content.upper() == key:
            with open(image_dictionary[key], 'rb') as image:
                await message.channel.send(file=discord.File(image))
            break

    # Coinflip simulation
    if message.content.upper() == '!COINFLIP':
file_name = [
    params.model, params.hidden_dim, params.question_dim, params.lr,
    params.dropout, params.mix_active, params.concat_hidden_dim,
    params.concat_dim
]
file_name = [str(d) for d in file_name]
params.file_name = '_'.join(file_name)

seedNum = 221
np.random.seed(seedNum)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
torch.manual_seed(seedNum)
random.seed(seedNum)

question_meta = open_json('data_task_4/question_metadata_task_3_4.json')
train_data_path = os.path.normpath('data_task_4/train_task_4.csv')
valid_data_path = os.path.normpath('data_task_4/valid_task_4.csv')

valid_df = pd.read_csv(valid_data_path)
# n_student x 948: answer values 1 to 4, -1 where unanswered
valid_data = pivot_df(valid_df, 'AnswerValue')
# n_student x 948: correctness 1/0, -1 where unanswered
valid_binary_data = pivot_df(valid_df, 'IsCorrect')

train_df = pd.read_csv(train_data_path)
train_data = pivot_df(train_df, 'AnswerValue')
train_binary_data = pivot_df(train_df, 'IsCorrect')

train_dataset = FFDataset(train_data, train_binary_data, question_meta)
valid_dataset = FFDataset(valid_data, valid_binary_data, question_meta)
num_workers = 3
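# `pivot_df` is assumed to reshape the long-format answer table into a dense
# (n_students x n_questions) matrix with -1 marking unanswered questions. A
# minimal pandas sketch (the real code likely also reindexes the columns to
# the full 948-question id range so every matrix has identical shape):
def pivot_df(df, value_col):
    wide = df.pivot_table(index='UserId', columns='QuestionId',
                          values=value_col, aggfunc='first')
    return wide.fillna(-1).to_numpy()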
import os
import utils_arcgis_gender as utils  # aliased: the calls below use `utils.`
from bs4 import BeautifulSoup
import json
import copy

from arcgis.gis import GIS

# ====================================
# Upload to ArcGIS into staging folder
# ====================================
release = '2021.01'

# Layer info template
layer_info = utils.open_json('utilities/layerinfo.json')
# print(layer_info)
layer_info_properties = list(layer_info.keys())

# minset_catalog
minset_catalog = utils.open_json('master_data/minset_catalog.json')

main_fields = utils.select_dict(
    utils.tsv2dictlist('master_data/ts_catalog_edited.csv'),
    {'main_statistics_field': '1'})
# print(main_fields[0])

column_aliases = utils.open_json('master_data/column_aliases_edited.json')
# print(column_aliases[0])
def merge():
    args = get_args_merge()
    jsons = []
    for j in args["json_file"]:
        jsons.extend(utils.open_json(j))
    utils.save_json(jsons, args["outfile"])
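# `get_args_merge` is not shown. A minimal argparse sketch matching how `merge`
# consumes its result (dict-style access, multiple input files, one output
# path); the flag names and defaults are assumptions:
import argparse

def get_args_merge():
    parser = argparse.ArgumentParser(description='Merge image-list JSON files')
    parser.add_argument('json_file', nargs='+', help='input JSON files')
    parser.add_argument('--outfile', default='merged.json', help='output path')
    return vars(parser.parse_args())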