def test_loadFileArgsError(self):
    try:
        ujson.load("[]")
    except TypeError:
        pass
    else:
        assert False, "expected TypeError"
def load_cooc_dict():
    global cw_dict, c_dict
    liblogger.info("load cooc dict")
    pxy_cache_file = cooc_dict_file + ".pxy.cache"
    py_cache_file = cooc_dict_file + ".py.cache"
    if using_cache and os.path.exists(pxy_cache_file) and os.path.exists(py_cache_file):
        cw_dict = json.load(open(pxy_cache_file))
        c_dict = json.load(open(py_cache_file))
        return
    cooc_dict = json.load(open(cooc_dict_file))
    cw_dict = defaultdict(int)
    c_dict = defaultdict(int)
    for w in cooc_dict:
        #ctxs = [eval(ctx) for ctx in cooc_dict[w].keys()]
        for ctx in cooc_dict[w]:
            count = cooc_dict[w][ctx]
            cw = (w, ctx)
            cw_dict[cw] += count
            c_dict[ctx] += count
    liblogger.info("norm cooc dict for P(x, y)")
    cw_sum = float(sum(cw_dict.values()))
    for cw in cw_dict:
        cw_dict[cw] = math.log(cw_dict[cw] / cw_sum)
    # NOTE: cw_dict has tuple keys, which the stdlib json cannot serialize;
    # see the string-key sketch below.
    json.dump(cw_dict, open(pxy_cache_file, "w"))
    liblogger.info("ctx dict P(y)")
    c_sum = float(sum(c_dict.values()))
    for c in c_dict:
        c_dict[c] = math.log(c_dict[c] / c_sum)
    json.dump(c_dict, open(py_cache_file, "w"))
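# A minimal sketch (not part of the original module): the stdlib json cannot
# serialize the (w, ctx) tuple keys built above, so one workaround is a string
# encoding for the pair keys; the separator choice here is an assumption.
def dump_pair_dict(pair_dict, path, sep="\t"):
    # encode (word, ctx) tuple keys as "word<sep>ctx" strings for JSON
    json.dump({sep.join(k): v for k, v in pair_dict.items()}, open(path, "w"))

def load_pair_dict(path, sep="\t"):
    # decode the string keys back into (word, ctx) tuples
    return {tuple(k.split(sep)): v for k, v in json.load(open(path)).items()}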
def test_orderbook():
    variable_order_book = Book()
    control_order_book = Book()
    with open('testdata/messages.json') as messages_json_file:
        messages = json.load(messages_json_file)
    with open('testdata/beginning_level_3.json') as begin_json_file:
        beginning_level_3 = json.load(begin_json_file)
    with open('testdata/ending_level_3.json') as end_json_file:
        ending_level_3 = json.load(end_json_file)
    try:
        assert beginning_level_3['sequence'] + 1 == messages[0]['sequence']
        assert ending_level_3['sequence'] == messages[-1]['sequence']
    except AssertionError:
        print("Problem with sample data sequences")
    variable_order_book.get_level3(beginning_level_3)
    start = time.time()
    for message in messages:
        variable_order_book.process_message(message)
    end = time.time()
    print('messages per sec: {0}'.format(int(len(messages) / (end - start))))
    control_order_book.get_level3(ending_level_3)
    dict_compare(variable_order_book.asks.price_map,
                 control_order_book.asks.price_map, price_map=True)
    dict_compare(variable_order_book.asks.order_map,
                 control_order_book.asks.order_map, order_map=True)
def extract_json_data(self, filename, option):
    '''
    Imports .json files from peeringdb and returns a list of dictionaries
    with all the retrieved IXP information.
    Input:
        a) filename: A .json file name.
        b) option: Flag to download the file.
    Output:
        a) A list of dictionaries.
    '''
    try:
        with open(self.homepath + '/database' + filename) as data_file:
            obj = ujson.load(data_file)
    except Exception:
        print(filename + ' was not found.')
        if not self.downloader.download_peering(option):
            print("Could not download " + filename +
                  ". Copying from the default database.")
            try:
                copyfile(self.libpath + '/database/Default' + filename,
                         self.homepath + '/database' + filename)
            except Exception:
                print('Could not copy ' + filename + ' from the default database.')
        try:
            with open(self.homepath + '/database' + filename) as data_file:
                obj = ujson.load(data_file)
        except Exception:
            print('Could not open ' + filename + '. Exiting.')
            exit(1)
    return obj['data']
def get_translation_percentage(self, locale_path: Text, locale: Text) -> int:
    # backend stats
    po = polib.pofile(self.get_po_filename(locale_path, locale))
    not_translated = len(po.untranslated_entries())
    total = len(po.translated_entries()) + not_translated

    # frontend stats
    with open(self.get_json_filename(locale_path, locale)) as reader:
        for key, value in ujson.load(reader).items():
            total += 1
            if value == '':
                not_translated += 1

    # mobile stats
    with open(os.path.join(locale_path, 'mobile_info.json')) as mob:
        mobile_info = ujson.load(mob)
    try:
        info = mobile_info[locale]
    except KeyError:
        if self.strict:
            raise
        info = {'total': 0, 'not_translated': 0}
    total += info['total']
    not_translated += info['not_translated']

    return (total - not_translated) * 100 // total
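# A doctest-style sketch of the closing arithmetic above, with the helper
# factored out for illustration (the name is hypothetical; note a zero total
# would raise ZeroDivisionError):
def translation_percentage(total: int, not_translated: int) -> int:
    """Integer percentage of translated strings.

    >>> translation_percentage(200, 50)
    75
    """
    return (total - not_translated) * 100 // total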
def _load(logger, tests_root, manifest, types=None, meta_filters=None, allow_cached=True):
    # "manifest" is a path or file-like object.
    manifest_path = (manifest if isinstance(manifest, string_types)
                     else manifest.name)
    if allow_cached and manifest_path in __load_cache:
        return __load_cache[manifest_path]

    if isinstance(manifest, string_types):
        if os.path.exists(manifest):
            logger.debug("Opening manifest at %s" % manifest)
        else:
            logger.debug("Creating new manifest at %s" % manifest)
        try:
            with open(manifest) as f:
                rv = Manifest.from_json(tests_root,
                                        fast_json.load(f),
                                        types=types,
                                        meta_filters=meta_filters)
        except IOError:
            return None
        except ValueError:
            logger.warning("%r may be corrupted", manifest)
            return None
    else:
        rv = Manifest.from_json(tests_root,
                                fast_json.load(manifest),
                                types=types,
                                meta_filters=meta_filters)

    if allow_cached:
        __load_cache[manifest_path] = rv
    return rv
def setUp(self):
    with open("tests/data/square.geojson") as f:
        self.square_geojson = json.load(f)
    with open("tests/data/square.topojson") as f:
        self.square_topojson = json.load(f)
    with open("tests/data/multipolygons_spherical.geojson") as f:
        self.ref = json.load(f)
def reading_vqa_data(vqa_dir, section):
    ans = 'mscoco_%s2014_annotations.json' % section
    with (vqa_dir / ans).open() as file_:
        ans_data = json.load(file_)
    image_by_id = {}
    answers_by_id = {}
    for answer in ans_data['annotations']:
        image = str(answer['image_id'])
        mca = answer['multiple_choice_answer']
        img = '0' * (12 - len(image)) + image
        s = '/data/%s/images' % section
        s = s + '/COCO_%s2014_' % section + img + '.jpg'
        image_by_id[answer['question_id']] = s
        answers_by_id[answer['question_id']] = mca
    filename = ('MultipleChoice_mscoco_'
                '%s2014_questions.json' % section)
    with (vqa_dir / filename).open() as file_:
        ques_data = json.load(file_)
    # fallback for undersized images; without this, the first undersized
    # image would raise UnboundLocalError
    prev_image = None
    for question in ques_data['questions']:
        text = question['question']
        ques_id = question['question_id']
        options = question['multiple_choices']
        image_path = image_by_id[ques_id]
        image = Image.open(image_path)
        if min(image.size) < IMAGE_SIZE:
            image_path = prev_image
            image_by_id[ques_id] = image_path
        else:
            if answers_by_id[ques_id] == 'yes':
                prev_image = image_path
        yield ques_id, image_by_id[ques_id], text, options, answers_by_id[ques_id]
def load_place_savers(user_dir):
    """
    This function loads the following place-saving parameters:
    1. cur_hop - Current hop of collection algorithm
    2. cur_user_list - List of users collected during current hop
    3. next_user_list - List of users to collect on next hop
    4. added_topics_for_cur_hop - Topics added from current hop (if relevant to sampling method)
    5. unavailable_accounts - List of unavailable accounts
    6. finished_users - Users that have already been collected

    :param user_dir: Directory where profile information is saved
    :return place_saver_obj: Python dictionary of the aforementioned fields
    """
    # Load object
    try:
        jfid = open(os.path.join(user_dir, "place_saver_v1.txt"))
        place_saver_obj = ujson.load(jfid)
        jfid.close()
    except ValueError:
        jfid = open(os.path.join(user_dir, "place_saver_v2.txt"))
        place_saver_obj = ujson.load(jfid)
        jfid.close()
    except IOError:
        print "The object 'place_saver' does not exist, creating it now"
        place_saver_obj = {}
    # Make all necessary fields in case they don't already exist
    if "cur_user_list" not in place_saver_obj.keys():
        place_saver_obj["cur_user_list"] = set([])
    if "next_user_list" not in place_saver_obj.keys():
        place_saver_obj["next_user_list"] = set([])
    if "cur_hop" not in place_saver_obj.keys():
        place_saver_obj["cur_hop"] = 0
    if "added_topics_for_cur_hop" not in place_saver_obj.keys():
        place_saver_obj["added_topics_for_cur_hop"] = set([])
    if "unavailable_accounts" not in place_saver_obj.keys():
        place_saver_obj["unavailable_accounts"] = set([])
    if "finished_users" not in place_saver_obj.keys():
        place_saver_obj["finished_users"] = {}
    jsons = filter(lambda k: re.match("userInfo_*", k), os.listdir(user_dir))
    for jj in range(len(jsons)):
        if jj % 200 == 0:
            print "Check profile JSON {} of {}".format(jj + 1, len(jsons))
        try:
            full_filename = os.path.join(user_dir, jsons[jj])
            if os.path.getsize(full_filename) == 0:
                continue
            jfid = open(full_filename)
            profile = ujson.load(jfid)
            jfid.close()
            if profile["id"] in place_saver_obj["finished_users"].keys():
                continue
            else:
                place_saver_obj["finished_users"][profile["id"]] = jsons[jj]
        except ValueError:
            continue
    # Ensure that all fields are set objects
    for kk in place_saver_obj.keys():
        if (kk != "finished_users") and (kk != "cur_hop"):
            place_saver_obj[kk] = set(place_saver_obj[kk])
    return place_saver_obj
def combine_dicts():
    with open('title10to100000.json') as tag200, open('title100000plus.json') as tag1500:
        tag200dict = ujson.load(tag200)
        tag500dict = ujson.load(tag1500)
        newdict = dict(chain(tag200dict.items(), tag500dict.items()))
        with open('titletagwords.json', 'w') as write:
            ujson.dump(newdict, write)
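# Illustrative note on the merge above: dict() keeps the value seen last in
# the chain, so entries from title100000plus.json win on duplicate titles.
# The data below is made up for the demonstration.
from itertools import chain

a = {"shared": 1, "only_a": 2}
b = {"shared": 99, "only_b": 3}
assert dict(chain(a.items(), b.items())) == {"shared": 99, "only_a": 2, "only_b": 3}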
def __init__(self, path, writer_queue=None):
    """Initialize using path to file and optional thread-safe queue.

    Queue is used for json serializable data to be written to file when
    self.write_queued() is called. If the file at 'path' doesn't exist it
    will be created.
    """
    self.path = os.path.realpath(os.path.expanduser(path))
    if not os.path.exists(self.path):
        print("Persistence file %s does not exist yet, creating it..." % self.path)
        json.dump({}, open(self.path, 'w'))
    else:
        # check for json-ness
        try:
            json.load(open(self.path))
            LOG.debug("Loaded existing persistence file %s.", os.path.relpath(self.path))
        except ValueError as err:
            raise ValueError("The persistence file -> %s is not "
                             "a valid json file. | %s"
                             % (os.path.relpath(self.path), err))
    if writer_queue and not isinstance(writer_queue, Queue.Queue):
        raise TypeError('writer_queue should be a Queue.Queue.')
    elif writer_queue:
        self.synq = writer_queue
        self.synq._persisted = set()
    else:
        self.synq = None
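# A usage sketch for the constructor above; the class name Persister and the
# path are assumptions for illustration (Python 2, matching the Queue.Queue
# check in the constructor):
import Queue

q = Queue.Queue()
store = Persister("~/.myapp/state.json", writer_queue=q)
q.put({"last_run": "2015-01-01"})
# a later store.write_queued() call would drain the queue to disk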
def addin_dubbed_video_mappings(node_data, lang=en_lang_code):
    # Get the dubbed videos from the spreadsheet and substitute them
    # for the video and topic attributes of the returned data struct.
    build_path = os.path.join(os.getcwd(), "build")

    # Create a dubbed_video_mappings.json in the build folder.
    if os.path.exists(os.path.join(build_path, "dubbed_video_mappings.json")):
        logging.info("Dubbed videos json already exists at %s" % (DUBBED_VIDEOS_MAPPING_FILEPATH))
    else:
        main()

    # Get the list of video ids from dubbed video mappings
    lang_code = get_lang_name(lang).lower()
    dubbed_videos_path = os.path.join(build_path, "dubbed_video_mappings.json")
    with open(dubbed_videos_path, "r") as f:
        dubbed_videos_load = ujson.load(f)
    dubbed_videos_list = dubbed_videos_load.get(lang_code)
    # If dubbed_videos_list is None, the language code is not available in
    # the dubbed video mappings.
    if not dubbed_videos_list:
        return node_data

    # Get the current youtube_ids and topic_paths from the khan api node data.
    youtube_ids = []
    topic_paths = []
    for node in node_data:
        node_kind = node.get("kind")
        if node_kind == NodeType.video:
            youtube_ids.append(node.get("youtube_id"))
        if node_kind == NodeType.topic:
            topic_paths.append(node.get("path"))

    en_nodes_path = os.path.join(build_path, "en_nodes.json")
    with open(en_nodes_path, "r") as f:
        en_node_load = ujson.load(f)

    en_node_list = []
    # en_nodes.json must have the same data structure as the node_data
    # variable from the khan api.
    for node in en_node_load:
        node_kind = node.get("kind")
        if node_kind == NodeType.video:
            youtube_id = node["youtube_id"]
            if youtube_id not in youtube_ids:
                if youtube_id in dubbed_videos_list:
                    node["youtube_id"] = dubbed_videos_list[youtube_id]
                    node["translated_youtube_lang"] = lang
                en_node_list.append(node)
                youtube_ids.append(youtube_id)
        # Append every topic that's not in the topic_paths list.
        if node_kind == NodeType.topic:
            if node["path"] not in topic_paths:
                en_node_list.append(node)
                topic_paths.append(node["path"])

    node_data += en_node_list
    return node_data
def main():
    parser = argparse.ArgumentParser(description="Analysis scripts for LexNorm in W-NUT 2015")
    parser.add_argument("--pred", required=True,
                        help="A JSON file: Your predictions over test data formatted in JSON as training data")
    parser.add_argument("--oracle", required=True,
                        help="A JSON file: The oracle annotations of test data formatted in JSON as training data")
    args = parser.parse_args()
    predicates = json.load(open(args.pred))
    training_list = json.load(open(args.oracle))
    oov_detection_performance(training_list, predicates)
def LoadData(self):
    fp = gzip.open('data/dictbase/word_pos.txt.gz')
    self.word_pos = json.load(fp)
    fp.close()
    fp = gzip.open('data/dictbase/word_pos_max.txt.gz')
    self.word_pos_max = json.load(fp)
    fp.close()
    fp = gzip.open('data/dictbase/word_trans.txt.gz')
    self.word_tran = json.load(fp)
    fp.close()
def demo(config):
    with open(config.word_emb_file, "r") as fh:
        word_mat = np.array(json.load(fh), dtype=np.float32)
    with open(config.char_emb_file, "r") as fh:
        char_mat = np.array(json.load(fh), dtype=np.float32)
    with open(config.test_meta, "r") as fh:
        meta = json.load(fh)

    model = Model(config, None, word_mat, char_mat, trainable=False, demo=True)
    demo = Demo(model, config)
def update_in_resources(alias, updates):
    if not isinstance(updates, dict):
        return ("Updates need to be specified as key:value pairs in a "
                "dictionary. Process Aborted.")
    keys = list(updates.keys())
    values = list(updates.values())
    if not set(keys).issubset({'alias', 'tag', 'title'}):
        return '''The updates' dictionary does not have the right keys; they must all be in ['alias','tag','title'].
Note: Do not include 'timestamp' when doing updates. Process Aborted'''
    if len(keys) != len(values):
        return "Number of Keys and Values do not match. Process Aborted."

    def helper(movie, keys, values):
        for k in range(len(keys)):
            movie[keys[k]] = values[k]
        movie['timestamp'] = datetime.datetime.now()
        return movie

    if 'resources.json' not in os.listdir('.'):
        return ("The file 'resources.json' is not in the current working "
                "directory. Process Aborted.")
    with open('resources.json') as json_file:
        resource = ujson.load(json_file)
    if not is_in_resources(resource, alias):
        return ("Movie with alias '%s' is not in resource file. "
                "Movie must be added first." % alias)
    movie = list(filter(lambda movie: movie['alias'] == alias, resource['movies']))
    if len(movie) != 1:
        return "That's weird...multiple matches for alias given. Process Aborted."
    updated = helper(movie[0], keys, values)
    del movie
    if 'alias' not in updated.keys():
        return "Update has no 'alias' key. Process Aborted."
    if 'tag' not in updated.keys():
        return "Update has no 'tag' key. Process Aborted."
    if 'title' not in updated.keys():
        return "Update has no 'title' key. Process Aborted."
    if 'timestamp' not in updated.keys():
        return "Update has no 'timestamp' key. Process Aborted."
    deleted = delete(alias)
    if deleted is not True:
        return deleted
    del deleted
    with open('resources.json') as json_file:
        resource = ujson.load(json_file)
    resource['movies'].append(updated)
    resource['logs'].append({
        'timestamp': datetime.datetime.now(),
        'type': 'post',
        'message': " '%s' with alias '%s' and tag '%s' was successfully added as an update."
                   % (updated['title'], updated['alias'], updated['tag'])
    })
    with open('resources.json', 'w') as outfile:
        ujson.dump(resource, outfile)
    return (" '%s' with alias '%s' and tag '%s' was successfully added as an update."
            % (updated['title'], updated['alias'], updated['tag']))
def main(unused_argv):
    task = json.load(sys.stdin)
    json_path = os.path.join(
        os.path.dirname(__file__),
        '..', '..', 'solutions', 'state-of-the-art.json')
    with open(json_path) as f:
        solutions = json.load(f)
    for solution in solutions:
        if (solution['problemId'] == task['id'] and
                solution['seed'] == task['sourceSeeds'][0]):
            json.dump([solution], sys.stdout)
            sys.stdout.write('\n')
def update_unesco_regions(self):
    """
    This code will create/update unesco regions and update the
    country -> region mapping
    """
    import os
    import ujson
    from geodata.models import Region
    from iati.models import RegionVocabulary

    base = os.path.dirname(os.path.abspath(__file__))

    location = base + '/data_backup/unesco_regions.json'
    json_data = open(location)
    unesco_regions = ujson.load(json_data)
    json_data.close()

    location_map = base + '/data_backup/unesco_country_region_mapping.json'
    json_data_map = open(location_map)
    unesco_mapping = ujson.load(json_data_map)
    json_data_map.close()

    # save regions and put in list
    regions = []
    region_vocabulary = RegionVocabulary.objects.get_or_create(
        code=999,
        name='UNESCO')[0]

    for region_id, info in unesco_regions.items():
        center_location_string = 'POINT(' + info['longitude'] + ' ' + info['latitude'] + ')'
        center_location = fromstr(
            center_location_string,
            srid=4326)
        region = Region.objects.get_or_create(
            code=region_id,
            defaults={
                'name': info['name'],
                'region_vocabulary': region_vocabulary,
                'parental_region': None,
                'center_longlat': center_location})[0]
        regions.append(region)

    # save country -> region mapping
    for line in unesco_mapping:
        region_id = line["UNESCO Region Code"]
        country_id = line["Country ID"]
        country = Country.objects.get(code=country_id)
        for region in regions:
            if region.code == region_id:
                country.unesco_region = region
        country.save()
def load_tfidf(vocab_path, idf_weights_path):
    """Loads tfidf vectorizer from its components.

    :param str vocab_path: path to the vectorizer vocabulary JSON.
    :param str idf_weights_path: path to idf weights JSON.
    :rtype: sklearn.feature_extraction.text.TfidfVectorizer
    """
    tfidf = TfidfVectorizer(analyzer=lambda x: x,
                            vocabulary=json.load(open(vocab_path)))
    idf_vector = np.array(json.load(open(idf_weights_path)))
    tfidf._tfidf._idf_diag = scipy.sparse.diags([idf_vector], [0])
    tfidf.vocabulary_ = tfidf.vocabulary
    return tfidf
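# A usage sketch, assuming the two JSON files were saved from a fitted
# vectorizer; since the analyzer is the identity, inputs must already be
# tokenized lists (file names are illustrative):
tfidf = load_tfidf("vocab.json", "idf_weights.json")
docs = [["hello", "world"], ["hello", "tfidf"]]
X = tfidf.transform(docs)  # sparse matrix, one row per document
print(X.shape)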
def insert_classes(cursor):
    """
    Fetch and insert the classes from classes.json

    :param cursor:
    :return:
    """
    ranks = dict()
    with open(RANKS_PATH, encoding='UTF-8') as ranks_file:
        ranks_dict = ujson.load(ranks_file)
    for rank, ranked_archetypes in ranks_dict.items():
        try:
            rank = int(rank.strip("Rank"))
        except ValueError:
            rank = MAX_RANK
        for ranked_classes in ranked_archetypes.values():
            for ranked_class in ranked_classes:
                ranks[ranked_class] = rank

    with open(CLASSES_PATH, encoding='UTF-8') as classes_file:
        classes_dict = ujson.load(classes_file)
    classes = list()

    # Get list of sorted classes
    sorted_classes_ids = list()
    for class_id in classes_dict.keys():
        if '_' in class_id:
            split_class_id = class_id.split("_", 1)
            sorted_classes_ids.append((class_id,
                                       int(split_class_id[0].strip("Char")),
                                       int(split_class_id[-1])))
        else:
            sorted_classes_ids.append((class_id, 0, 0))
    sorted_classes_ids.sort(key=lambda tup: tup[2])
    sorted_classes_ids.sort(key=lambda tup: tup[1])

    # Start processing them
    for class_id, archetype, char_n in sorted_classes_ids:
        _class = classes_dict[class_id]
        class_info = list()
        # Get Class Name
        class_info.append(get_value(_class, "Class", "name", str))
        # Get Class Archetype
        class_info.append(get_archetype_id(get_value(_class, "Class", "base", str)))
        # Get Rank
        class_info.append(ranks.get(class_id, 0))
        # Get Icon
        class_info.append(format_icon(get_value(_class, "Class", "icon", str)))
        # Get Temp ID
        class_info.append(class_id)
        classes.append(tuple(class_info))

    classes = tuple(classes)
    cursor.executemany(
        "INSERT INTO classes (name, archetype, rank, icon, temp_id) VALUES (?, ?, ?, ?, ?)",
        classes)
def load_lex_counts():
    global w_dict
    liblogger.info("load word dict")
    cache_file = lex_count_file + ".cache"
    if using_cache and os.path.exists(cache_file):
        w_dict = json.load(open(cache_file))
        return
    lex_counts = json.load(open(lex_count_file))
    w_sum = float(sum(lex_counts.values()))
    w_dict = dict()
    liblogger.info("norm word dict for P(x)")
    for w in lex_counts:
        w_dict[w] = math.log(lex_counts[w] / w_sum)
    json.dump(w_dict, open(cache_file, "w"))
def get_database_connection():
    s3 = boto3.resource('s3')
    metasrcs = ujson.load(
        s3.Object('net-mozaws-prod-us-west-2-pipeline-metadata',
                  'sources.json').get()['Body'])
    creds = ujson.load(
        s3.Object('net-mozaws-prod-us-west-2-pipeline-metadata',
                  '%s/write/credentials.json' % (
                      metasrcs['distribution-viewer-db']['metadata_prefix'],
                  )).get()['Body'])
    conn = psycopg2.connect(host=creds['host'], port=creds['port'],
                            user=creds['username'], password=creds['password'],
                            dbname=creds['db_name'])
    return conn, conn.cursor()
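# A hedged consumption example; the caller owns cleanup since the helper
# returns both handles, and the query here is a made-up smoke test:
conn, cursor = get_database_connection()
try:
    cursor.execute("SELECT 1")
    print(cursor.fetchone())
finally:
    cursor.close()
    conn.close()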
def scan_trace(dir, channels, sample_cb):
    manifest_fn = path.join(dir, 'manifest.json')
    with open(manifest_fn, 'rb') as f:
        manifest = ujson.load(f)
    if 0:
        print manifest
    begin_ts = manifest['beginTs']
    end_ts = manifest['endTs']
    print 'range', begin_ts, end_ts
    timeseq_infos = manifest['timeseqInfos']

    cur_samples = {}
    # There's some thought to saving memory here. Only load a chunk (of all timeseqs) at a time
    for chunk in get_chunks(begin_ts, end_ts):
        all_chunk_data = {}
        all_chunk_indexes = {}
        for tsi in timeseq_infos:
            ts_name = tsi['name']
            if ('*' not in channels) and (ts_name not in channels):
                continue
            chunk_fn = path.join(dir, 'chunk_%s_%d.json.gz' % (ts_name, chunk))
            try:
                with gzip.open(chunk_fn, 'rb') as f:
                    print 'Reading', chunk_fn, '...'
                    all_chunk_data[ts_name] = ujson.load(f)
                    all_chunk_indexes[ts_name] = 0
            except:
                print chunk_fn, 'not found'
                if ts_name in all_chunk_data:
                    del all_chunk_data[ts_name]
                    del all_chunk_indexes[ts_name]

        while True:
            # Find the next item to change (smallest timestamp)
            cur_ts = end_ts + 1
            for ts_name in all_chunk_data:
                if all_chunk_indexes[ts_name] < len(all_chunk_data[ts_name]['times']):
                    ts1 = all_chunk_data[ts_name]['times'][all_chunk_indexes[ts_name]]
                    cur_ts = min(cur_ts, ts1)
            if cur_ts > end_ts:
                # Didn't find anything
                break
            # Now update cur_samples with all samples with matching timestamps
            for ts_name in all_chunk_data:
                if all_chunk_indexes[ts_name] < len(all_chunk_data[ts_name]['times']):
                    ts1 = all_chunk_data[ts_name]['times'][all_chunk_indexes[ts_name]]
                    if ts1 == cur_ts:
                        cur_samples[ts_name] = all_chunk_data[ts_name]['samples'][all_chunk_indexes[ts_name]]
                        all_chunk_indexes[ts_name] += 1
            # We copy so that customers can keep a copy that works after we mutate cur_samples again
            # scan_trace_deltat does this
            sample_cb(cur_ts, copy.copy(cur_samples))
def initialize(port, suffix):
    global PORT, FILTER_INFER, FILTER_APPLY, REPORT_REACTIONS, REACTION_CATEGORIES, REPORT_REACTIONS_SET
    PORT = port
    FILTER_INFER = json.load(open("../data/filter_infer_%s.json" % suffix))
    FILTER_APPLY = json.load(open("../data/filter_apply_%s.json" % suffix))
    REPORT_REACTIONS = json.load(open("../data/filter_report_%s.json" % suffix))
    REPORT_REACTIONS = [(x, y) for x, y in REPORT_REACTIONS if isinstance(y, list)]
    REACTION_CATEGORIES = defaultdict(list)
    REPORT_REACTIONS_SET = {}
    all_rxn_ids = set.union(*[set(y) for x, y in REPORT_REACTIONS])
    for category, reactions in REPORT_REACTIONS:
        REPORT_REACTIONS_SET[category] = set(reactions)
        REPORT_REACTIONS_SET["^%s" % category] = all_rxn_ids.difference(REPORT_REACTIONS_SET[category])
        for reaction in reactions:
            REACTION_CATEGORIES[long(reaction)].append(category)
def loadVariables(self, infile, test):
    '''semi-stable variables which are not project specific'''
    self.logFilename = u'¤WLMStats.log'
    self.heritage_siteurl = 'https://tools.wmflabs.org/heritage/api'
    self.commons_siteurl = 'https://commons.wikimedia.org'
    self.gcmlimit = 250  # Images to process per API request in ImageInfo
    self.output = "output/"
    self.settings_file = infile
    self._test_gcmlimit = 5
    self._test_limit = 15

    # distinguish testdata
    if test:
        self.output += u'test.'
        self.settings_file = u'indata/settings.test.json'

    # load settings file
    requiredKeys = ['types', 'cats', 'date', 'identifier']  # keys which are explicitly called later
    try:
        f = codecs.open(self.settings_file, 'r', 'utf-8')
        self.settings = ujson.load(f)
        f.close()
        if not set(requiredKeys).issubset(set(self.settings.keys())):
            raise KeyError("missing one of the required keys!: %s" % ', '.join(requiredKeys))
    except IOError, e:
        print u'Error opening settings file: %s' % e
        exit(1)
def get_best(args):
    with open(os.path.join(args.path, 'config.json')) as f:
        save_every = json.load(f)['save_every']
    with open(os.path.join(args.path, 'process_0.log')) as f:
        lines = f.readlines()
    best_score = 0
    best_it = 0
    deca_scores = {}
    for l in lines:
        if 'val' in l:
            try:
                task = l.split('val_')[1].split(':')[0]
            except Exception as e:
                print(e)
                continue
            it = int(l.split('iteration_')[1].split(':')[0])
            metric = args.task_to_metric[task]
            score = float(l.split(metric + '_')[1].split(':')[0])
            if it in deca_scores:
                deca_scores[it]['deca'] += score
                deca_scores[it][metric] = score
            else:
                deca_scores[it] = {'deca': score, metric: score}
            if deca_scores[it]['deca'] > best_score:
                best_score = deca_scores[it]['deca']
                best_it = it
    print(best_it)
    print(best_score)
    return os.path.join(args.path, f'iteration_{int(best_it)}.pth')
def build_most_improved():
    # We want to make most improved ready for that table in charts
    # and not have to run around to fetch it.
    global simple_analysis
    analysis_improved = simple_analysis["improved"] = []

    with open("jsondb/schools/districts.json") as f:
        districts = ujson.load(f)

    for id, distance in mostimproved:
        real_id = id.split("-")[0]
        school_meta = school_metas[real_id]
        school_grade = school_grades[id]
        analysis_improved.append((
            id,
            school_meta["name"].title(),
            districts[school_meta["district"]] if school_meta["district"] else None,
            school_meta["city"].title(),
            school_meta.get("enrollment", {}).get("2010", {}).get("total"),
            school_meta.get("enrollment", {}).get("2012", {}).get("total"),
            school_grade.get("2010", {}).get("rank"),
            school_grade.get("2012", {}).get("rank"),
            distance,
        ))
def __init__(self, input='content', charset='utf-8', charset_error='strict',
             strip_accents=None, vocabulary=None, normalize=True, dtype=float):
    self.input = input
    self.charset = charset
    self.charset_error = charset_error
    self.strip_accents = strip_accents
    if vocabulary is not None:
        self.fixed_vocabulary = True
        if not isinstance(vocabulary, Mapping):
            vocabulary = dict((t, i) for i, t in enumerate(vocabulary))
        self.vocabulary_ = vocabulary
    else:
        self.fixed_vocabulary = False
    try:
        self.poscache = json.load(open(poscache_filename, "r"))
    except IOError:
        self.poscache = {}
    self.normalize = normalize
    self.dtype = dtype
def get_ap_file(self, path):
    """
    Get raw data file.
    """
    with open(path, 'r') as readfile:
        data = json.load(readfile)
    return data['trendtable']
def update_unesco_sectors(self):
    base = os.path.dirname(os.path.abspath(__file__))
    location = base + "/data_backup/unesco_sectors.json"
    json_data = open(location)
    unesco_sectors = ujson.load(json_data)

    for cr in unesco_sectors:
        try:
            code = int(cr)
            name = unesco_sectors[cr]['name']
            if Sector.objects.filter(code=code).exists():
                the_sector = Sector.objects.get(code=code)
                the_sector.name = name
            else:
                the_sector = Sector(code=code, name=name)
            the_sector.save()
        except Exception as e:
            print "error in update_unesco_sectors: " + str(type(e))
            print e.args
            return False

    json_data.close()
    return True
def main(args):
    if args.debug:
        logging.basicConfig(level=logging.DEBUG)

    measurements = []
    for filename in glob.iglob(
            os.path.join(args.measurements_dir, args.domain, "*", "*")):
        with open(filename) as file:
            measurements.append(json.load(file))

    as_repo = sas.create_default_as_repo()
    classifier = DnsResolutionClassifier()
    control_resolutions = get_control_resolutions(measurements)
    for resolution in control_resolutions:
        classifier.add_good_resolution(resolution)

    print("\nCONTROL")
    for resolution, count in count_resolutions(
            control_resolutions).most_common():
        print("%s -> %s: %d" % (resolution[0], resolution[1], count))

    dns_resolutions = get_dns_results(as_repo, measurements)
    show_resolutions_graph(as_repo, args.domain, control_resolutions,
                           dns_resolutions)

    print("\nTESTS")
    classified_resolutions = zip(
        dns_resolutions, classifier.classify_resolutions(dns_resolutions))
    for country_code, country_classifications in group_by(
            classified_resolutions, lambda e: e[0].country).items():
        try:
            country_name = iso3166.countries.get(country_code).name
        except KeyError:
            country_name = "Unknown"
        print("\n=============\n= %s (%s)\n=============" %
              (country_name, country_code))

        country_count = len(country_classifications)
        grouped_country_classifications = group_by(country_classifications,
                                                   lambda e: e[1])
        for classification, entries in grouped_country_classifications.items():
            class_count = len(entries)
            prefix = "All " if class_count == country_count else ""
            print("  %s%s: %d/%d" % (prefix, classification.name.lower(),
                                     class_count, country_count))

        # if len(grouped_country_classifications[DnsResolutionClassification.FREE]) == country_count:
        #     continue

        print("\n  By Resolver:")
        for resolver_key, resolver_classifications in group_by(
                country_classifications,
                lambda e: make_resolver_key(as_repo, e[0])).items():
            print("  - %s:" % resolver_key)
            resolver_count = len(resolver_classifications)
            for classification, entries in group_by(resolver_classifications,
                                                    lambda e: e[1]).items():
                class_count = len(entries)
                prefix = "All " if class_count == resolver_count else ""
                print("    %s%s: %d/%d" % (prefix, classification.name.lower(),
                                           class_count, resolver_count))

        for classification, entries in grouped_country_classifications.items():
            if classification == DnsResolutionClassification.EMPTY or not entries:
                continue
            print("\n  %s resolutions:" % classification.name)
            displayed = set()
            for resolution, _ in entries:
                display_str = ",\n       ".join([
                    "%s (%s)" % (resolve_ip(ip) or ip,
                                 as_str(as_repo.get_as_for_ip(ip)))
                    for ip in sorted(resolution.ips)
                ])
                if display_str in displayed:
                    continue
                print("  - [%s] %s\n    => %s" %
                      (display_str, resolution.url.geturl(),
                       path_get(resolution.measurement,
                                ["test_keys", "requests", "failure"])))
                displayed.add(display_str)
# Touch calibration has never been done
if TOUCH_CALI_FILE not in uos.listdir():
    touch = xpt2046(
        cs=TOUCH_CS,
        transpose=TFT_IS_PORTRAIT,
    )
    from touch_cali import TouchCali
    touch_cali = TouchCali(touch, TOUCH_CALI_FILE)
    touch_cali.start()
# Calibration was done before; load the touch parameters from file
else:
    with open(TOUCH_CALI_FILE, 'r') as f:
        param = ujson.load(f)
        touch_x0 = param['cal_x0']
        touch_x1 = param['cal_x1']
        touch_y0 = param['cal_y0']
        touch_y1 = param['cal_y1']
        touch = xpt2046(
            cs=TOUCH_CS,
            transpose=TFT_IS_PORTRAIT,
            cal_x0=touch_x0,
            cal_x1=touch_x1,
            cal_y0=touch_y0,
            cal_y1=touch_y1,
        )
    TOUCH_READY = 1  # touch parameters are now configured
sv = Service('gacha')
jewel_limit = DailyNumberLimiter(6000)
tenjo_limit = DailyNumberLimiter(1)

GACHA_DISABLE_NOTICE = '本群转蛋功能已禁用\n如欲开启,请与维护组联系'
JEWEL_EXCEED_NOTICE = f'您今天已经抽过{jewel_limit.max}钻了,欢迎明早5点后再来!'
TENJO_EXCEED_NOTICE = f'您今天已经抽过{tenjo_limit.max}张天井券了,欢迎明早5点后再来!'
SWITCH_POOL_TIP = 'β>发送"选择卡池"可切换'
POOL = ('MIX', 'JP', 'TW', 'BL')
DEFAULT_POOL = POOL[0]

_pool_config_file = os.path.expanduser('~/.hoshino/group_pool_config.json')
_group_pool = {}
try:
    with open(_pool_config_file, encoding='utf8') as f:
        _group_pool = json.load(f)
except FileNotFoundError as e:
    sv.logger.warning(
        'group_pool_config.json not found, will create when needed.')
_group_pool = defaultdict(lambda: DEFAULT_POOL, _group_pool)


def dump_pool_config():
    with open(_pool_config_file, 'w', encoding='utf8') as f:
        json.dump(_group_pool, f, ensure_ascii=False)


gacha_10_aliases = ('抽十连', '十连', '十连!', '十连抽', '来个十连', '来发十连', '来次十连',
                    '抽个十连', '抽发十连', '抽次十连', '十连扭蛋', '扭蛋十连',
                    '10连', '10连!', '10连抽', '来个10连', '来发10连', '来次10连',
                    '抽个10连', '抽发10连', '抽次10连', '10连扭蛋', '扭蛋10连',
                    '十連', '十連!', '十連抽', '來個十連', '來發十連',
def __init__(self):
    if not os.path.exists(PATH):
        with open(PATH, "w") as f_x:
            ujson.dump({}, f_x)
    with open(PATH) as yt_db:
        self.db = ujson.load(yt_db)
    for recall_index, recall_item in enumerate(recall_list):
        data = [recall_item, 100 - recall_index]
        temp.append(data)
    temp_json = ujson.dumps(temp_dict)
    # break
    client.hset('GraphEm_{0}'.format(clk_cid), 'ge_main_cold', temp_json)
    client.expire('GraphEm_{0}'.format(clk_cid), 3600 * 48)


if __name__ == '__main__':
    # load the user click-history dict
    load_start = time.time()
    with open('../data/get_user_click_history/click_his/click_his.json', 'r') as f:
        user_clk_his = ujson.load(f)
    print 'loaded click-history dict in', time.time() - load_start

    # load the similar-item dict
    load_start = time.time()
    with open('../data/get_similar_item/cold_sim/cold_sim_nid', 'r') as f:
        sim_item_ge = ujson.load(f)
    print 'loaded similar-item dict in', time.time() - load_start

    print 'total number of users', len(user_clk_his)
    t2 = time.time()
    pool = Pool(30)
    # iterate over all users
    for cid_index, clk_cid in enumerate(user_clk_his):
                int(color_rgb[0] * 255),
                int(color_rgb[2] * 255))
        else:
            leds[index * 7 + seg] = (0, 0, 0)


pin = Pin(5, Pin.OUT)
np = NeoPixel(pin, 28)
np.fill((0, 0, 0, 0))
np.write()

adc = ADC(0)

f = open('config.json')
config = ujson.load(f)
f.close()

if "blynk_server" in config and "blynk_port" in config:
    blynk = blynklib.Blynk(config["blynk_key"],
                           server=config["blynk_server"],
                           port=int(config["blynk_port"]),
                           log=print)
else:
    blynk = blynklib.Blynk(config["blynk_key"], log=print)


@blynk.handle_event("connect")
def connect_handler():
    blynk.internal("rtc", "sync")
    print("sent rtc sync request to blynk server")
def experiment(model_name, df, indices, ratio, round, divide_fun, n_jobs,
               res_dir, compute_conf_score):
    '''
    Single round of the experiment for a determined ratio. The model is
    instantiated, trained, tested against a dataset, and results are stored.

    :param model_name: The name of the model
    :param df: pandas dataframe containing the feature vector for all the samples
    :param indices: store indices for four sets of packed_benign, unpacked_benign, packed_malicious, unpacked_malicious
    :param ratio: Tuples of ratio of benign / malicious packed
    :param round: counter of rounds
    :param divide_fun: function to divide the dataset, defined in config
    :rtype Dictionary
    '''
    id = '{}-{}-{}'.format(ratio[0], ratio[1], round)
    dprint('Entered experiment:', id)
    ratio_ben, ratio_mal = ratio
    # split between test, train
    (training_packed_benign, testing_packed_benign,
     training_unpacked_benign, testing_unpacked_benign,
     training_packed_malicious, testing_packed_malicious,
     training_unpacked_malicious, testing_unpacked_malicious) = divide_fun(
        indices, ratio_ben, ratio_mal, NUMPY_SEED + round)
    # training_packed_benign, ..., testing_unpacked_malicious = divide_fun(df, ratio_ben, ratio_mal, NUMPY_SEED+round)
    # dprint('dividing dataset')
    train_indices = (training_packed_malicious + training_packed_benign +
                     training_unpacked_malicious + training_unpacked_benign)
    test_indices = (testing_packed_malicious + testing_packed_benign +
                    testing_unpacked_malicious + testing_unpacked_benign)
    verify_test_train_separated(train_indices, test_indices)

    # It means, to scale up, we need to work only on good features.
    # We get them from the RF classifier.
    good_features = None
    if model_name == 'svc' or model_name == 'lsvm':
        with open('{}/features-{}-{}.json'.format(
                res_dir.replace(model_name, "rf"), ratio_ben, ratio_mal),
                'r') as f:
            rf_res = json.load(f)
            rf_features = rf_res['features']
            rf_weights = rf_res['weights']
        num = 10000
        good_features = [
            f for _, f in sorted(zip(rf_weights, rf_features), reverse=True)[:num]
        ]
        print("Only top {} features from RF considered for training SVM".format(num))
        # rf_weights = [w for w, _ in sorted(zip(rf_weights, rf_features), reverse=True)[:num]]
        df = df[good_features + [c for c in drop_columns if c in df.columns]]

    # df = normalize(model_name, df)
    x_train = df[df.index.isin(train_indices)]
    dprint('done with dividing')
    # labels are being malicious or benign
    y_train = np.asarray(x_train['malicious'].values)
    # remove labels related to packing and type of binary
    x_train = x_train.drop(columns=drop_columns, axis=1, errors='ignore')

    # train model on training set
    model = get_model(model_name, n_jobs)
    dprint('Doing training', id)
    dprint("training size: {}".format(len(x_train)))
    model.fit(x_train, y_train)

    # importance_result = None
    if round == 0:
        weights = get_features_importances(model_name, model)
        if weights is not None:
            importances = (json.dumps(list(x_train.columns)), json.dumps(weights))
            dprint('Got importances', id)
            with open('{}/features-{}-{}.json'.format(res_dir, ratio_ben, ratio_mal), 'w') as f:
                json.dump({"weights": weights,
                           "features": list(x_train.columns)}, f)
        else:
            importances = (json.dumps([]), json.dumps([]))
        joblib.dump(model, '{}/model-{}-{}.joblib'.format(res_dir, ratio_ben, ratio_mal))
    else:
        importances = None

    # temporarily store the size of the sets used
    stats = {
        'ratio_ben': ratio_ben * 100,
        'ratio_mal': ratio_mal * 100,
        'training_packed_malicious': len(training_packed_malicious),
        'training_unpacked_benign': len(training_unpacked_benign),
        'training_packed_benign': len(training_packed_benign),
        'training_unpacked_malicious': len(training_unpacked_malicious),
        'testing_unpacked_malicious': len(testing_unpacked_malicious),
        'testing_packed_benign': len(testing_packed_benign),
        'testing_unpacked_benign': len(testing_unpacked_benign),
        'testing_packed_malicious': len(testing_packed_malicious)
    }
    dprint(stats)

    # evaluating on a dataset with same ratio as training dataset
    # print("evaluating on the test dataset with the same ratio as the training dataset")
    packed_test = df[df.index.isin(testing_packed_benign + testing_packed_malicious)]
    unpacked_test = df[df.index.isin(testing_unpacked_benign + testing_unpacked_malicious)]
    test = (packed_test, unpacked_test)
    if round == 0 and compute_conf_score:
        results, conf = evaluate(model_name, model, test, stats, do_conf_score=True)
    else:
        results, conf = evaluate(model_name, model, test, stats, do_conf_score=False)
    dprint('Done evaluating, returning:', id)
    return {
        'results': results,
        'confidence': conf,
        'importances': importances,
        'model': model
    }
def test_sber_onfly(config):
    print('Loading emb matrices')
    with open(config.word_emb_file, "r") as fh:
        word_mat = np.array(json.load(fh), dtype=np.float32)
    with open(config.char_emb_file, "r") as fh:
        char_mat = np.array(json.load(fh), dtype=np.float32)
    with open(config.bpe_emb_file, "r") as fh:
        bpe_mat = np.array(json.load(fh), dtype=np.float32)
    with open(config.pos_emb_file, "r") as fh:
        pos_mat = np.array(json.load(fh), dtype=np.float32)

    if config.use_bpe and config.use_bpe_pretrained_codes:
        bpe_model = BPE(open(config.bpe_pretrained_codes_file, 'r'))
    elif config.use_bpe and not config.use_bpe_pretrained_codes:
        bpe_model = BPE(open(config.bpe_codes_file, 'r'))
    else:
        bpe_model = None

    word2idx_dict = pickle.load(open(config.word2idx_dict_file, 'rb'))
    char2idx_dict = pickle.load(open(config.char2idx_dict_file, 'rb'))
    bpe2idx_dict = pickle.load(open(config.bpe2idx_dict_file, 'rb'))
    pos2idx_dict = pickle.load(open(config.pos2idx_dict_file, 'rb'))

    print("Loading model...")
    model = Model(config, None, word_mat, char_mat, bpe_mat, pos_mat,
                  trainable=False, use_tfdata=False)

    sess_config = tf.ConfigProto(allow_soft_placement=True)
    sess_config.gpu_options.allow_growth = True

    sess = tf.Session(config=sess_config)
    sess.run(tf.global_variables_initializer())
    saver = tf.train.Saver()
    if config.model_name == 'latest':
        checkpoint = tf.train.latest_checkpoint(config.save_dir)
    else:
        checkpoint = os.path.join(config.save_dir, config.model_name)
    print('Restoring from: {}'.format(checkpoint))
    saver.restore(sess, checkpoint)
    sess.run(tf.assign(model.is_train, tf.constant(False, dtype=tf.bool)))

    for datafile, datatype in zip(
            [config.sber_public_file, config.sber_private_file],
            ['public', 'private']):
        datafile_squad = os.path.join(config.target_dir, "{}.json_squad".format(datatype))
        sber2squad(datafile, outfile=datafile_squad)
        data_examples, data_eval = process_file(
            config, datafile_squad, datatype,
            remove_unicode=config.remove_unicode,
            bpe_model=bpe_model, is_test=True)
        data_features, data_meta = build_features_notfdata(
            config, data_examples, datatype,
            word2idx_dict, char2idx_dict, bpe2idx_dict, pos2idx_dict,
            is_test=True)
        total = data_meta["total"]

        answer_dict = {}
        remapped_dict = {}
        print(len(data_features))
        # hotfix: pad data_features up to a multiple of config.batch_size
        while len(data_features) % config.batch_size != 0:
            data_features.append(data_features[-1])
        print(len(data_features))
        for step in tqdm(range(total // config.batch_size + 1)):

            def get_batch():
                batch_items = data_features[step * config.batch_size:
                                            (step + 1) * config.batch_size]
                batch = dict()
                for key in batch_items[0].keys():
                    batch[key] = np.stack([el[key] for el in batch_items])
                return batch

            batch = get_batch()
            qa_id, loss, yp1, yp2 = sess.run(
                [model.qa_id, model.loss, model.yp1, model.yp2],
                feed_dict={
                    model.c_ph: batch['context_idxs'],
                    model.q_ph: batch['ques_idxs'],
                    model.ch_ph: batch['context_char_idxs'],
                    model.qh_ph: batch['ques_char_idxs'],
                    model.cb_ph: batch['context_bpe_idxs'],
                    model.qb_ph: batch['ques_bpe_idxs'],
                    model.cp_ph: batch['context_pos_idxs'],
                    model.qp_ph: batch['ques_pos_idxs'],
                    model.y1_ph: batch['y1'],
                    model.y2_ph: batch['y2'],
                    model.qa_id: batch['id'],
                })
            answer_dict_, remapped_dict_ = convert_tokens(
                data_eval, qa_id.tolist(), yp1.tolist(), yp2.tolist())
            answer_dict.update(answer_dict_)
            remapped_dict.update(remapped_dict_)

        path_to_save_answer = os.path.join(config.answer_dir,
                                           '{}.json_squad_ans'.format(datatype))
        with open(path_to_save_answer, "w") as fh:
            json.dump(remapped_dict, fh)
        sber_ans = '.'.join(path_to_save_answer.split('.')[0:-1]) + '.json_ans'
        squad_answer2sber(datafile, path_to_save_answer, outfile=sber_ans)
        print("Answer dumped: {}".format(path_to_save_answer))

    # evaluating
    # TODO: CHANGE TO ENG URL
    url = 'http://api.aibotbench.com/rusquad/qas'
    headers = {'Content-Type': 'application/json', 'Accept': 'text/plain'}
    metrics = dict()
    f1, em = 0.0, 0.0
    for datatype in ['public', 'private']:
        sber_ans = open(os.path.join(config.answer_dir,
                                     '{}.json_ans'.format(datatype)), 'r').readline()
        res = requests.post(url, data=sber_ans, headers=headers)
        metrics[datatype] = eval(json.loads(res.text))
        f1 += metrics[datatype]['f1']
        em += metrics[datatype]['exact_match']
        print('{}: EM: {:.5f} F-1: {:.5f}'.format(
            datatype, metrics[datatype]['exact_match'], metrics[datatype]['f1']))
    print('EM avg: {:.5f} F-1 avg: {:.5f}'.format(em / 2, f1 / 2))
def run_lighthouse_test(self, task):
    """Run a lighthouse test against the current browser session"""
    task['lighthouse_log'] = ''
    if 'url' in self.job and self.job['url'] is not None:
        self.job['shaper'].configure(self.job)
        output_path = os.path.join(task['dir'], 'lighthouse.json')
        json_file = os.path.join(task['dir'], 'lighthouse.report.json')
        json_gzip = os.path.join(task['dir'], 'lighthouse.json.gz')
        html_file = os.path.join(task['dir'], 'lighthouse.report.html')
        html_gzip = os.path.join(task['dir'], 'lighthouse.html.gz')
        time_limit = min(int(task['time_limit']), 80)
        command = ['lighthouse',
                   '--disable-network-throttling',
                   '--disable-cpu-throttling',
                   '--enable-error-reporting',
                   '--max-wait-for-load', str(int(time_limit * 1000)),
                   '--port', str(task['port']),
                   '--output', 'html',
                   '--output', 'json',
                   '--output-path', '"{0}"'.format(output_path)]
        if self.job['keep_lighthouse_trace']:
            command.append('--save-assets')
        if self.options.android or 'mobile' not in self.job or not self.job['mobile']:
            command.append('--disable-device-emulation')
        command.append('"{0}"'.format(self.job['url']))
        cmd = ' '.join(command)
        self.lighthouse_command = cmd
        # Give lighthouse up to 10 minutes to run all of the audits
        try:
            lh_thread = threading.Thread(target=self.lighthouse_thread)
            lh_thread.start()
            lh_thread.join(600)
        except Exception:
            pass
        from .os_util import kill_all
        kill_all('node', True)
        self.job['shaper'].reset()
        # Rename and compress the trace file, delete the other assets
        if self.job['keep_lighthouse_trace']:
            try:
                lh_trace_src = os.path.join(task['dir'], 'lighthouse-0.trace.json')
                if os.path.isfile(lh_trace_src):
                    # read the JSON in and re-write it line by line to match the other traces
                    with open(lh_trace_src, 'rb') as f_in:
                        trace = json.load(f_in)
                        if trace is not None and 'traceEvents' in trace:
                            lighthouse_trace = os.path.join(task['dir'], 'lighthouse_trace.json.gz')
                            with gzip.open(lighthouse_trace, 'wb', 7) as f_out:
                                f_out.write('{"traceEvents":[{}')
                                for trace_event in trace['traceEvents']:
                                    f_out.write(",\n")
                                    f_out.write(json.dumps(trace_event))
                                f_out.write("\n]}")
            except Exception:
                pass
        # Delete all the left-over lighthouse assets
        files = glob.glob(os.path.join(task['dir'], 'lighthouse-*'))
        for file_path in files:
            try:
                os.remove(file_path)
            except Exception:
                pass
        if os.path.isfile(json_file):
            # Remove the raw screenshots if they were stored with the file
            lh_report = None
            with open(json_file, 'rb') as f_in:
                lh_report = json.load(f_in)
            if lh_report is not None and 'audits' in lh_report and \
                    'screenshots' in lh_report['audits']:
                del lh_report['audits']['screenshots']
                with gzip.open(json_gzip, 'wb', 7) as f_out:
                    json.dump(lh_report, f_out)
            else:
                with open(json_file, 'rb') as f_in:
                    with gzip.open(json_gzip, 'wb', 7) as f_out:
                        shutil.copyfileobj(f_in, f_out)
            try:
                os.remove(json_file)
            except Exception:
                pass
        if os.path.isfile(html_file):
            # Remove the raw screenshots if they were stored with the file
            with open(html_file, 'rb') as f_in:
                lh_report = f_in.read()
            start = lh_report.find('\n    "screenshots')
            if start >= 0:
                end = lh_report.find('\n    },', start)
                if end >= 0:
                    lh_report = lh_report[:start] + lh_report[end + 7:]
            with gzip.open(html_gzip, 'wb', 7) as f_out:
                f_out.write(lh_report)
            try:
                os.remove(html_file)
            except Exception:
                pass
def main():
    gl_start = time.time()
    multiprocessing.set_start_method("spawn")
    args = get_arg_parser().parse_args()
    print(ujson.dumps(vars(args), indent=4))
    random.seed(args.seed)

    args.out_data_dir = os.path.join(args.out_dir, args.subfolder)
    if os.path.exists(args.out_data_dir):
        print(f"Removing {args.out_data_dir}")
        shutil.rmtree(args.out_data_dir)
    os.makedirs(args.out_data_dir)

    # Final step is to format data for the views in _magpie
    #==============================================
    # DUMP RESULTS
    #==============================================
    mention_dump_dir = os.path.join(
        args.out_dir,
        f"_saved_mention_extractor_{os.path.splitext(os.path.basename(args.alias2cands))[0]}"
    )
    print(f"Loading qid2title from {args.qid2title}")
    with open(args.qid2title) as in_f:
        qid2title = ujson.load(in_f)
    if not os.path.exists(mention_dump_dir) or args.overwrite:
        os.makedirs(mention_dump_dir, exist_ok=True)
        print(f"Building mention extractor for {mention_dump_dir}")
        mention_extractor = MentionExtractor(max_alias_len=5,
                                             max_candidates=27,
                                             alias2qids=args.alias2cands,
                                             qid2title=qid2title)
        mention_extractor.dump(mention_dump_dir)
    mention_extractor = MentionExtractor.load(mention_dump_dir)

    print(f"Loading qid2desc from {args.qid2desc}")
    with open(args.qid2desc) as in_f:
        qid2desc = ujson.load(in_f)

    # Loading up sentences
    print(f"Loading data from {args.data_dir}...")
    files = glob.glob(f"{args.data_dir}/*.jsonl")
    if len(files) <= 0:
        print(f"Didn't find any files at {args.data_dir}")
        return
    print(f"Found {len(files)} files")
    all_sentences = []
    for f in files:
        with open(f) as in_f:
            for line in in_f:
                doc = ujson.loads(line)
                for sent in doc["sentences"]:
                    sent["doc_qid"] = doc["qid"]
                    sent["doc_title"] = doc["title"]
                    new_aliases, new_spans, new_qids = [], [], []
                    for i in range(len(sent["aliases"])):
                        if (sent["label_type"][i] != "Pronoun"
                                and mention_extractor.does_alias_exist(sent["aliases"][i])):
                            new_aliases.append(sent["aliases"][i])
                            new_spans.append(sent["spans"][i])
                            new_qids.append(sent["qids"][i])
                    if len(new_aliases) > 0:
                        sent["aliases"] = new_aliases
                        sent["qids"] = new_qids
                        sent["spans"] = new_spans
                        all_sentences.append(sent)
    print(f"Extracted {len(all_sentences)} sentences")
    dump_data(args, mention_dump_dir, qid2title, qid2desc, all_sentences)
    print(
        f"Finished in {time.time()-gl_start}s. Data saved in {os.path.join(args.out_data_dir, '04_trials_gold.js')}"
    )
        #if (len(current_actor_role_synonyms) > 95):
        #    print(len(current_actor_role_synonyms))
        text_to_json = ujson.dumps({
            "value": current_actor_name,
            "synonyms": current_actor_role_synonyms
        })  # Changing the text into json
        actor_identities['items'].append(
            ujson.decode(text_to_json))  # Append the synonyms to the list
        pbar.update(progress_iterator + 1)  # Display incremented progress
        progress_iterator += 1  # Iterate the progress bar for next iteration
    pbar.finish()  # Once we've completed the scraping, end the progress bar.
    return actor_identities['items']


if __name__ == '__main__':
    with open('popular_people.json') as data_file:
        actor_json_data = ujson.load(data_file)  # Load actor data in
    formatted_json = format_json(actor_json_data)  # Where the majority of the magic happens
    wrapped_json = ujson.decode(
        "[{\"entries\":" + ujson.encode(formatted_json) + ", \"name\": \"actors\"}]"
    )  # Wrapping the JSON with dialogflow's preferred formatting
    write_json_to_disk(wrapped_json)
def read_room_data(data_dir: str) -> List[ZerverFieldsT]:
    fn = 'rooms.json'
    data_file = os.path.join(data_dir, fn)
    with open(data_file) as f:
        data = ujson.load(f)
    return data
def load_cand_map(entity_mapping_dir, alias_map_file):
    return ujson.load(open(os.path.join(entity_mapping_dir, alias_map_file)))
headers = {"Authorization": self.config["dbl_token"]} url = "https://top.gg/api/bots/%d/stats" % self.user.id async with self.session.post(url, json=payload, headers=headers) as resp: # nopep8 try: data = await resp.json() log.info("Recieved %s %s %d %s", resp.method, resp._url, resp.status, data) except (TypeError, ValueError): log.info("Recieved %s %s %d", resp.method, resp._url, resp.status) async def close(self): log.debug("close() got called, cleaning up tasks") try: await self.session.close() except (RuntimeError, AttributeError): pass await super().close() if __name__ == "__main__": with open("config.json") as file: configuration = json.load(file) botsaber = botsaber(config=configuration) if configuration["debug_mode"] is True: botsaber.run(configuration["dev_token"]) else: botsaber.run(configuration["bot_token"])
def load_title_map(entity_mapping_dir):
    return ujson.load(open(os.path.join(entity_mapping_dir, 'qid2title.json')))
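# Both loaders above leave their file handles to the garbage collector; a
# context-managed alternative (a sketch, not the project's actual helper):
def load_json_map(entity_mapping_dir, filename):
    # close the handle deterministically instead of relying on GC
    with open(os.path.join(entity_mapping_dir, filename)) as f:
        return ujson.load(f)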
def load(self, filename=None):
    """Load file

    Parameters
    ----------
    filename : str, optional
        File path
        Default value filename given to class constructor

    Raises
    ------
    ImportError:
        Error if file format specific module cannot be imported
    IOError:
        File does not exists or has unknown file format

    Returns
    -------
    self
    """
    if filename:
        self.filename = filename
        self.format = self.detect_file_format(self.filename)

    dict.clear(self)
    if self.exists():
        if self.format == 'yaml':
            try:
                import yaml
            except ImportError:
                message = '{name}: Unable to import YAML module.'.format(
                    name=self.__class__.__name__)
                self.logger.exception(message)
                raise ImportError(message)
            try:
                with open(self.filename, 'r') as infile:
                    dict.update(self, yaml.load(infile))
            except yaml.YAMLError as exc:
                self.logger.error("Error while parsing YAML file [%s]" % self.filename)
                if hasattr(exc, 'problem_mark'):
                    if exc.context is not None:
                        self.logger.error(str(exc.problem_mark) + '\n  ' +
                                          str(exc.problem) + ' ' + str(exc.context))
                        self.logger.error('  Please correct data and retry.')
                    else:
                        self.logger.error(str(exc.problem_mark) + '\n  ' +
                                          str(exc.problem))
                        self.logger.error('  Please correct data and retry.')
                else:
                    self.logger.error("Something went wrong while parsing yaml file [%s]" % self.filename)
                return
        elif self.format == 'cpickle':
            try:
                import cPickle as pickle
            except ImportError:
                try:
                    import pickle
                except ImportError:
                    message = '{name}: Unable to import pickle module.'.format(
                        name=self.__class__.__name__)
                    self.logger.exception(message)
                    raise ImportError(message)
            dict.update(self, pickle.load(open(self.filename, "rb")))
        elif self.format == 'marshal':
            try:
                import marshal
            except ImportError:
                message = '{name}: Unable to import marshal module.'.format(
                    name=self.__class__.__name__)
                self.logger.exception(message)
                raise ImportError(message)
            dict.update(self, marshal.load(open(self.filename, "rb")))
        elif self.format == 'msgpack':
            try:
                import msgpack
            except ImportError:
                message = '{name}: Unable to import msgpack module.'.format(
                    name=self.__class__.__name__)
                self.logger.exception(message)
                raise ImportError(message)
            dict.update(self, msgpack.load(open(self.filename, "rb")))
        elif self.format == 'json':
            try:
                import ujson as json
            except ImportError:
                try:
                    import json
                except ImportError:
                    message = '{name}: Unable to import json module.'.format(
                        name=self.__class__.__name__)
                    self.logger.exception(message)
                    raise ImportError(message)
            dict.update(self, json.load(open(self.filename, "r")))
        elif self.format == 'txt':
            with open(self.filename, 'r') as f:
                lines = f.readlines()
                dict.update(self, dict(zip(range(0, len(lines)), lines)))
        else:
            message = '{name}: Unknown format [{format}]'.format(
                name=self.__class__.__name__, format=self.filename)
            self.logger.exception(message)
            raise IOError(message)
    else:
        message = '{name}: File does not exists [{file}]'.format(
            name=self.__class__.__name__, file=self.filename)
        self.logger.exception(message)
        raise IOError(message)
    return self
async def ytdl_callback(c_q: CallbackQuery):
    choosen_btn = c_q.matches[0].group(1)
    data_key = c_q.matches[0].group(2)
    page = c_q.matches[0].group(3)
    if os.path.exists(PATH):
        with open(PATH) as f:
            view_data = ujson.load(f)
        search_data = view_data.get(data_key)
        total = len(search_data)
    else:
        return await c_q.answer(
            "Search data doesn't exist anymore, please perform search again ...",
            show_alert=True,
        )
    if choosen_btn == "back":
        index = int(page) - 1
        del_back = index == 1
        await c_q.answer()
        back_vid = search_data.get(str(index))
        await c_q.edit_message_media(
            media=(
                InputMediaPhoto(
                    media=back_vid.get("thumb"),
                    caption=back_vid.get("message"),
                )
            ),
            reply_markup=yt_search_btns(
                del_back=del_back,
                data_key=data_key,
                page=index,
                vid=back_vid.get("video_id"),
                total=total,
            ),
        )
    elif choosen_btn == "next":
        index = int(page) + 1
        if index > total:
            return await c_q.answer("That's All Folks !", show_alert=True)
        await c_q.answer()
        front_vid = search_data.get(str(index))
        await c_q.edit_message_media(
            media=(
                InputMediaPhoto(
                    media=front_vid.get("thumb"),
                    caption=front_vid.get("message"),
                )
            ),
            reply_markup=yt_search_btns(
                data_key=data_key,
                page=index,
                vid=front_vid.get("video_id"),
                total=total,
            ),
        )
    elif choosen_btn == "listall":
        await c_q.answer("View Changed to: 📜 List", show_alert=False)
        list_res = ""
        for vid_s in search_data:
            list_res += search_data.get(vid_s).get("list_view")
        telegraph = post_to_telegraph(
            a_title=f"Showing {total} youtube video results for the given query ...",
            content=list_res,
        )
        await c_q.edit_message_media(
            media=(
                InputMediaPhoto(
                    media=search_data.get("1").get("thumb"),
                )
            ),
            reply_markup=InlineKeyboardMarkup(
                [
                    [
                        InlineKeyboardButton(
                            "↗️ Click To Open",
                            url=telegraph,
                        )
                    ],
                    [
                        InlineKeyboardButton(
                            "📰 Detailed View",
                            callback_data=f"ytdl_detail_{data_key}_{page}",
                        )
                    ],
                ]
            ),
        )
    else:  # Detailed
        index = 1
        await c_q.answer("View Changed to: 📰 Detailed", show_alert=False)
        first = search_data.get(str(index))
        await c_q.edit_message_media(
            media=(
                InputMediaPhoto(
                    media=first.get("thumb"),
                    caption=first.get("message"),
                )
            ),
            reply_markup=yt_search_btns(
                del_back=True,
                data_key=data_key,
                page=index,
                vid=first.get("video_id"),
                total=total,
            ),
        )
def test_loadFile(self):
    f = six.StringIO("[1,2,3,4]")
    self.assertEqual([1, 2, 3, 4], ujson.load(f))
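# The file-like round trip also works in the writing direction; a small
# sketch pairing ujson.dump with ujson.load:
import io

buf = io.StringIO()
ujson.dump([1, 2, 3, 4], buf)
buf.seek(0)
assert ujson.load(buf) == [1, 2, 3, 4]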
import ujson as json
import sys

if len(sys.argv) < 2:
    print('Please input log file')
    exit()
log_dir = sys.argv[1]

with open('spk_info.json') as f:
    spk = json.load(f)
with open(log_dir) as f:
    score = f.readlines()

g_dict = {'FF': [0, 0], 'FM': [0, 0], 'MM': [0, 0]}
for i in range(3000):
    s1, s2 = score[2 * i].split('_')[0:3:2]
    s1 = spk[s1[:3]]
    s2 = spk[s2[:3]]
    sdr = sum(map(float, score[2 * i + 1][1:-2].split())) / 2
    if s1 == 'F' and s2 == 'F':
        g_dict['FF'][0] += sdr
        g_dict['FF'][1] += 1
    elif s1 == 'M' and s2 == 'M':
        g_dict['MM'][0] += sdr
        g_dict['MM'][1] += 1
    else:
        g_dict['FM'][0] += sdr
        g_dict['FM'][1] += 1
def load(filename):
    print(f'Opening {filename}')
    with open(filename, "r") as fh:
        return json.load(fh)
import re
import logging

import lavalink
import config
import ujson

log = logging.getLogger()
time_rx = re.compile('[0-9]+')

# Languages
languages = ["english", "weeb", "tsundere"]
lang = {}
for l in languages:
    with open("lang/%s.json" % l) as f:
        lang[l] = ujson.load(f)


def getlang(la: str):
    return lang.get(la, None)


class Audio:
    def __init__(self, bot):
        self.bot = bot
        if not hasattr(bot, 'lavalink'):
            lavalink.Client(bot=bot, host="0.0.0.0",
                            ws_port=3232,
                            password=config.lavalink['password'],
def process(json_file, output_dir, exclude_titles=None, include_titles=None):
    """
    :param json_file: original data in json format
    :param output_dir: the output directory of pre-processed data
    :param exclude_titles: article titles to exclude
    :param include_titles: article titles to include
    """
    para_file = "{}/paras".format(output_dir)
    question_file = "{}/questions".format(output_dir)
    sent_file = "{}/sents".format(output_dir)
    answer_file = "{}/answers".format(output_dir)
    print("Generating {} raw data...".format(json_file))
    max_sent, max_sent_len, max_que_len, max_ans_len = 0, 0, 0, 0
    with open(json_file, "r") as fh, corenlp.CoreNLPClient(
            annotators="tokenize ssplit pos ner".split(),
            endpoint="http://localhost:9099", timeout=50000) as client:
        source = json.load(fh)
        for article in tqdm(source["data"]):
            title = article["title"]
            if include_titles and title not in include_titles:
                continue
            if exclude_titles and title in exclude_titles:
                continue
            for para in article["paragraphs"]:
                paragraphs, questions, answers, sents, ids = [], [], [], [], []
                paragraphs_pos, questions_pos, answers_pos, sents_pos = [], [], [], []
                paragraphs_ner, questions_ner, answers_ner, sents_ner = [], [], [], []
                answers_index, sents_index = [], []

                # paragraph
                context = para["context"]
                if not context.strip():
                    continue
                ann_para = client.annotate(context)
                max_sent = max(max_sent, len(ann_para.sentence))
                max_sent_len = max(
                    max_sent_len,
                    max(map(lambda x: len(x.token), ann_para.sentence)))
                ann_para_tokens, paragraph_tokens, paragraph_pos, paragraph_ner = [], [], [], []
                for sent in ann_para.sentence:
                    for token in sent.token:
                        ann_para_tokens.append(token)
                        paragraph_tokens.append(token.word)
                        paragraph_pos.append(token.pos)
                        paragraph_ner.append(token.ner)

                # questions
                for qa in para["qas"]:
                    # question
                    ques = qa["question"]
                    qa_id = qa["id"]
                    if not ques.strip():
                        continue
                    ann_que = client.annotate(ques)
                    max_que_len = max(max_que_len, len(ann_que.sentence[0].token))
                    question_tokens, question_pos, question_ner = [], [], []
                    for sent in ann_que.sentence:
                        for token in sent.token:
                            question_tokens.append(token.word)
                            question_pos.append(token.pos)
                            question_ner.append(token.ner)

                    # answer
                    all_answer_tokens, all_answer_pos, all_answer_ner, all_answer_index = [], [], [], []
                    all_sent_tokens, all_sent_pos, all_sent_ner, all_sent_index = [], [], [], []
                    for answer in qa["answers"]:
                        answer_text = answer["text"]
                        if not answer_text.strip():
                            continue
                        ann_ans = client.annotate(answer_text)
                        answer_tokens, answer_pos, answer_ner = [], [], []
                        for sent in ann_ans.sentence:
                            for token in sent.token:
                                answer_tokens.append(token.word)
                                answer_pos.append(token.pos)
                                answer_ner.append(token.ner)
                        all_answer_tokens.append(' '.join(answer_tokens))
                        all_answer_pos.append(' '.join(answer_pos))
                        all_answer_ner.append(' '.join(answer_ner))
                        answer_start = answer['answer_start']
                        answer_end = answer_start + len(answer_text)

                        # sentence(s) containing the answer span
                        sentence = []
                        for sent in ann_para.sentence:
                            if sent.characterOffsetBegin <= answer_start <= sent.characterOffsetEnd or \
                                    sent.characterOffsetBegin <= answer_end <= sent.characterOffsetEnd:
                                sentence.append(sent)
                        sentence = [
                            token for sent in sentence for token in sent.token
                        ]
                        sentence_tokens = [token.word for token in sentence]
                        sentence_pos = [token.pos for token in sentence]
                        sentence_ner = [token.ner for token in sentence]
                        all_sent_tokens.append(' '.join(sentence_tokens))
                        all_sent_pos.append(' '.join(sentence_pos))
                        all_sent_ner.append(' '.join(sentence_ner))

                        # sentence index
                        y1_sent = sentence[0].tokenBeginIndex
                        y2_sent = sentence[-1].tokenBeginIndex

                        # answer index
                        y1_ans = None
                        for i, token in enumerate(sentence):
                            if token.beginChar - 1 <= answer_start <= token.endChar:
                                y1_ans = sentence[0].tokenBeginIndex + i
                        if y1_ans is None:
                            # answer could not be aligned to a token; skip it
                            continue
                        y2_ans = y1_ans + len(answer_tokens) - 1
                        all_answer_index.append("{},{}".format(y1_ans, y2_ans))
                        all_sent_index.append("{},{}".format(y1_sent, y2_sent))

                    paragraphs.append(' '.join(paragraph_tokens))
                    paragraphs_pos.append(' '.join(paragraph_pos))
                    paragraphs_ner.append(' '.join(paragraph_ner))
                    questions.append(' '.join(question_tokens))
                    questions_pos.append(' '.join(question_pos))
                    questions_ner.append(' '.join(question_ner))
                    answers.append('\t'.join(all_answer_tokens))
                    answers_pos.append('\t'.join(all_answer_pos))
                    answers_ner.append('\t'.join(all_answer_ner))
                    answers_index.append('\t'.join(all_answer_index))
                    sents.append('\t'.join(all_sent_tokens))
                    sents_pos.append('\t'.join(all_sent_pos))
                    sents_ner.append('\t'.join(all_sent_ner))
                    sents_index.append('\t'.join(all_sent_index))
                    ids.append(qa_id)

                # save para
                with open("{}.tok".format(para_file), 'a') as f:
                    f.write('\n'.join(paragraphs) + '\n')
                with open("{}.pos".format(para_file), 'a') as f:
                    f.write('\n'.join(paragraphs_pos) + '\n')
                with open("{}.ner".format(para_file), 'a') as f:
                    f.write('\n'.join(paragraphs_ner) + '\n')
                with open("{}.id".format(para_file), 'a') as f:
                    f.write('\n'.join(ids) + '\n')

                # save question
                with open("{}.tok".format(question_file), 'a') as f:
                    f.write('\n'.join(questions) + '\n')
                with open("{}.pos".format(question_file), 'a') as f:
                    f.write('\n'.join(questions_pos) + '\n')
                with open("{}.ner".format(question_file), 'a') as f:
                    f.write('\n'.join(questions_ner) + '\n')

                # save answer
                with open("{}.tok".format(answer_file), 'a') as f:
                    f.write('\n'.join(answers) + '\n')
                with open("{}.pos".format(answer_file), 'a') as f:
                    f.write('\n'.join(answers_pos) + '\n')
                with open("{}.ner".format(answer_file), 'a') as f:
                    f.write('\n'.join(answers_ner) + '\n')
                with open("{}.index".format(answer_file), 'a') as f:
                    f.write("\n".join(answers_index) + '\n')

                # save sent
                with open("{}.tok".format(sent_file), 'a') as f:
                    f.write('\n'.join(sents) + '\n')
                with open("{}.pos".format(sent_file), 'a') as f:
                    f.write('\n'.join(sents_pos) + '\n')
                with open("{}.ner".format(sent_file), 'a') as f:
                    f.write('\n'.join(sents_ner) + '\n')
                with open("{}.index".format(sent_file), 'a') as f:
                    f.write("\n".join(sents_index) + '\n')

    # get BIO labels
    label(para_file, answer_file)
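# The label() helper called above is defined elsewhere. A minimal sketch of
# what it could look like, under the assumption that it converts the emitted
# .tok and .index files into one B/I/O tag sequence per line, with each index
# line holding tab-separated "start,end" token spans as written above:
def label(para_file, answer_file):
    with open(para_file + ".tok") as pf, \
            open(answer_file + ".index") as af, \
            open(para_file + ".bio", "w") as out:
        for para_line, index_line in zip(pf, af):
            tags = ["O"] * len(para_line.split())
            for pair in index_line.strip().split("\t"):
                if not pair:
                    continue  # question had no alignable answer
                start, end = map(int, pair.split(","))
                tags[start] = "B"
                for i in range(start + 1, min(end + 1, len(tags))):
                    tags[i] = "I"
            out.write(" ".join(tags) + "\n")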
def run(): global _color_id PROTECTED_FILES = ["/main.py", "/boot.py", "/_boot.py"] class FileServerError(Exception): pass def setup_fallback_ap(): unique_id = ubinascii.hexlify(machine.unique_id()).upper().decode() interfaces.ap.active(True) interfaces.ap.config( essid="Kyanit {}".format(unique_id), password="******", authmode=network.AUTH_WPA_WPA2_PSK, ) async def leds_ap_mode(neop): # when fallback AP is active trigger = False while True: trigger = not trigger for idx in range(3): neop[idx] = (0, 0, 64) if idx == 0 and trigger else (0, 0, 0) neop.write() await runner.sleep_ms(250) async def check_wlan_connection(): global _color_id while True: await runner.sleep(30) if not interfaces.wlan.isconnected(): _color_id = "BBB" elif _color_id == "BBB": _color_id = colorid.from_number( int( ure.search("\d+$", interfaces.wlan.ifconfig()[0]).group(0) ) # noqa ) def action_file_list(*args): return httpsrv.response( 200, ujson.dumps( [ path for path in uos.listdir("/") if "\x00" not in path # ignore garbage files and uos.stat(path)[0] == 32768 # noqa and path not in PROTECTED_FILES # noqa ] ), httpsrv.CT_JSON, ) def action_files(method, loc, params, headers, conn, addr): if "/" in loc[7:]: # only files in root dir are allowed raise FileServerError("not on root") file_name = loc[6:] if file_name in PROTECTED_FILES: raise FileServerError("restricted") try: stat = uos.stat(file_name) except OSError: if method == "GET" or method == "DELETE" or "rename" in params: return httpsrv.response(404, '"File Not Found"', httpsrv.CT_JSON) else: if stat[0] != 32768: raise FileServerError("restricted") if method == "DELETE": uos.remove(file_name) return httpsrv.response(200, '"OK"', httpsrv.CT_JSON) if method == "GET": with open(file_name, "rb") as file: # read from file, send to conn httpsrv.send_response( conn, **(httpsrv.response(200, content_type=httpsrv.CT_PLAIN)) ) httpsrv.readall_from(file, into=conn) return None # response already assembled above elif method == "PUT": if "rename" in params: uos.rename(file_name, params["rename"]) return httpsrv.response(200, '"OK"', httpsrv.CT_JSON) with open(file_name, "wb") as file: # write to file, receive from conn httpsrv.readall_from(conn, into=file) return httpsrv.response(200, '"OK"', httpsrv.CT_JSON) async def reboot(): await runner.sleep(.1) print("KYANIT Hard Reset!") machine.reset() def action_reboot(method, loc, params, headers, conn, addr): runner.stop(exc=RebootError) runner.get_event_loop().create_task(reboot()) return httpsrv.response(200, '"OK"', httpsrv.CT_JSON) def action_state(method, loc, params, headers, conn, addr): return httpsrv.response( 200, ujson.dumps( { "unique_id": ubinascii.hexlify( machine.unique_id() ).decode().upper(), "micropython_version": uos.uname().version[ 1:uos.uname().version.index(" ") ], "firmware_version": __version__, "color_id": _color_id, "free_memory": gc.mem_free(), "free_flash": uos.statvfs("/")[0] * uos.statvfs("/")[3], "run_state": [ "ERROR {}".format(runner.get_error()[0]) if runner.get_error() is not None else "", "STOPPED", "CODE.PY MISSING", "CODE.PY IMPORTED", "CODE.PY MAIN", ][runner.get_state()], "error_traceback": [ line.strip() for line in runner.get_error()[1].split("\n") if line and "Traceback" not in line ] if runner.get_error() is not None else None, # noqa } ), httpsrv.CT_JSON, ) def action_runner_start(method, loc, params, headers, conn, addr): runner.start() return httpsrv.response(200, '"OK"', httpsrv.CT_JSON) def action_runner_stop(method, loc, params, headers, conn, addr): runner.stop(force=True if 
"force" in loc else False, exc=StoppedError) return httpsrv.response(200, '"OK"', httpsrv.CT_JSON) def action_netvar(method, loc, params, headers, conn, addr): if method == "POST": Netvar.inbound(ujson.loads(httpsrv.readall_from(conn).getvalue().decode())) return httpsrv.response(200, '"OK"', httpsrv.CT_JSON) if method == "GET": return httpsrv.response( 200, ujson.dumps(Netvar.outbound()), httpsrv.CT_JSON ) # Start in fallback AP mode if the button is pressed fallback_ap_mode = False button = machine.Signal(machine.Pin(BUTTON_PIN, machine.Pin.IN), invert=True) if button.value(): fallback_ap_mode = True # Try connecting to WLAN if not in fallback AP, else activate AP if not fallback_ap_mode: try: wlan_info = ujson.load(open("/wlan.json")) ssid = wlan_info["ssid"] password = wlan_info["password"] ifconfig = wlan_info["ifconfig"] if "ifconfig" in wlan_info else "dhcp" except Exception: # fall back to AP, if can't get JSON, or malformed fallback_ap_mode = True setup_fallback_ap() else: if not interfaces.wlan_connect( ssid, password, ifconfig=ifconfig, timeout=20 ): # fall back to AP, if can't connect interfaces.wlan.active(False) fallback_ap_mode = True setup_fallback_ap() else: fallback_ap_mode = True setup_fallback_ap() # Show fallback AP mode on LEDs if fallback_ap_mode: neop = neopixel.NeoPixel(machine.Pin(LEDS_PIN), 3) loop = runner.get_event_loop() loop.create_task(leds_ap_mode(neop)) # Set Color ID _color_id = colorid.from_number( int(ure.search("\d+$", interfaces.wlan.ifconfig()[0]).group(0)) # noqa ) # Set up HTTP server http_server = httpsrv.HTTPServer(port=3300) # File actions http_server.register("GET", "^/files$", action_file_list) http_server.register("GET", "^/files/$", action_file_list) http_server.register("GET", "^/files/.*", action_files) http_server.register("PUT", "^/files/.*", action_files) http_server.register("DELETE", "^/files/.*", action_files) # System actions http_server.register("GET", "^/sys/state$", action_state) http_server.register("POST", "^/sys/reboot$", action_reboot) http_server.register("POST", "^/sys/reboot/soft$", action_reboot) # Runner actions http_server.register("POST", "^/sys/start$", action_runner_start) http_server.register("POST", "^/sys/stop$", action_runner_stop) http_server.register("POST", "^/sys/stop/force$", action_runner_stop) # Netvar actions http_server.register("GET", "^/netvar$", action_netvar) http_server.register("POST", "^/netvar$", action_netvar) # RUN loop = runner.get_event_loop() loop.create_task(http_server.catch_requests()) if not fallback_ap_mode: # start code.py if not in fallback AP mode loop.create_task(check_wlan_connection()) loop.create_task(runner.starter_coro()) try: loop.run_forever() except Exception: # close socket, so we can restart http_server.close() raise
def write_emoticon_data(realm_id: int, data_dir: str, output_dir: str) -> List[ZerverFieldsT]: ''' This function does most of the work for processing emoticons, the bulk of which is copying files. We also write a json file with metadata. Finally, we return a list of RealmEmoji dicts to our caller. In our data_dir we have a pretty simple setup: emoticons.json - has very simple metadata on emojis: { "Emoticon": { "id": 9875487, "path": "emoticons/yasss.jpg", "shortcut": "yasss" } }, { "Emoticon": { "id": 718017, "path": "emoticons/yayyyyy.gif", "shortcut": "yayyyyy" } } emoticons/ - contains a bunch of image files: slytherinsnake.gif spanishinquisition.jpg sparkle.png spiderman.gif stableparrot.gif stalkerparrot.gif supergirl.png superman.png We move all the relevant files to Zulip's more nested directory structure. ''' logging.info('Starting to process emoticons') fn = 'emoticons.json' data_file = os.path.join(data_dir, fn) if not os.path.exists(data_file): logging.warning("HipChat export does not contain emoticons.json.") logging.warning("As a result, custom emoji cannot be imported.") return [] with open(data_file) as f: data = ujson.load(f) if isinstance(data, dict) and 'Emoticons' in data: # Handle the hc-migrate export format for emoticons.json. flat_data = [ dict( path=d['path'], name=d['shortcut'], ) for d in data['Emoticons'] ] else: flat_data = [ dict( path=d['Emoticon']['path'], name=d['Emoticon']['shortcut'], ) for d in data ] emoji_folder = os.path.join(output_dir, 'emoji') os.makedirs(emoji_folder, exist_ok=True) def process(data: ZerverFieldsT) -> ZerverFieldsT: source_sub_path = data['path'] source_fn = os.path.basename(source_sub_path) source_path = os.path.join(data_dir, source_sub_path) # Use our template from RealmEmoji # PATH_ID_TEMPLATE = "{realm_id}/emoji/images/{emoji_file_name}" target_fn = source_fn target_sub_path = RealmEmoji.PATH_ID_TEMPLATE.format( realm_id=realm_id, emoji_file_name=target_fn, ) target_path = os.path.join(emoji_folder, target_sub_path) os.makedirs(os.path.dirname(target_path), exist_ok=True) source_path = os.path.abspath(source_path) target_path = os.path.abspath(target_path) shutil.copyfile(source_path, target_path) return dict( path=target_path, s3_path=target_path, file_name=target_fn, realm_id=realm_id, name=data['name'], ) emoji_records = list(map(process, flat_data)) create_converted_data_files(emoji_records, output_dir, '/emoji/records.json') realmemoji = [ build_realm_emoji( realm_id=realm_id, name=rec['name'], id=NEXT_ID('realmemoji'), file_name=rec['file_name'], ) for rec in emoji_records ] logging.info('Done processing emoticons') return realmemoji
def run(options):
    # If this is just being used to download production data, do that.
    if options.get("just-download", False):
        download_s3()
        return

    # Definitive scan date for the run.
    today = datetime.datetime.strftime(datetime.datetime.now(), "%Y-%m-%d")

    # 1. Download scan data, do a new scan, or skip altogether.
    scan_mode = options.get("scan", "skip")

    # Whether to gather domains (defaults to doing so).
    gather_mode = options.get("gather", "here")

    if scan_mode == "here":
        # 1a. Gather .gov federal subdomains.
        if gather_mode == "here":
            LOGGER.info("Gathering subdomains.")
            gather_subdomains(options)
            LOGGER.info("Subdomain gathering complete.")
        elif gather_mode == "skip":
            LOGGER.info("Skipping subdomain gathering.")

        # 1b. Scan subdomains for some types of things.
        LOGGER.info("Scanning subdomains.")
        scan_subdomains(options)
        LOGGER.info("Subdomain scanning complete.")

        # 1c. Scan parent domains for all types of things.
        LOGGER.info("Scanning parent domains.")
        scan_parents(options)
        LOGGER.info("Scan of parent domains complete.")

    elif scan_mode == "download":
        LOGGER.info("Downloading latest production scan data from S3.")
        download_s3()
        LOGGER.info("Download complete.")

    # Sanity check to make sure we have what we need.
    if not os.path.exists(os.path.join(PARENTS_RESULTS, "meta.json")):
        LOGGER.info("No scan metadata downloaded, aborting.")
        exit()

    # Date can be overridden if need be, but defaults to meta.json.
    if options.get("date", None) is not None:
        the_date = options.get("date")
    else:
        # depends on YYYY-MM-DD coming first in meta.json time format
        scan_meta = ujson.load(open(os.path.join(PARENTS_RESULTS, "meta.json")))
        the_date = scan_meta['start_time'][0:10]

    # 2. Process and load data into Pulse's database.
    LOGGER.info("[%s] Loading data into Pulse." % the_date)
    data.processing.run(the_date, options)
    LOGGER.info("[%s] Data now loaded into Pulse." % the_date)

    # 3. Upload data to S3 (if requested).
    if options.get("upload", False):
        LOGGER.info("[%s] Syncing scan data and database to S3." % the_date)
        upload_s3(the_date)
        LOGGER.info("[%s] Scan data and database now in S3." % the_date)

    LOGGER.info("[%s] All done." % the_date)
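# Usage sketch (assumption): the options dict drives the whole pipeline; the
# keys shown ("scan", "gather", "date", "upload", "just-download") are exactly
# the ones read above.
run({
    "scan": "download",   # pull the latest production scan data from S3
    "upload": False,      # don't sync results back to S3
})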
try:
    import ujson as json
except ImportError:
    import json

logger = sv.logger

'''
Database for arena likes & dislikes

DB is a dict like:
{ 'md5_id': {'like': set(qq), 'dislike': set(qq)} }
'''
DB_PATH = os.path.expanduser('~/.hoshino/arena_db.json')
DB = {}
try:
    with open(DB_PATH, encoding='utf8') as f:
        DB = json.load(f)
    for k in DB:
        DB[k] = {
            'like': set(DB[k].get('like', set())),
            'dislike': set(DB[k].get('dislike', set()))
        }
except FileNotFoundError:
    logger.warning('arena_db.json not found, will create it when needed.')


def dump_db():
    '''
    Dump the arena database.

    JSON cannot serialize set objects, so convert them to lists first.
    '''
    j = {}
    for k in DB:
        j[k] = {
            'like': list(DB[k]['like']),
            'dislike': list(DB[k]['dislike'])
        }
    with open(DB_PATH, 'w', encoding='utf8') as f:
        json.dump(j, f, ensure_ascii=False)
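# Usage sketch (assumption): recording a like for an arena result. add_like is
# hypothetical glue, showing how the in-memory DB and dump_db() fit together.
def add_like(md5_id, qq):
    entry = DB.setdefault(md5_id, {'like': set(), 'dislike': set()})
    entry['dislike'].discard(qq)   # a user can't both like and dislike
    entry['like'].add(qq)
    dump_db()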
def read_user_data(data_dir: str) -> List[ZerverFieldsT]: fn = 'users.json' data_file = os.path.join(data_dir, fn) with open(data_file) as fp: return ujson.load(fp)
def process_file(filename, data_type, word_counter, char_counter): print(f"Pre-processing {data_type} examples...") examples = [] eval_examples = {} total = 0 with open(filename, "r") as fh: source = json.load(fh) for article in tqdm(source["data"]): for para in article["paragraphs"]: context = para["context"].replace("''", '" ').replace("``", '" ') context_tokens = word_tokenize(context) context_bert_tokens = word_tokenize_bert(context) context_chars = [list(token) for token in context_tokens] spans = convert_idx(context, context_tokens) for token in context_tokens: word_counter[token] += len(para["qas"]) for char in token: char_counter[char] += len(para["qas"]) for qa in para["qas"]: total += 1 ques = qa["question"].replace("''", '" ').replace("``", '" ') ques_tokens = word_tokenize(ques) ques_bert_tokens = word_tokenize_bert(ques) ques_chars = [list(token) for token in ques_tokens] for token in ques_tokens: word_counter[token] += 1 for char in token: char_counter[char] += 1 y1s, y2s = [], [] answer_texts = [] for answer in qa["answers"]: answer_text = answer["text"] answer_start = answer['answer_start'] answer_end = answer_start + len(answer_text) answer_texts.append(answer_text) answer_span = [] for idx, span in enumerate(spans): if not (answer_end <= span[0] or answer_start >= span[1]): answer_span.append(idx) y1, y2 = answer_span[0], answer_span[-1] y1s.append(y1) y2s.append(y2) example = { "context_tokens": context_tokens, "context_bert_tokens": context_bert_tokens, "context_chars": context_chars, "ques_tokens": ques_tokens, "ques_bert_tokens": ques_bert_tokens, "ques_chars": ques_chars, "y1s": y1s, "y2s": y2s, "id": total } examples.append(example) eval_examples[str(total)] = { "context": context, "question": ques, "spans": spans, "answers": answer_texts, "uuid": qa["id"] } print(f"{len(examples)} questions in total") return examples, eval_examples
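# convert_idx() is defined elsewhere; this is a minimal sketch (assumption)
# consistent with how `spans` is used above: one (start_char, end_char) pair
# per token, found by scanning the raw context left to right.
def convert_idx(text, tokens):
    current = 0
    spans = []
    for token in tokens:
        current = text.find(token, current)
        if current < 0:
            raise ValueError(f"Token {token} not found")
        spans.append((current, current + len(token)))
        current += len(token)
    return spans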
def train(config): with open(config.word_emb_file, "r") as fh: word_mat = np.array(json.load(fh), dtype=np.float32) with open(config.char_emb_file, "r") as fh: char_mat = np.array(json.load(fh), dtype=np.float32) with open(config.bpe_emb_file, "r") as fh: bpe_mat = np.array(json.load(fh), dtype=np.float32) with open(config.pos_emb_file, "r") as fh: pos_mat = np.array(json.load(fh), dtype=np.float32) with open(config.train_eval_file, "r") as fh: train_eval_file = json.load(fh) with open(config.dev_eval_file, "r") as fh: dev_eval_file = json.load(fh) with open(config.dev_meta, "r") as fh: meta = json.load(fh) dev_total = meta["total"] print("Building model...") parser = get_record_parser(config) train_dataset = get_batch_dataset(config.train_record_file, parser, config) dev_dataset = get_dataset(config.dev_record_file, parser, config) handle = tf.placeholder(tf.string, shape=[]) iterator = tf.data.Iterator.from_string_handle(handle, train_dataset.output_types, train_dataset.output_shapes) train_iterator = train_dataset.make_one_shot_iterator() dev_iterator = dev_dataset.make_one_shot_iterator() model = Model(config, iterator, word_mat, char_mat, bpe_mat, pos_mat) sess_config = tf.ConfigProto(allow_soft_placement=True) sess_config.gpu_options.allow_growth = True loss_save = 100.0 patience = 0 lr = config.init_lr min_lr = config.min_lr with tf.Session(config=sess_config) as sess: writer = tf.summary.FileWriter(config.log_dir) sess.run(tf.global_variables_initializer()) saver = tf.train.Saver(max_to_keep=None) train_handle = sess.run(train_iterator.string_handle()) dev_handle = sess.run(dev_iterator.string_handle()) sess.run(tf.assign(model.is_train, tf.constant(True, dtype=tf.bool))) sess.run(tf.assign(model.lr, tf.constant(lr, dtype=tf.float32))) for _ in tqdm(range(1, config.num_steps + 1)): global_step = sess.run(model.global_step) + 1 if global_step < config.freeze_steps: loss, train_op = sess.run([model.loss, model.train_op_f], feed_dict={handle: train_handle}) else: if global_step == config.freeze_steps: print('Unfreezing embedding matrices') loss, train_op = sess.run([model.loss, model.train_op], feed_dict={handle: train_handle}) if global_step % config.period == 0: loss_sum = tf.Summary(value=[ tf.Summary.Value(tag="model/loss", simple_value=loss), ]) lr_sum = tf.Summary(value=[ tf.Summary.Value(tag="model/lr", simple_value=lr), ]) writer.add_summary(loss_sum, global_step) writer.add_summary(lr_sum, global_step) if global_step % config.checkpoint == 0: sess.run( tf.assign(model.is_train, tf.constant(False, dtype=tf.bool))) _, summ = evaluate_batch(model, config.val_num_batches, train_eval_file, sess, "train", handle, train_handle) for s in summ: writer.add_summary(s, global_step) metrics, summ = evaluate_batch( model, dev_total // config.batch_size + 1, dev_eval_file, sess, "dev", handle, dev_handle) sess.run( tf.assign(model.is_train, tf.constant(True, dtype=tf.bool))) dev_loss = metrics["loss"] if dev_loss < loss_save: loss_save = dev_loss patience = 0 else: patience += 1 if patience >= config.patience and lr > min_lr: lr /= 2.0 loss_save = dev_loss patience = 0 sess.run(tf.assign(model.lr, tf.constant(lr, dtype=tf.float32))) for s in summ: writer.add_summary(s, global_step) writer.flush() filename = os.path.join(config.save_dir, "model_{}.ckpt".format(global_step)) saver.save(sess, filename)
def test_sber(config):
    prepro_test_sber(config)
    with open(config.word_emb_file, "r") as fh:
        word_mat = np.array(json.load(fh), dtype=np.float32)
    with open(config.char_emb_file, "r") as fh:
        char_mat = np.array(json.load(fh), dtype=np.float32)
    with open(config.bpe_emb_file, "r") as fh:
        bpe_mat = np.array(json.load(fh), dtype=np.float32)
    with open(config.pos_emb_file, "r") as fh:
        pos_mat = np.array(json.load(fh), dtype=np.float32)

    for datafile, datatype in zip(
            [config.sber_public_file, config.sber_private_file],
            ['public', 'private']):
        with open(
                os.path.join(config.target_dir,
                             "{}_eval.json".format(datatype)), "r") as fh:
            data_eval_file = json.load(fh)
        with open(
                os.path.join(config.target_dir,
                             "{}_meta.json".format(datatype)), "r") as fh:
            meta = json.load(fh)
        total = meta["total"]

        print("Loading model...")
        test_batch = get_dataset(
            os.path.join(config.target_dir, "{}.tfrecords".format(datatype)),
            get_record_parser(config, is_test=True),
            config).make_one_shot_iterator()
        model = Model(config, test_batch, word_mat, char_mat, bpe_mat, pos_mat,
                      trainable=False)
        sess_config = tf.ConfigProto(allow_soft_placement=True)
        sess_config.gpu_options.allow_growth = True

        with tf.Session(config=sess_config) as sess:
            sess.run(tf.global_variables_initializer())
            saver = tf.train.Saver()
            if config.model_name == 'latest':
                checkpoint = tf.train.latest_checkpoint(config.save_dir)
            else:
                checkpoint = os.path.join(config.save_dir, config.model_name)
            print('Restoring from: {}'.format(checkpoint))
            saver.restore(sess, checkpoint)
            sess.run(tf.assign(model.is_train, tf.constant(False, dtype=tf.bool)))
            answer_dict = {}
            remapped_dict = {}
            for step in tqdm(range(total // config.batch_size + 1)):
                qa_id, loss, yp1, yp2 = sess.run(
                    [model.qa_id, model.loss, model.yp1, model.yp2])
                answer_dict_, remapped_dict_ = convert_tokens(
                    data_eval_file, qa_id.tolist(), yp1.tolist(), yp2.tolist())
                answer_dict.update(answer_dict_)
                remapped_dict.update(remapped_dict_)
            path_to_save_answer = os.path.join(
                config.answer_dir, '{}.json_squad_ans'.format(datatype))
            with open(path_to_save_answer, "w") as fh:
                json.dump(remapped_dict, fh)
            sber_ans = '.'.join(
                path_to_save_answer.split('.')[0:-1]) + '.json_ans'
            squad_answer2sber(datafile, path_to_save_answer, outfile=sber_ans)
            print("Answer dumped: {}".format(path_to_save_answer))
        tf.reset_default_graph()

    # evaluating
    import ast  # for parsing the scoring API's response safely

    url = 'http://api.aibotbench.com/rusquad/qas'
    headers = {'Content-Type': 'application/json', 'Accept': 'text/plain'}
    metrics = dict()
    f1, em = 0.0, 0.0
    for datatype in ['public', 'private']:
        with open(os.path.join(config.answer_dir,
                               '{}.json_ans'.format(datatype)), 'r') as fh:
            sber_ans = fh.readline()
        res = requests.post(url, data=sber_ans, headers=headers)
        # The API returns a JSON-encoded string holding a Python dict literal;
        # ast.literal_eval parses it without eval()'s arbitrary-code risk.
        metrics[datatype] = ast.literal_eval(json.loads(res.text))
        f1 += metrics[datatype]['f1']
        em += metrics[datatype]['exact_match']
        print('{}: EM: {:.5f} F-1: {:.5f}'.format(
            datatype, metrics[datatype]['exact_match'],
            metrics[datatype]['f1']))
    print('EM avg: {:.5f} F-1 avg: {:.5f}'.format(em / 2, f1 / 2))