def get_rules(rebuild=False):
    mtime = (publications_file.stat().st_mtime_ns, projects_file.stat().st_mtime_ns)
    rules = _cached_rules.get(mtime)
    if rebuild or not rules:
        _cached_rules.clear()
        _cached_rules[mtime] = rules = build_rules(
            json_load(publications_file), json_load(projects_file))
        threading.Thread(target=validate_permissions, args=(rules, )).start()
    return rules
def make_dataframe(path):
    threads = json_load(path)
    all_messages = []
    for i, th in enumerate(threads):
        h = th['head']
        if len(th['tail']) == 0:
            continue
        all_messages.append(
            (h['id'], i, h['title'],
             (h['body'] or '') + ' ' + make_label_string(h['labels']),
             h['sender'], h['created_at'])
        )
        for t in th['tail']:
            all_messages.append(
                (t['id'], i, 'RE: ' + h['title'], t['body'], t['sender'], t['created_at'])
            )
    return pd.DataFrame(all_messages,
                        columns=['message_id', KEY_THREAD_ID, 'subject', 'body',
                                 KEY_SENDER_ID, KEY_TIMESTAMP])
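# Hedged usage sketch for make_dataframe above. The thread shape (a 'head' message plus a
# 'tail' list with id/title/body/labels/sender/created_at fields) is inferred from the keys
# the function reads; the concrete values and the temp-file round trip are illustrative only.
import json
import tempfile

example_threads = [
    {
        "head": {"id": "m1", "title": "Budget", "body": "Draft attached",
                 "labels": ["finance"], "sender": "alice", "created_at": 1690000000},
        "tail": [
            {"id": "m2", "body": "Looks good", "sender": "bob", "created_at": 1690003600},
        ],
    },
]
with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as fp:
    json.dump(example_threads, fp)
# df = make_dataframe(fp.name)  # one row per message; replies get a "RE: " subject prefix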
def add_sc_locale_to_history(repo_dir):
    db = get_db()
    source_dir = pathlib.Path(repo_dir) / 'suttacentral/client/localization/elements/'
    docs = {}
    files = list(source_dir.glob('*.json'))
    lang = 'en'
    for file in files:
        entries = json_load(file)
        for k, v in entries.items():
            context = f'{file.name.split("_")[0]}_{k}'
            if context in docs:
                doc = docs[context]
            else:
                doc = {
                    '_key': f'sc_latest_{context}',
                    'context': context,
                    'origin': 'sc_latest',
                    'strings': {}
                }
                docs[context] = doc
            doc['strings'][lang] = v
    errors = db['historic'].import_bulk(docs.values(), on_duplicate='replace', halt_on_error=False)
    return files, errors
def open_cache(suite, prefix):
    try:
        with open(os.path.join(awfy.path, prefix + '.json')) as fp:
            cache = util.json_load(fp)
            return cache['graph']
    except:
        return {'timelist': [], 'lines': [], 'direction': suite.direction}
def update_site(reference=None):
    db = get_db()
    db.collection('historic').truncate()

    temp_dir_old = clone_repo(branch_or_tag='prelocale', reference=reference)
    add_old_sc_locale_to_history(temp_dir_old.name)

    temp_dir_latest = clone_repo('master', reference=reference)
    files, _ = add_sc_locale_to_history(temp_dir_latest.name)

    add_site_to_history()

    # A mapping of pathlib.Path to the data contained within the file
    root_files_and_data = {}
    for file in files:
        stem = file.stem.split('_')[0]
        new_name = f'{stem}_root-en-site.json'
        new_file = pathlib.Path('root/en/site') / new_name
        data = json_load(file)
        root_files_and_data[new_file] = data

    temp_dir_old.cleanup()
    temp_dir_latest.cleanup()

    print("Waiting for ArangoDB to complete view update")
    db.aql.execute(STALL_QUERY)

    print('Restoring site projects from history')
    files_and_data = restore_site_from_history(root_files_and_data)
    files_and_data.update(root_files_and_data)
    rewrite_site_projects(files_and_data)
def main():
    opt = json_load('./option.json')
    logging.basicConfig(filename=opt.PREFIX + '.log',
                        level=logging.DEBUG,
                        format='%(asctime)s %(message)s')
    logger = logging.getLogger('default_logger')
    logger.addHandler(logging.StreamHandler())
    with tf.Session() as sess:
        env = Environment(opt.ACTION_REPEAT)
        age = Agent(env, sess, logger)
        sess.run(tf.global_variables_initializer())

        start_episode = 12345  # if load past
        if start_episode > 0:
            age.load(meta_graph='./save/' + opt.PREFIX + '-%s.meta' % start_episode,
                     step=start_episode)
            # age.save(True)
        else:
            age.save(True)
            age.train(start_episode=0)

        for epi in range(opt.TEST_EPISODE_MAX):
            reward = age.play()
            debug(logger, 'Test episode %d got reward %d' % (epi, reward))
def open_cache(suite, prefix):
    try:
        with open(os.path.join(awfy.path, prefix + ".json")) as fp:
            cache = util.json_load(fp)
            return cache["graph"]
    except:
        return {"timelist": [], "lines": [], "direction": suite.direction}
def __init__(self):
    self.userName = "******".format(
        studentId=config.get("Joyrun", "StudentID"),
        suffix=config.get("Joyrun", "suffix"))
    self.password = config.get("Joyrun", "Password")

    try:
        cache = json_load(self.Cache_LoginInfo)
    except (FileNotFoundError, JSONDecodeError):
        cache = {}

    if cache.get("userName") == self.userName:
        self.uid = cache.get("uid", 0)
        self.sid = cache.get("sid", '')
    else:  # userName does not match, so ignore the cached login info
        self.uid = 0
        self.sid = ''

    self.session = requests.Session()
    self.session.headers.update(self.base_headers)
    self.session.headers.update(self.device_info_headers)

    self.auth = JoyrunAuth(self.uid, self.sid)
    if self.uid and self.sid:  # read the login state straight from the cache
        self.__update_loginInfo()
    else:
        self.login()  # otherwise log in again
def __init__(self):
    # filenames are the keys that connect each image with its paired text
    self.train_file_names = u.pickle_load(c.train_filename_path)
    self.test_file_names = u.pickle_load(c.test_filename_path)
    self.train_file_names = sorted(self.train_file_names)
    self.test_file_names = sorted(self.test_file_names)

    self.train_class_id = c.train_class_info
    self.test_class_id = c.test_class_info
    # self.train_class_id = u.pickle_load2(c.train_class_id_txt_path)
    # self.test_class_id = u.pickle_load2(c.test_class_id_txt_path)

    # text
    self.tokenizer = RegexpTokenizer(r'\w+')
    if not os.path.isfile(c.vocab_path):
        train_text = self.__load_all_text(self.train_file_names)
        test_text = self.__load_all_text(self.test_file_names)
        all_text = train_text + test_text
        self.__word_count_statistics(all_text)
        vocab = v.vocab()
        vocab.create(all_text)
    self.word_2_index = u.json_load(c.vocab_path)
    self.index_2_word = u.json_load(c.index_2_word_path)
    # self.index_2_word = {v: k for k, v in self.word_2_index.items()}
    self.vocab_size = len(self.word_2_index)
    print("vocab_size : ", self.vocab_size)

    # image
    self.base_img_size = [64, 128, 256]
    label_image_size = 256
    rate = 76 / 64
    self.image_transform = transforms.Compose([
        transforms.Resize(int(label_image_size * rate)),
        transforms.RandomCrop(label_image_size),
        transforms.RandomHorizontalFlip()
    ])
    self.norm = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])

    self.filenames_bbox = self.__load_bbox()
def load_metadata(prefix):
    try:
        with open(os.path.join(awfy.path, "metadata-" + prefix + ".json"), "r") as fp:
            cache = util.json_load(fp)
    except:
        cache = {"earliest_run_id": 0}
    return cache
def load_metadata(prefix):
    try:
        with open(os.path.join(awfy.path, 'metadata-' + prefix + '.json'), 'r') as fp:
            cache = util.json_load(fp)
    except:
        cache = {'last_stamp': 0}
    return cache
def __build(self):
    """ Construct the run record """
    points_per_loop = json_load(self.A_Loop_GPS_JSON)
    self.date = self.__get_date()
    self.duration = int(self.distance * self.Distance_Per_Loop / 0.4 *
                        (self.pace + self.__pace_delta()) * 60)  # running time, s
    self.step = int((self.stride_frequncy + self.__stride_frequncy_delta()) *
                    self.duration / 60)  # total number of steps
    self.detail = list(self.__point_generator(points_per_loop))
def get_rules():
    mtime = publications_file.stat().st_mtime_ns
    rules = _cached_rules.get(mtime)
    if not rules:
        _cached_rules.clear()
        _cached_rules[mtime] = rules = build_rules(
            json_load(publications_file))
        threading.Thread(target=validate_permissions, args=(rules, )).start()
    return rules
def count_strings(self, entry):
    json_file = get_file(entry["path"])
    data = json_load(json_file)
    count = 0
    for k, v in data.items():
        if k == "_meta":
            continue
        if v:
            count += 1
    return count
def load_json(entry):
    _meta = entry.get("_meta", {})
    _meta["path"] = entry.get("path")
    json_file = get_file(entry["path"])
    if json_file.exists():
        segments = json_load(json_file)
    else:
        segments = {}
    return {**deepcopy(_meta), "segments": segments}
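# Hedged illustration of load_json's merge behaviour. The entry shape (a "path" string plus an
# optional "_meta" dict) is taken from the keys read above; the example path and metadata
# values are hypothetical.
example_entry = {
    "path": "translation/en/sujato/dn1_translation-en-sujato.json",
    "_meta": {"language": "en"},
}
# If the file behind entry["path"] exists, its contents become "segments"; otherwise {}:
# load_json(example_entry)
# -> {"language": "en", "path": "translation/en/sujato/dn1_translation-en-sujato.json", "segments": {...}}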
def main():
    import os
    import cPickle as pkl
    import pandas as pd
    from util import json_load
    from max_cover import k_best_trees
    import argparse

    parser = argparse.ArgumentParser('Evaluate the events')
    parser.add_argument('-c', '--cand_trees_path', required=True, nargs='+')
    parser.add_argument('--interactions_path', required=True)
    parser.add_argument('--events_path', required=True)
    args = parser.parse_args()

    interactions = json_load(args.interactions_path)
    true_events = json_load(args.events_path)

    methods = [metrics.adjusted_rand_score,
               metrics.adjusted_mutual_info_score,
               metrics.homogeneity_score,
               metrics.completeness_score,
               metrics.v_measure_score]

    K = 10
    indexes = []
    scores = []
    for p in args.cand_trees_path:
        cand_trees = pkl.load(open(p))
        pred_trees = k_best_trees(cand_trees, K)
        indexes.append(os.path.basename(p))
        scores.append(evaluate_meta_tree_result(
            true_events,
            pred_trees,
            [i['message_id'] for i in interactions],
            methods
        ))

    df = pd.DataFrame(scores, index=indexes,
                      columns=[m.__name__ for m in methods]
                      + [m.__name__ + "(all)" for m in methods]
                      + ['precision', 'recall', 'f1'])
    df.to_csv('tmp/evaluation.csv')
def get_topic_meta_graph_from_synthetic(cls, path, preprune_secs, **kwargs):
    return cls.get_topic_meta_graph(json_load(path),
                                    cosine,
                                    preprune_secs=preprune_secs,
                                    decompose_interactions=False,
                                    given_topics=True,
                                    convert_time=False,
                                    **kwargs)
class Memories(PowerDict, Capability):
    personal = json_load(os.path.join(REFERENCE_DIR, 'personal.json'))

    def __init__(self, info={}, person=None):
        PowerDict.__init__(self, info)
        self.update(Memories.calculated(self))
        Capability.__init__(self, person)

    @classmethod
    def calculated(cls, self):
        return {
            'full_name': self['first_name'] + " " + self['last_name'],
            'unique_id': self['first_name'] + "_" + self['last_name']
        }

    def run_phrase(self):
        return ""  # TODO: Alex

    def __json__(self):
        return self

    @classmethod
    def from_json(cls, obj):
        return cls(obj)

    @classmethod
    def random_gender(cls):
        return first_sample(cls.personal['gender']).lower()

    @classmethod
    def names(cls):
        return cls.personal['names']

    @classmethod
    def random_first_name(cls, gender):
        names = cls.names()
        if gender in names:
            pool = names[gender]
        else:
            pool = names['male'] + names['female']
        return first_sample(pool)

    @classmethod
    def random_last_name(cls):
        return first_sample(cls.personal['surnames'])

    @classmethod
    def random(cls):
        mem = cls()
        mem.gender = cls.random_gender()
        mem.first_name = cls.random_first_name(mem.gender)
        mem.last_name = cls.random_last_name()
        return mem
def load(cls, unique_id):
    folder = os.path.join(PEOPLE_DIR, unique_id)
    # check if they exist
    if not os.path.exists(folder):
        return None
    caps = {}
    for k, v in Person.cap_classes.iteritems():
        filepath = os.path.join(folder, k + '.json')
        if os.path.exists(filepath):
            caps[k] = json_load(filepath, cls=v)
        else:
            caps[k] = v()
    return cls(caps)
def __point_generator(self):
    """ GPS coordinate generator

        Loops around the track indefinitely and adds a small offset to every point.
    """
    points_per_loop = json_load(self.A_Loop_GPS_JSON)
    points_num_per_loop = int(self.pace * 0.4 * 60 / self.sampleinterval)
    while True:
        for i in range(points_num_per_loop):
            idx = math.floor(i / points_num_per_loop * len(points_per_loop))
            point = points_per_loop[idx]  # .copy()
            point[0] += self.__point_delta()
            point[1] += self.__point_delta()
            yield point.copy()  # must copy, otherwise later offsets mutate the same point
def main():
    opt = json_load('./option.json')
    logging.basicConfig(filename=opt.PREFIX + '.log',
                        level=logging.DEBUG,
                        format='%(asctime)s %(message)s')
    with tf.Session() as sess:
        env = Environment()
        age = Agent(env, sess)
        sess.run(tf.global_variables_initializer())

        start_episode = 0  # if load past
        if start_episode > 0:
            age.load(meta_graph='./save/BREAKOUT_DQN-%s.meta' % start_episode,
                     step=start_episode)  # age.save(True)
            age.train(start_episode=start_episode)  # age.train()
        else:
            age.save(True)
            age.train(start_episode=0)

        for epi in range(opt.TEST_EPISODE_MAX):
            reward = age.play()
            print('Test episode %d got reward %d' % (epi, reward))
            debug('Test episode %d got reward %d' % (epi, reward))
def find_all_months(cx, prefix, name):
    pattern = prefix + 'raw-' + name + '-(\d\d\d\d)-(\d+)\.json'
    files = []
    for file in os.listdir(awfy.path):
        m = re.match(pattern, file)
        if not m:
            continue
        year = int(m.group(1))
        month = int(m.group(2))
        files.append(((year, month), file))

    files = sorted(files, key=lambda key: key[0][0] * 12 + key[0][1])

    graphs = []
    for when, file in files:
        with open(os.path.join(awfy.path, file)) as fp:
            cache = util.json_load(fp)
        graphs.append((when, cache['graph']))
    return graphs
def get_token(self, refresh=False):
    """ Return the cached token if it has not expired, otherwise log in again

        Args:
            refresh    bool    force a fresh login and refresh the cache (default: False)
        Returns:
            token      str     the token string
    """
    try:
        if refresh:
            token = self.__login()
        else:
            tokenCache = json_load(self.Cache_AccessToken)
            if tokenCache["expire_in"] < time.time():
                token = self.__login()
            else:
                token = tokenCache["token"]
    except (FileNotFoundError, JSONDecodeError):  # no cache file, or the file is empty
        token = self.__login()
    finally:
        return token
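# Hedged sketch of the token cache that get_token reads. The field names ("token",
# "expire_in") come from the code above; the values shown are illustrative.
#
# Contents of the cache file, as loaded by json_load(self.Cache_AccessToken):
#   {"token": "abc123...", "expire_in": 1700000000}
#
# "expire_in" is compared directly against time.time(), so it is an absolute unix
# timestamp rather than a remaining lifetime in seconds.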
def add_old_sc_locale_to_history(repo_dir):
    """ Historically SuttaCentral contained both roots and translations
        for the site localization
    """
    db = get_db()
    source_dir = pathlib.Path(repo_dir) / 'suttacentral/client/localization/elements/'
    docs = {}
    for folder in source_dir.glob('*'):
        if not folder.is_dir():
            continue
        for file in folder.glob('*.json'):
            lang = file.stem
            data = json_load(file)
            entries = data[lang]
            for k, v in entries.items():
                context = f'{folder.name}_{k}'
                if context in docs:
                    doc = docs[context]
                else:
                    doc = {
                        '_key': f'sc_old_{context}',
                        'context': context,
                        'origin': 'sc_old',
                        'strings': {}
                    }
                    docs[context] = doc
                doc['strings'][lang] = v
    errors = db['historic'].import_bulk(docs.values(), on_duplicate='replace', halt_on_error=False)
    return errors
def __init__(self):
    self.userName = "******" % config.get("Joyrun", "StudentID")
    self.password = config.get("Joyrun", "Password")

    try:
        cache = json_load(self.Cache_LoginInfo)
    except (FileNotFoundError, JSONDecodeError):
        cache = {}

    self.uid = cache.get("uid", 0)
    self.sid = cache.get("sid", '')

    self.session = requests.Session()
    self.session.headers.update(self.base_headers)
    self.session.headers.update(self.device_info_headers)

    self.auth = JoyrunAuth(self.uid, self.sid)
    if self.uid and self.sid:  # read the login state straight from the cache
        self.__update_loginInfo()
    else:
        self.login()  # otherwise log in again
def retrieve_graph(cx, file):
    with open(os.path.join(awfy.path, file)) as fp:
        cache = util.json_load(fp)
    return cache['graph']
def recurse(folder, meta_definitions=None):
    subtree = {}
    meta_definitions = meta_definitions.copy()

    metafiles = set(folder.glob("_*.json"))
    if metafiles:
        for metafile in sorted(metafiles, key=humansortkey):
            file_data = json_load(metafile)
            meta_definitions.update(file_data)
            for k, v in file_data.items():
                if k not in _meta_definitions:
                    _meta_definitions[k] = v

    for file in sorted(folder.glob("*"), key=humansortkey):
        if file.name.startswith("."):
            continue
        if file in metafiles:
            continue
        long_id = file.stem
        meta = {}
        for part in file.parts:
            if part.endswith(".json"):
                part = part[:-5]
            if part in meta_definitions:
                meta[part] = meta_definitions[part]
        if file.is_dir():
            subtree[file.name] = recurse(file, meta_definitions=meta_definitions)
            subtree[file.name]["_meta"] = meta
        elif file.suffix == ".json":
            mtime = file.stat().st_mtime_ns
            path = str(file.relative_to(WORKING_DIR))
            obj = subtree[long_id] = {"path": path, "mtime": mtime, "_meta": meta}
            if "_" in long_id:
                uid, muids = get_uid_and_muids(file)
            else:
                uid = file.name if file.is_dir() else file.stem
                muids = None
            obj["uid"] = uid

            if uid not in uid_index:
                uid_index[uid] = set()
            uid_index[uid].add(long_id)
            if long_id in file_index:
                logging.error(f"{str(file)} not unique")
            file_index[long_id] = obj
            if muids:
                for muid in muids:
                    if muid not in muid_index:
                        muid_index[muid] = set()
                    muid_index[muid].add(long_id)

                # Create Virtual Files
                if 'translation' in muids:
                    uid, muids = long_id.split('_')
                    muids = muids.replace('translation', 'comment')
                    comment_stem = f"{uid}_{muids}"
                    if comment_stem in uid_index:
                        continue
                    parent = pathlib.Path('comment') / file.relative_to(
                        WORKING_DIR / 'translation').parent
                    virtual_file = parent / (comment_stem + '.json')
                    meta = {
                        part: meta_definitions[part]
                        for part in muids.split('-')
                        if part in meta_definitions
                    }
                    obj = {
                        "uid": uid,
                        "path": str(virtual_file),
                        "mtime": None,
                        "_meta": meta
                    }
                    uid_index[uid].add(comment_stem)
                    file_index[comment_stem] = obj
                    for muid in muids.split('-'):
                        muid_index[muid].add(comment_stem)
    return subtree
def recurse(folder, meta_definitions=None, depth=0):
    subtree = {}
    meta_definitions = meta_definitions.copy()

    metafiles = set(folder.glob("_*.json"))
    if metafiles:
        for metafile in sorted(metafiles, key=humansortkey):
            file_data = json_load(metafile)
            if isinstance(file_data, dict):
                meta_definitions.update(file_data)
                for k, v in file_data.items():
                    if k not in _meta_definitions:
                        _meta_definitions[k] = v

    for file in sorted(folder.glob("*"), key=humansortkey):
        if file.name.startswith("."):
            continue
        if file in metafiles:
            continue
        long_id = file.stem
        meta = {}
        for part in file.parts:
            if part.endswith(".json"):
                part = part[:-5]
            if part in meta_definitions:
                meta[part] = meta_definitions[part]
        if file.is_dir():
            subtree[file.name] = recurse(
                file, meta_definitions=meta_definitions, depth=depth + 1)
            subtree[file.name]["_meta"] = meta
        elif file.suffix == ".json":
            mtime = file.stat().st_mtime_ns
            path = str(file.relative_to(WORKING_DIR))
            obj = subtree[long_id] = {"path": path, "mtime": mtime, "_meta": meta}
            if "_" in long_id:
                uid, muids = get_uid_and_muids(file)
            else:
                uid = file.name if file.is_dir() else file.stem
                muids = None
            obj["uid"] = uid

            if uid not in uid_index:
                uid_index[uid] = set()
            uid_index[uid].add(long_id)
            if long_id in file_index:
                logging.error(f"{str(file)} not unique")
            file_index[long_id] = obj
            if muids:
                for muid in muids:
                    if muid not in muid_index:
                        muid_index[muid] = set()
                    muid_index[muid].add(long_id)

                # Create Virtual Files
                if 'translation' in muids:
                    uid, muids = long_id.split('_')
                    _add_virtual_comment_file(uid, muids, file, uid_index,
                                              muid_index, file_index,
                                              meta_definitions)

    if depth == 0:
        _add_virtual_project_files(uid_index, muid_index, file_index, subtree,
                                   _meta_definitions)
    return subtree
def get_interaction_ids(path):
    return [i["message_id"] for i in json_load(path)]
def from_str(cls, s):
    data = json_load(s)
    t = Transaction(**data['transaction'])
    return cls(t, data['data'])
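# Hedged sketch of the serialized form that from_str expects, assuming the project's
# json_load parses the string s directly: a JSON object with a "transaction" mapping
# (unpacked into Transaction(...)) and a "data" payload. The Transaction keyword
# arguments shown here are hypothetical.
example = '{"transaction": {"sender": "a", "recipient": "b", "amount": 1}, "data": "payload"}'
# obj = SomeWrapper.from_str(example)  # SomeWrapper stands in for the class defining from_str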
import gensim
import networkx as nx
from scipy.spatial.distance import cosine
from datetime import timedelta

from interactions import InteractionsUtil as IU
from util import json_load

interactions = json_load('data/enron/interactions.json')

lda_model = gensim.models.ldamodel.LdaModel.load(
    'data/enron/model-50-50.lda'
)
dictionary = gensim.corpora.dictionary.Dictionary.load(
    'data/enron/dict.pkl'
)

different_weights = [
    {'topics': 0.2, 'bow': 0.8},
    {'topics': 1.0},
    {'bow': 1.0},
]

for weights in different_weights:
    meta_graph_kws = {
        'distance_weights': weights,
    }
    g = IU.get_topic_meta_graph(
def get_interaction_ids(path):
    return [i['message_id'] for i in json_load(path)]
def load_dataset(conf, lang, bert=None):
    if conf.best_vocab_size:
        conf.vocab_size = json_load(conf.best_vocab_size_file)[conf.lang]
    data = datasets[conf.dataset].load(conf, lang, bert=bert)
    data.describe()
    return data
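# Hedged note: conf.best_vocab_size_file is assumed to point at a small JSON mapping of
# language code to vocabulary size, e.g. {"en": 32000, "de": 16000}, since the loaded
# object is indexed with conf.lang above. The concrete sizes are illustrative only.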
def test_convert_json():
    for filename in glob.glob(os.path.join(SCRIPT_DIR, '*.json')):
        example = json_load(filename)
        normalize_dict(example)
        yield assert_converts, example
def running_record(self):
    """ Construct a single running record """
    points_per_loop = json_load(self.A_Loop_GPS_JSON)         # coordinates of one lap

    distance = config.getfloat("PB", "distance")              # total distance, km
    pace = config.getfloat("PB", "pace")                      # pace, min/km
    stride_frequncy = config.getint("PB", "stride_frequncy")  # cadence, steps/min

    duration = distance * pace * 60                           # elapsed time, s

    cal_per_loop = lambda: 20 + random.random() * (23 - 20)   # 20-23 per lap
    point_delta = lambda: (random.random() - 0.5) * 0.00005   # random coordinate offset
    distance_delta_rate = lambda: 1 + (random.random() - 0.5) * 0.1  # random distance factor of 0.95-1.05
    stride_frequncy_delta = lambda: int((random.random() - 0.5) * 2 * 10)  # random cadence offset of +-10 steps/min
    random_alt = lambda: round(42 + random.random() * (48 - 42), 1)        # altitude 42-48
    random_speed = lambda: round(3.1 + random.random() * (4.4 - 3.1), 2)   # unclear how "speed" is actually defined ...

    def locus_generator():
        end_time = int(time.time() * 1000)
        start_time = now_time = end_time - int(duration * 1000)  # counting forward from the start time
        now_stepcount = 0
        now_distance = 0.00
        while now_time <= end_time:
            for point in points_per_loop:
                per_distance = 0.4 / len(points_per_loop) * distance_delta_rate()  # distance between two points, km
                now_stepcount += int((stride_frequncy + stride_frequncy_delta()) * per_distance * pace)
                now_distance += per_distance
                yield {
                    # "id": ???  # the primary key is not available, but submitting without it still
                    #            # succeeds, so the database presumably auto-increments it
                    "alt": random_alt(),
                    "speed": random_speed(),
                    "heartrate": 0,
                    "distance": now_distance,
                    "lat": round(point['lat'] + point_delta(), 8),
                    "lng": round(point['lng'] + point_delta(), 8),
                    "stepcount": now_stepcount,
                    "traintime": now_time,
                }
                now_time += int(per_distance * pace * 60 * 1000)  # sampling interval, ms

    locuslist = list(locus_generator())
    distance = locuslist[-1]['distance']  # actual distance, km
    duration = (locuslist[-1]['traintime'] - locuslist[0]['traintime']) / 1000  # actual elapsed time, s

    return json.dumps({
        "biggerId": self.biggerId,
        "token": self.token,
        "locusrlist": [{
            "cal": int(cal_per_loop() * distance * 1000 / 400),
            "distance": round(distance, 2),
            "duration": int(duration),
            "heartrate": 0,
            "team": 1,
            # "pace": ???  # the meaning of pace here differs from pace in the running record
            # "intermittent": ???
            "locuslist": [locuslist],
        }],
    }).encode('utf-8')
def update_segment(segment, user):
    """
    segment looks like:

    {
        "segmentId": "dn1:1.1",
        "field": "translation-en-sujato",
        "value": "..",
        "oldValue": "..."
    }
    """
    segment_id = segment["segmentId"]
    if not is_id_legal(segment_id):
        logging.error(f"Invalid Segment ID: {segment_id}")
        return {"error": "Invalid Segment ID"}

    uid, _ = segment_id.split(":")
    parent_uid = get_parent_uid(uid)
    long_id = f'{parent_uid}_{segment["field"]}'
    try:
        filepath = get_file_path(long_id)
    except KeyError as e:
        logging.exception(e)
        logging.error(f'"{long_id}" not found, {segment}')
        return {"error": "file not found"}

    file = get_file(filepath)
    permission = get_permissions(filepath, user['login'])
    if permission != Permission.EDIT:
        logging.error("User not allowed to edit")
        return {"error": "Inadequate Permission"}

    with git_fs._lock:
        try:
            file_data = json_load(file)
        except FileNotFoundError:
            file.parent.mkdir(parents=True, exist_ok=True)
            file_data = {}

        current_value = file_data.get(segment_id)

        result = {}
        if current_value and current_value != segment.get("oldValue"):
            result["clobbered"] = current_value
        if current_value != segment["value"]:
            result["changed"] = True

        file_data[segment_id] = segment["value"]
        sorted_data = dict(sorted(file_data.items(), key=bilarasortkey))
        try:
            json_save(sorted_data, file)
            result["success"] = True
        except Exception:
            logging.exception(f"could not write segment: {segment}")
            return {"error": "could not write file"}

    executor.submit(background_update, filepath, user, segment)
    return result
        # filter out articles with single company tag
        if has_multiple_companies(a):
            valid_articles.append(
                transform_article(a)
            )
    return valid_articles


def dump2interactions(db, collection_name, output_path):
    articles = articles_articles(db, collection_name)
    print('# valid articles: ', len(articles))
    json_dump(articles, output_path)
    return articles


def collect_people_info(articles):
    participant_ids = set(
        itertools.chain(*[a['participant_ids'] for a in articles])
    )
    print('# unique participants: ', len(participant_ids))
    return [{'id': p} for p in participant_ids]


if __name__ == '__main__':
    # articles = dump2interactions(MongoClient()['bloomberg'],
    #                              'articles',
    #                              'data/bloomberg/interactions.json')
    articles = json_load('data/bloomberg/interactions.json')
    json_dump(collect_people_info(articles),
              'data/bloomberg/people.json')