def expand_videos(msid, video):
    gc_data = metadata(msid) # cached on first hit
    gc_id_str = ", ".join(gc_data.keys())

    v_id = video['id']
    ensure(v_id in gc_data, "glencoe doesn't know %r, only %r" % (v_id, gc_id_str))

    video_data = gc_data[v_id]
    video_data = utils.subdict(video_data, ['jpg_href', 'width', 'height'])
    video_data = utils.renkeys(video_data, [('jpg_href', 'image')])

    func = lambda mtype: {
        'mediaType': SOURCES[mtype],
        'uri': gc_data[v_id][mtype + "_href"]
    }
    video_data['sources'] = lmap(func, SOURCES)

    video.update(video_data)
    del video['uri'] # returned by elife-tools, not part of spec

    # add placeholder, the video thumbnail image
    video["placeholder"] = {}
    video["placeholder"]["uri"] = video["image"].split('/')[-1]
    video["placeholder"]["alt"] = ""

    return video
def expand_location(path):
    # if isinstance(path, file):
    if is_file(path):
        path = path.name

    if path.startswith('https://s3-external-1.amazonaws.com/') or path.startswith('https://s3.amazonaws.com/'):
        # it's being downloaded from a bucket, no worries
        return path

    # resolve any symlinks
    # the backfill uses symlinks to the article-xml dir
    path = os.path.abspath(os.path.realpath(path))

    if re.match(r".*article-xml/articles/.+\.xml$", path):
        # this article is coming from the local ./article-xml/ directory, which
        # is almost certainly a git checkout. we want a location that looks like:
        # https://raw.githubusercontent.com/elifesciences/elife-article-xml/5f1179c24c9b8a8b700c5f5bf3543d16a32fbe2f/articles/elife-00003-v1.xml
        rc, rawsha = utils.run_script(["cat", "elife-article-xml.sha1"])
        ensure(rc == 0, "failed to read the contents of './elife-article-xml.sha1'")
        sha = rawsha.strip()
        fname = os.path.basename(path)
        return "https://raw.githubusercontent.com/elifesciences/elife-article-xml/%s/articles/%s" % (sha, fname)

    # who knows what this path is ...
    LOG.warn("scraping article content in a non-repeatable way. path %r not found in article-xml dir. please don't send the results to lax", path)
    return path
def add_connection(self, from_room, to_room):
    ensure(from_room >= 0 and from_room < len(self.nodes), 'from_room out of bounds')
    ensure(to_room >= 0 and to_room < len(self.nodes), 'to_room out of bounds')
    self.connections[from_room, to_room] = distance.euclidean(
        self.nodes[from_room].center, self.nodes[to_room].center)
def expand_videos(msid, video):
    gc_data = metadata(msid) # cached on first hit
    gc_id_str = ", ".join(gc_data.keys())

    v_id = video['id']
    ensure(gc_data, "glencoe doesn't know %r, it doesn't have any media" % msid)
    ensure(v_id in gc_data, "glencoe doesn't know %r, only %r" % (v_id, gc_id_str))

    video_data = gc_data[v_id]
    video_data = utils.subdict(video_data, ['jpg_href', 'width', 'height'])
    video_data = utils.renkeys(video_data, [('jpg_href', 'image')])

    func = lambda mtype: OrderedDict([
        ('mediaType', SOURCES[mtype]),
        ('uri', gc_data[v_id][mtype + "_href"])
    ])
    video_data['sources'] = lmap(func, SOURCES)

    video.update(video_data)
    del video['uri'] # returned by elife-tools, not part of spec

    # add placeholder, the video thumbnail image
    video["placeholder"] = OrderedDict()
    video["placeholder"]["uri"] = video["image"].split('/')[-1]
    video["placeholder"]["alt"] = ""

    return video
def triangulate(self):
    ensure(len(self.nodes) >= 4, 'Cannot triangulate with less than four rooms')
    self.reset_connections()
    points = []
    self.lookup = []
    i = 0
    for id, n in enumerate(self.nodes):
        if n.is_main:
            points.append(n.center)
            self.lookup.append(id)
            i += 1
    tri = Delaunay(points)
    for simplex in tri.simplices:
        self.add_connection(self.lookup[simplex[0]], self.lookup[simplex[1]])
        self.add_connection(self.lookup[simplex[1]], self.lookup[simplex[2]])
        self.add_connection(self.lookup[simplex[2]], self.lookup[simplex[0]])
    self.triangulation = tri
def file_handler(path):
    ensure(path.startswith(PROJECT_DIR),
           "unsafe operation - refusing to read from a file location outside of project root. %r does not start with %r" % (path, PROJECT_DIR))
    xml = open(path, 'r').read()
    # write cache?
    return xml
def splitter(string):
    if isinstance(string, list):
        pair = string # already split into pairs, return what we have
        return pair
    ensure('|' in string, "override key and value must be separated by a pipe '|'")
    first, rest = string.split('|', 1)
    ensure(rest.strip(), "a value must be provided. use 'null' without quotes to use an empty value")
    return first, rest
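# A minimal usage sketch for splitter() above, assuming the usual ensure()
# semantics (it fails loudly when the assertion is false). The values here are
# illustrative only and not taken from the original test suite.
def _splitter_example():
    # a raw override string is split on the first pipe only
    key, val = splitter('title|"a new title"')
    assert (key, val) == ('title', '"a new title"')
    # an already-split pair is returned untouched
    assert splitter(['title', '"a new title"']) == ['title', '"a new title"']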
def manual_overrides(ctx, data):
    "replace top-level article keys with new values provided in ctx.override"
    overrides = ctx.get('override', {})
    ensure(isinstance(overrides, dict), "given mapping of overrides is not a dictionary")
    # possibly add support for dotted paths in future?
    for key, value in overrides.items():
        data['article'][key] = value
    return data
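# A sketch of manual_overrides() above with a made-up ctx/data pair: each
# top-level key in ctx['override'] replaces the matching key on the article.
def _manual_overrides_example():
    ctx = {'override': {'title': 'corrected title'}}
    data = {'article': {'title': 'original title', 'version': 1}}
    result = manual_overrides(ctx, data)
    assert result['article'] == {'title': 'corrected title', 'version': 1}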
def test_divide():
    l1 = ['/', 2, 2]
    l2 = ['/', 30, 2, ['+', 1, 2]]
    # log(Apply().divide(l1))
    # log(Apply().divide(l2))
    ensure(Apply().divide(l1) == 1, 'divide test 1')
    ensure(Apply().divide(l2) == 5, 'divide test 2')
def validate_response(response):
    utils.validate(response, conf.RESPONSE_SCHEMA)
    max_size_in_bytes = 262144
    response_size = len(serialise_response(response))
    ensure(response_size <= max_size_in_bytes,
           "response size (%s) is too large: %s" % (response_size, max_size_in_bytes),
           ValidationError)
def test_call_function():
    d1 = [['def', 'f1', ['a', 'b'], [['if', ['<', 'a', 0], 3, 'b']]]]
    apply = Apply()
    apply.apply_trees(d1)
    # print('func', self.func)
    l1 = ['call', 'f1', [1, 2]]
    # log('var', apply.var)
    # log('func', apply.func)
    # log('func result', apply.call_func(l1))
    ensure(apply.call_func(l1) == 2, 'call_function test 1')
def download(location):
    "download file, convert and pipe content straight into lax + transparent cache"
    ensure('://' in location[:10], 'no protocol found in %r, failing' % location)
    protocol, path = location.split('://')
    downloaderficationer = {
        'https': lambda: http_download(location),
        # load files relative to adaptor root
        'file': partial(file_handler, path)
    }
    file_contents = downloaderficationer[protocol]()
    return file_contents
def validate_gc_data(gc_data):
    # we've had one case like this
    ensure(gc_data != {}, "glencoe returned successfully, but response is empty")

    # we also can't guarantee all of the sources will always be present
    known_sources = SOURCES.keys()
    for v_id, v_data in gc_data.items():
        available_sources = lfilter(lambda mtype: mtype + "_href" in v_data, known_sources)
        # fail if we have partial data
        msg = "number of available sources less than known sources for %r. missing: %s" % \
            (v_id, ", ".join(set(known_sources) - set(available_sources)))
        ensure(len(available_sources) == len(known_sources), msg)
def test_apply_trees():
    # this is the top-level function; it takes an abstract syntax tree
    l1 = [['+', 1, 2, ['-', 2, 1]]]
    l2 = [['-', 2, 2], ['-', 2, 1]]
    l3 = [['var', 'a', ['-', 2, 1]]]
    l4 = [['var', 'a', 1], ['var', 'b', ['+', 1, 1]], ['if', ['<', 'a', 0], 3, 'b']]
    l5 = [['var', 'a', 3], ['var', 'b', 2],
          ['def', 'f1', ['a', 'b'], [['-', ['+', 'a', 2], 3, 'b']]],
          ['call', 'f1', ['a', 'b']]]
    # ensure(Apply().apply_trees(l1) == 4, 'apply_trees test 1')
    # ensure(Apply().apply_trees(l2) == 1, 'apply_trees test 2')
    # ensure(Apply().apply_trees(l3) == 'N/A', 'apply_trees test 3')
    # ensure(Apply().apply_trees(l4) == 2, 'apply_trees test 4')
    ensure(Apply().apply_trees(l5) == 0, 'apply_trees test 5')
def validate_gc_data(gc_data):
    # we've had one case like this
    ensure(gc_data != {}, "glencoe returned successfully, but response is empty")

    # we also can't guarantee all of the sources will always be present
    known_sources = SOURCES.keys()
    for v_id, v_data in gc_data.items():
        available_sources = lfilter(lambda mtype: mtype + "_href" in v_data, known_sources)
        # fail if we have partial data
        msg = "number of available sources less than known sources for %r. missing: %s" % \
            (v_id, ", ".join(set(known_sources) - set(available_sources)))
        assert len(available_sources) == len(known_sources), msg
def test():
    # test case,
    arr1 = [2, 3, 1, 0, 2, 5, 3]
    arr2 = [5, 4, 3, 2, 1, 0]
    arr3 = [12, 3]
    arr4 = []
    ensure((True, 2) == duplicate(arr1), 'arr1')
    ensure((False,) == duplicate(arr2), 'arr2')
    ensure((False,) == duplicate(arr3), 'arr3')
    ensure((False,) == duplicate(arr4), 'arr4')
def random_edges(self, percentage):
    ensure(self.triangulation, 'You must triangulate before creating random edges')
    s_count = len(self.triangulation.simplices)
    rnd_count = int(s_count * 3 * percentage)
    rnd = s_count * np.random.rand(rnd_count)
    rnd = rnd.astype(int)
    counter = 0
    cur_try = 0
    while counter < len(rnd) and cur_try < s_count:
        cur_try += 1
        rn = rnd[counter]
        s = self.triangulation.simplices[rn]
        ri = random.randint(0, 2)
        i = s[ri]
        j = s[ri + 1] if ri < 2 else s[0]
        if not self.is_connected(self.lookup[i], self.lookup[j]):
            self.add_connection(self.lookup[i], self.lookup[j])
            counter += 1
def serialize(pair):
    key, val = pair
    ensure(isinstance(key, basestring), "key must be a string")
    ensure('|' not in key, "key must not contain a pipe")
    key = key.strip()
    ensure(key, "key must not be empty")
    return '|'.join([key, json.dumps(val)])
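# A round-trip sketch tying serialize() to splitter() above: serialize() joins
# a (key, value) pair with a pipe and JSON-encodes the value, so splitter() can
# recover the key and the raw JSON string. Illustrative values only.
def _serialize_example():
    line = serialize(('title', 'a new title'))
    assert line == 'title|"a new title"'
    key, raw = splitter(line)
    assert key == 'title' and json.loads(raw) == 'a new title'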
def test():
    # test case,
    arr1 = [2, 3, 5, 4, 3, 2, 6, 7]
    arr2 = [5, 4, 3, 2, 1, 0]
    arr3 = [1, 2, 3, 4, 3]
    arr4 = []
    result1 = duplicate(arr1)
    ensure(result1[0] == True and result1[1] in (2, 3), 'arr1')
    result2 = duplicate(arr2)
    ensure(result2[0] == False, 'arr2')
    result3 = duplicate(arr3)
    ensure(result3[0] == True and result3[1] in (3, ), 'arr3')
    result4 = duplicate(arr4)
    ensure(result4[0] == False, 'arr4')
def mkreq(path, **overrides):
    ensure(not path.startswith('http://'), "no insecure requests, please")

    # TODO: consider removing these two lines
    if path.lstrip('/').startswith('article-xml/articles/'):
        path = os.path.abspath(path)
    path = 'file://' + path if not path.startswith('https://') else path

    msid, ver = utils.version_from_path(path)
    request = {
        'action': conf.INGEST,
        'location': path,
        'id': msid,
        'version': ver,
        'force': True, # TODO: shouldn't this be False?
        'validate-only': False,
        'token': 'pants-party'
    }
    request.update(overrides)
    # don't ever generate an invalid request
    utils.validate(request, conf.REQUEST_SCHEMA)
    return request
def test_greater_than():
    l1 = ['>', 2, 1]
    l2 = ['>', 1, 2]
    l3 = ['>', 1, ['+', 1, 1]]
    ensure(Apply().greater_than(l1), 'more test 1')
    ensure(not Apply().greater_than(l2), 'more test 2')
    ensure(not Apply().greater_than(l3), 'more test 3')
def test_apply():
    l1 = ['+', 1, 2, ['-', 2, 1]]
    l2 = ['if', ['>', 1, 2], 1, 2]
    l3 = ['if', ['<', 1, 2], 1, 2]
    l4 = ['if', ['=', 1, 2], 1, 2]
    # print(apply(l1))
    # print(apply(l2))
    # print(apply(l3))
    # print(apply(l4))
    ensure(Apply().apply(l1) == 4, 'apply test 1')
    ensure(Apply().apply(l2) == 2, 'apply test 2')
    ensure(Apply().apply(l3) == 1, 'apply test 3')
    ensure(Apply().apply(l4) == 2, 'apply test 4')
def test_less_than():
    l1 = ['<', 2, 1]
    l2 = ['<', 1, 2]
    l3 = ['<', 1, ['+', 1, 1]]
    ensure(not Apply().less_than(l1), 'less test 1')
    ensure(Apply().less_than(l2), 'less test 2')
    ensure(Apply().less_than(l3), 'less test 3')
def test_judge_cmp():
    l1 = ['if', ['>', 3, 4], 1, 2]
    l2 = ['if', False, 2, ['+', 1, 2]]
    l3 = ['if', False, 2, ['if', True, 1, 2]]
    ensure(Apply().judge(l1) == 2, 'judge test 1')
    ensure(Apply().judge(l2) == 3, 'judge test 2')
    ensure(Apply().judge(l3) == 1, 'judge test 3')
def test_equal():
    l1 = ['=', 2, 1]
    l2 = ['=', 2, 2]
    l3 = ['=', 2, ['+', 1, 1]]
    ensure(not Apply().equal(l1), 'equal test 1')
    ensure(Apply().equal(l2), 'equal test 2')
    ensure(Apply().equal(l3), 'equal test 3')
def _subdesc(desc, path):
    """a path looks like: <type>.<target>
    for example: `file./opt/thing/` or `mysql-database.mydb1`"""
    bits = path.split('.', 1)
    ensure(len(bits) == 2, "expecting just two bits, got %s bits: %s" % (len(bits), path), ValueError)
    toplevel, target = bits
    ensure(toplevel in desc, "descriptor has no %r key" % toplevel, ValueError)
    ensure(target in desc[toplevel], "given descriptor has no path %r" % path, ValueError)
    return {toplevel: [target]}
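# A usage sketch for _subdesc() above with a made-up descriptor: the dotted
# path selects a single target from the descriptor and the result keeps the
# {toplevel: [target]} shape.
def _subdesc_example():
    desc = {'file': ['/opt/thing/', '/opt/other/'], 'mysql-database': ['mydb1']}
    assert _subdesc(desc, 'file./opt/thing/') == {'file': ['/opt/thing/']}
    assert _subdesc(desc, 'mysql-database.mydb1') == {'mysql-database': ['mydb1']}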
def test():
    # test case,
    # this approach is flawed: it can't find every duplicated number,
    # because the range 1~2 contains the two numbers 1 and 2, and numbers in that
    # range also appear twice, so we can't tell whether each number appears once
    # or whether one of them appears twice
    arr1 = [2, 3, 5, 4, 3, 2, 6, 7]
    arr2 = [5, 4, 3, 2, 1, 0]
    arr3 = [1, 2, 3, 4, 4]
    arr4 = []
    result1 = duplicate(arr1)
    ensure(result1[0] == True and result1[1] in (2, 3), 'arr1')
    result2 = duplicate(arr2)
    ensure(result2[0] == False, 'arr2')
    result3 = duplicate(arr3)
    ensure(result3[0] == True and result3[1] in (4, ), 'arr3')
    result4 = duplicate(arr4)
    ensure(result4[0] == False, 'arr4')
def make_inf_list(params, steps, make_string):
    l = [make_string(day, steps) for day in params['days']]
    return l


# specify params for synth data
# params are also needed to construct inference prog / ripl
params_name = 'easy_d4_s33_bi4_be10' # featurefunctions__maxDay_size_num_birds_softmaxbeta
params = get_params(params_name, 'onebird')
gtruth_params = params.copy()
infer_params = params.copy()
gtruth_params['name'] = 'gtruth'
infer_params['name'] = 'infer'
ensure(params_name)

directory = params_name + '/'
generate_save_synthetic_data(gtruth_params, directory)

## specifying an inference prog:
# need list of inference strings or single one (filter/batch)
# for list, need num_days from params
exp_seed1 = dict(
    type='seq_block_mh',
    steps=50,
    make_inf_string=seq_block_mh_make_inf_string)

exp_seed2 = exp_seed1.copy()

exp_seed3 = dict(
    type='seq_block_pgibbs',
    steps=50,
def find_lax():
    ensure(os.path.exists(conf.PATH_TO_LAX), "could not find lax")
    script = join(conf.PATH_TO_LAX, "manage.sh")
    ensure(os.path.exists(script), "could not find lax's manage.sh script")
    return script
def is_connected(self, from_room, to_room):
    ensure(from_room >= 0 and from_room < len(self.nodes), 'from_room out of bounds')
    ensure(to_room >= 0 and to_room < len(self.nodes), 'to_room out of bounds')
    return self.connections[from_room, to_room] != 0
    except BaseException as err:
        strbuffer.write("failed (%s)" % err)
    finally:
        log = conf.multiprocess_log('generation.log', __name__)
        log.info(strbuffer.getvalue())

def main(xml_dir, json_output_dir, num=None):
    paths = lmap(lambda fname: join(xml_dir, fname), os.listdir(xml_dir))
    paths = lfilter(lambda path: path.lower().endswith('.xml'), paths)
    paths = sorted(paths, reverse=True)
    if num:
        paths = paths[:num] # only scrape first n articles
    num_processes = -1
    Parallel(n_jobs=num_processes)(delayed(render)(path, json_output_dir) for path in paths)
    print('see scrape.log for errors')

if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('xml-dir', nargs='?', default=conf.XML_DIR)
    parser.add_argument('output-dir', nargs='?', default=conf.JSON_DIR)
    parser.add_argument('--num', type=int, nargs='?')
    args = vars(parser.parse_args())
    indir, outdir = [os.path.abspath(args[key]) for key in ['xml-dir', 'output-dir']]
    ensure(os.path.exists(indir), "the path %r doesn't exist" % indir)
    ensure(os.path.exists(outdir), "the path %r doesn't exist" % outdir)
    main(indir, outdir, args['num'])
def test_11():
    ensure(applyCompare(['>', 2, 1]) == True, 'test_11 >')
def wrap(v):
    ensure(v, "%s cannot be blank/empty/None" % label)
    return v
def test_12():
    arr = ['+', 1, 2, 3]
    ensure(apply_ops(['>', 2, 1]) == True, 'test_12 >')
    ensure(apply_ops(arr) == 6, 'test_12 +')
def test():
    arr1 = [2, 3, 1, 0, 2, 5, 3]
    ensure((True, 2) == duplicate(arr1), 'arr1')