def get_s3_file_tree(s3, bucket, prefix):
    """Overcome the s3 response limit and return a NestedDict tree of paths.

    The NestedDict object also allows the user to search by the ends of a
    path. The tree mimics a file directory structure, with the leaf nodes
    being the full unbroken key. For example, 'path/to/file.txt' would be
    retrieved by

        ret['path']['to']['file.txt']['key']

    The NestedDict object returned also has the capability to get paths that
    lead to a certain value. So if you wanted all paths that lead to something
    called 'file.txt', you could use

        ret.get_paths('file.txt')

    For more details, see the NestedDict docs.
    """
    file_tree = NestedDict()
    pref_path = prefix.split('/')[:-1]   # avoid the trailing empty str.
    for key in iter_s3_keys(s3, bucket, prefix):
        full_path = key.split('/')
        relevant_path = full_path[len(pref_path):]
        curr = file_tree
        for step in relevant_path:
            curr = curr[step]
        curr['key'] = key
    return file_tree

def __init__(self):
    self.d = NestedDict()
    self.stmts = []
    self.target_sets = []
    self.bettered_sids = set()
    self.links = set()
    return

def get_s3_file_tree(s3, bucket, prefix, date_cutoff=None, after=True,
                     with_dt=False):
    """Overcome the s3 response limit and return a NestedDict tree of paths.

    The NestedDict object also allows the user to search by the ends of a
    path. The tree mimics a file directory structure, with the leaf nodes
    being the full unbroken key. For example, 'path/to/file.txt' would be
    retrieved by

        ret['path']['to']['file.txt']['key']

    The NestedDict object returned also has the capability to get paths that
    lead to a certain value. So if you wanted all paths that lead to something
    called 'file.txt', you could use

        ret.get_paths('file.txt')

    For more details, see the NestedDict docs.

    Parameters
    ----------
    s3 : boto3.client.S3
        A boto3.client.S3 instance
    bucket : str
        The name of the bucket to list objects in
    prefix : str
        The prefix filtering of the objects for list
    date_cutoff : str|datetime.datetime
        A datestring of format %Y(-%m-%d-%H-%M-%S) or a datetime.datetime
        object. The date is assumed to be in UTC. By default no filtering
        is done. Default: None.
    after : bool
        If True, only return objects after the given date cutoff. Otherwise,
        return objects before. Default: True
    with_dt : bool
        If True, yield a tuple (key, datetime.datetime(LastModified)) of the
        s3 key and the object's LastModified date as a datetime.datetime
        object; otherwise yield only the s3 key. Default: False.

    Returns
    -------
    NestedDict
        A file tree represented as a NestedDict
    """
    file_tree = NestedDict()
    pref_path = prefix.split('/')[:-1]   # avoid the trailing empty str.
    for k in iter_s3_keys(s3, bucket, prefix, date_cutoff, after, with_dt):
        if with_dt:
            key, dt = k
        else:
            key, dt = k, None
        full_path = key.split('/')
        relevant_path = full_path[len(pref_path):]
        curr = file_tree
        for step in relevant_path:
            curr = curr[step]
        # Store the raw yielded value (a (key, LastModified) tuple if with_dt).
        curr['key'] = k
    return file_tree

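# A minimal usage sketch for get_s3_file_tree. The bucket name 'my-bucket',
# prefix 'reading/results/', and file name 'stats.json' below are placeholders,
# not values from this module; the sketch only shows how the returned
# NestedDict can be traversed like a directory tree and searched with
# get_paths, as described in the docstring above.
import boto3


def _example_file_tree_usage():
    s3 = boto3.client('s3')
    tree = get_s3_file_tree(s3, 'my-bucket', 'reading/results/')

    # Leaf nodes hold the full, unbroken s3 key under the 'key' entry.
    for path, value in tree.get_paths('key'):
        print('/'.join(path[:-1]), '->', value)

    # Retrieve every path that ends in a particular file name.
    return tree.get_paths('stats.json')
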
def _get_reading_statement_dict(db, clauses=None, get_full_stmts=True):
    """Get a nested dict of statements, keyed by ref, content, and reading."""
    # Construct the query for metadata from the database.
    q = (db.session.query(db.TextRef, db.TextContent.id, db.TextContent.source,
                          db.Reading.id, db.Reading.reader_version,
                          db.RawStatements.id, db.RawStatements.json)
         .filter(db.RawStatements.reading_id == db.Reading.id,
                 db.Reading.text_content_id == db.TextContent.id,
                 db.TextContent.text_ref_id == db.TextRef.id))
    if clauses:
        q = q.filter(*clauses)

    # Prime some counters.
    num_duplicate_evidence = 0
    num_unique_evidence = 0

    # Populate a dict with all the data.
    stmt_nd = NestedDict()
    for tr, tcid, src, rid, rv, sid, sjson in q.yield_per(1000):
        # Back out the reader name.
        for reader, rv_list in reader_versions.items():
            if rv in rv_list:
                break
        else:
            raise Exception("rv %s not recognized." % rv)

        # Get the json for comparison and/or storage
        stmt_json = json.loads(sjson.decode('utf8'))
        stmt = Statement._from_json(stmt_json)
        _set_evidence_text_ref(stmt, tr)

        # Hash the combined stmt and evidence matches key.
        stmt_hash = stmt.get_hash(shallow=False)

        # For convenience get the endpoint statement dict
        s_dict = stmt_nd[tr.id][src][tcid][reader][rv][rid]

        # Initialize the value to a set, and count duplicates
        if stmt_hash not in s_dict.keys():
            s_dict[stmt_hash] = set()
            num_unique_evidence += 1
        else:
            num_duplicate_evidence += 1

        # Either store the statement, or the statement id.
        if get_full_stmts:
            s_dict[stmt_hash].add((sid, stmt))
        else:
            s_dict[stmt_hash].add((sid, None))

    # Report on the results.
    print("Found %d relevant text refs with statements." % len(stmt_nd))
    print("number of statement exact duplicates: %d" % num_duplicate_evidence)
    print("number of unique statements: %d" % num_unique_evidence)
    return stmt_nd

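# A short illustration of how the nested structure returned by
# _get_reading_statement_dict can be read back. The lookup keys below
# (trid, 'pubmed', tcid, 'reach', rv, rid) are placeholders; the traversal
# simply mirrors the key order used above: text ref id -> source ->
# text content id -> reader -> reader version -> reading id -> statement hash.
def _example_read_stmt_nd(stmt_nd, trid, tcid, rv, rid):
    s_dict = stmt_nd[trid]['pubmed'][tcid]['reach'][rv][rid]
    for s_hash, sid_stmt_pairs in s_dict.items():
        # Each value is a set of (raw statement id, Statement-or-None) pairs
        # that share the same evidence-level hash.
        for sid, stmt in sid_stmt_pairs:
            print(s_hash, sid, stmt)
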
def test_nested_dict():
    d = NestedDict()
    print(d)
    d['A']['B']['C'] = 3
    d['B']['C'] = {'D': 2}
    print(d)
    assert d['A']['B']['C'] == 3
    assert d.get('A') == d['A']
    assert d.gets('A') == [d['A']]
    assert d.get('C') in [3, {'D': 2}]
    assert d.get('D') == 2
    assert d.get_path('C') in [(('A', 'B', 'C'), 3), (('B', 'C'), {'D': 2})]
    assert_contents_equal([str(v) for v in d.gets('C')],
                          ['3', str(d['B']['C'])])
    d.export_dict()  # Should probably test for matching contents
    assert_contents_equal(
        [str(v) for v in d.get_paths('C')],
        [str((('A', 'B', 'C'), 3)), str((('B', 'C'), d['B']['C']))])

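# The test above relies on NestedDict auto-vivifying intermediate levels on
# item access, so that d['A']['B']['C'] = 3 works without first creating
# d['A'] or d['A']['B']. The class below is NOT the project's implementation,
# only a minimal sketch of that indexing behavior to make the pattern
# concrete.
class _AutoVivifyingDict(dict):
    def __getitem__(self, key):
        # Create a nested instance on first access so chained indexing works.
        if key not in self:
            self[key] = type(self)()
        return dict.__getitem__(self, key)
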
def _get_results_file_tree(self, s3, s3_prefix):
    relevant_files = s3.list_objects(Bucket=bucket_name, Prefix=s3_prefix)
    file_tree = NestedDict()
    file_keys = [entry['Key'] for entry in relevant_files['Contents']]
    pref_path = s3_prefix.split('/')[:-1]   # avoid the trailing empty str.
    for key in file_keys:
        full_path = key.split('/')
        relevant_path = full_path[len(pref_path):]
        curr = file_tree
        for step in relevant_path:
            curr = curr[step]
        curr['key'] = key
    return file_tree

def get_s3_file_tree(s3, bucket, prefix):
    """Overcome the s3 response limit and return a NestedDict tree of paths.

    The NestedDict object also allows the user to search by the ends of a
    path. The tree mimics a file directory structure, with the leaf nodes
    being the full unbroken key. For example, 'path/to/file.txt' would be
    retrieved by

        ret['path']['to']['file.txt']['key']

    The NestedDict object returned also has the capability to get paths that
    lead to a certain value. So if you wanted all paths that lead to something
    called 'file.txt', you could use

        ret.get_paths('file.txt')

    For more details, see the NestedDict docs.
    """
    def get_some_keys(keys, marker=None):
        if marker:
            relevant_files = s3.list_objects(Bucket=bucket, Prefix=prefix,
                                             Marker=marker)
        else:
            relevant_files = s3.list_objects(Bucket=bucket, Prefix=prefix)
        keys.extend([entry['Key'] for entry in relevant_files['Contents']
                     if entry['Key'] != marker])
        return relevant_files['IsTruncated']

    file_keys = []
    marker = None
    while get_some_keys(file_keys, marker):
        marker = file_keys[-1]

    file_tree = NestedDict()
    pref_path = prefix.split('/')[:-1]   # avoid the trailing empty str.
    for key in file_keys:
        full_path = key.split('/')
        relevant_path = full_path[len(pref_path):]
        curr = file_tree
        for step in relevant_path:
            curr = curr[step]
        curr['key'] = key
    return file_tree

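# The marker loop above pages through list_objects responses by hand. With a
# current boto3 client, the same set of keys could also be collected with the
# built-in paginator; this is only an alternative sketch for comparison, not
# the approach the function above actually uses.
def _list_keys_with_paginator(s3, bucket, prefix):
    keys = []
    paginator = s3.get_paginator('list_objects')
    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
        # Pages with no matching objects have no 'Contents' entry.
        keys.extend(entry['Key'] for entry in page.get('Contents', []))
    return keys
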
def _report_timing(self, timing_info):
    # Pivot the timing info.
    idx_patt = re.compile(r'%s_(\d+)_(\d+)' % self.basename)
    job_segs = NestedDict()
    plot_set = set()
    for stage, stage_d in timing_info.items():
        # e.g. reading, statement production...
        for metric, metric_d in stage_d.items():
            # e.g. start, end, ...
            for job_name, t in metric_d.items():
                # e.g. job_basename_startIx_endIx
                job_segs[job_name][stage][metric] = t
                m = idx_patt.match(job_name)
                if m is None:
                    logger.error("Unexpectedly formatted name: %s." % job_name)
                    continue
                key = tuple([int(n) for n in m.groups()] + [job_name])
                plot_set.add(key)
    plot_list = list(plot_set)
    plot_list.sort()

    # Use this for getting the minimum and maximum.
    all_times = [dt for job in job_segs.values() for stage in job.values()
                 for metric, dt in stage.items() if metric != 'duration']
    all_start = min(all_times)
    all_end = max(all_times)

    def get_time_tuple(stage_data):
        start_seconds = (stage_data['start'] - all_start).total_seconds()
        return start_seconds, stage_data['duration'].total_seconds()

    # Make the broken barh plots.
    w = 6.5
    h = 9
    fig = plt.figure(figsize=(w, h))
    gs = plt.GridSpec(2, 1, height_ratios=[10, 1])
    ax0 = plt.subplot(gs[0])
    ytick_pairs = []
    stages = ['reading', 'statement production', 'stats']
    t = arange((all_end - all_start).total_seconds())
    counts = dict.fromkeys(['jobs'] + stages)
    for k in counts.keys():
        counts[k] = array([0 for _ in t])
    for i, job_tpl in enumerate(plot_list):
        s_ix, e_ix, job_name = job_tpl
        job_d = job_segs[job_name]
        xs = [get_time_tuple(job_d[stg]) for stg in stages]
        ys = (s_ix, (e_ix - s_ix) * 0.9)
        ytick_pairs.append(((s_ix + e_ix) / 2, '%s_%s' % (s_ix, e_ix)))
        logger.debug("Making plot for: %s" % str((job_name, xs, ys)))
        ax0.broken_barh(xs, ys, facecolors=('red', 'green', 'blue'))

        for n, stg in enumerate(stages):
            cs = counts[stg]
            start = xs[n][0]
            dur = xs[n][1]
            cs[(t > start) & (t < (start + dur))] += 1
        cs = counts['jobs']
        cs[(t > xs[0][0]) & (t < (xs[-1][0] + xs[-1][1]))] += 1

    # Format the plot
    ax0.tick_params(top='off', left='off', right='off', bottom='off',
                    labelleft='on', labelbottom='off')
    for spine in ax0.spines.values():
        spine.set_visible(False)
    total_time = (all_end - all_start).total_seconds()
    ax0.set_xlim(0, total_time)
    ax0.set_ylabel(self.basename + '_ ...')
    print(ytick_pairs)
    yticks, ylabels = zip(*ytick_pairs)
    print(yticks)
    if not self.ids_per_job:
        print([yticks[i + 1] - yticks[i] for i in range(len(yticks) - 1)])
        # Infer if we don't have it.
        spacing = median([yticks[i + 1] - yticks[i]
                          for i in range(len(yticks) - 1)])
        spacing = max(1, spacing)
    else:
        spacing = self.ids_per_job
    print(spacing)
    print(yticks[0], yticks[-1])
    ytick_range = list(arange(yticks[0], yticks[-1] + spacing, spacing))
    ylabel_filled = []
    for ytick in ytick_range:
        if ytick in yticks:
            ylabel_filled.append(ylabels[yticks.index(ytick)])
        else:
            ylabel_filled.append('FAILED')
    ax0.set_ylim(0, max(ytick_range) + spacing)
    ax0.set_yticks(ytick_range)
    ax0.set_yticklabels(ylabel_filled)

    # Plot the lower axis.
    legend_list = []
    color_map = {'jobs': 'k', 'reading': 'r', 'statement production': 'g',
                 'stats': 'b'}
    ax1 = plt.subplot(gs[1], sharex=ax0)
    for k, cs in counts.items():
        legend_list.append(k)
        ax1.plot(t, cs, color=color_map[k])
    for lbl, spine in ax1.spines.items():
        spine.set_visible(False)
    max_n = max(counts['jobs'])
    ax1.set_ylim(0, max_n + 1)
    ax1.set_xlim(0, total_time)
    yticks = list(range(0, max_n - max_n // 5, max(1, max_n // 5)))
    ax1.set_yticks(yticks + [max_n])
    ax1.set_yticklabels([str(n) for n in yticks] + ['max=%d' % max_n])
    ax1.set_ylabel('N_jobs')
    ax1.set_xlabel('Time since beginning [seconds]')

    # Make the figure borders more sensible.
    fig.tight_layout()
    img_path = 'time_figure.png'
    fig.savefig(img_path)
    self.reporter.add_image(img_path, width=w, height=h, section='Plots')
    return

def make_raw_statement_set_for_distillation():
    d = NestedDict()
    stmts = []
    target_sets = []
    bettered_sids = set()

    # Create a function which will update all possible outcome scenarios given
    # a set of some_stmts.
    def add_stmts_to_target_set(some_stmts):
        # If we don't have any target sets of statements, initialize with the
        # input statements.
        if not target_sets:
            for stmt in some_stmts:
                target_sets.append(({stmt},
                                    {stmts.index(s) for s in some_stmts
                                     if s is not stmt}))
        else:
            # Make a copy and empty the current list.
            old_target_sets = target_sets[:]
            try:  # Python 3
                target_sets.clear()
            except AttributeError:  # Python 2
                del target_sets[:]

            # Now for every previous scenario, pick a random possible "good"
            # statement, and update the corresponding duplicate trace.
            for stmt_set, dup_set in old_target_sets:
                for stmt in some_stmts:
                    # Here we consider the possibility that each of the
                    # potential valid statements may be chosen, and record
                    # that possible alteration to the set of possible
                    # histories.
                    new_set = stmt_set.copy()
                    new_set.add(stmt)
                    new_dups = dup_set.copy()
                    new_dups |= {stmts.index(s) for s in some_stmts
                                 if s is not stmt}
                    target_sets.append((new_set, new_dups))
        return target_sets

    # Create a function to handle the creation of the metadata.
    def add_content(trid, src, tcid, reader, rv_idx, rid, a, b, ev_num, copies,
                    is_target=False):
        # Add the new statements to the overall list.
        stmts.extend(__make_test_statements(a, b, reader, ev_num, copies))

        # If we are making multiple copies, the latest copies should have the
        # same overall hash. If it's not a copy, the hashes should be
        # different.
        if copies > 1:
            # The above only applies if the evidence was specified to be the
            # same; otherwise it is assumed the evidence, and therefore the
            # hash, is different.
            last_hash = stmts[-1].get_hash(shallow=False)
            sec_last_hash = stmts[-2].get_hash(shallow=False)
            if ev_num is not None:
                assert last_hash == sec_last_hash
            else:
                assert last_hash != sec_last_hash

        # Populate the provenance for the dict.
        rv = db_util.reader_versions[reader][rv_idx]
        r_dict = d[trid][src][tcid][reader][rv][rid]

        # If the evidence variation was specified, the evidence in any copies
        # is identical, and they will all have the same hash. Else, the hash
        # is different and the statements need to be iterated over.
        if ev_num is not None:
            s_hash = stmts[-1].get_hash(shallow=False)

            # Check to see if we have a matching statement yet.
            if r_dict.get(s_hash) is None:
                r_dict[s_hash] = set()

            # Set the value
            last_hash = stmts[-1].get_hash(shallow=False)
            d[trid][src][tcid][reader][rv][rid][last_hash] |= \
                {(stmts.index(s), s) for s in stmts[-copies:]}
        else:
            for s in stmts[-copies:]:
                s_hash = s.get_hash(shallow=False)
                if r_dict.get(s_hash) is None:
                    r_dict[s_hash] = set()
                d[trid][src][tcid][reader][rv][rid][s_hash].add(
                    (stmts.index(s), s)
                    )

        # If this/these statement/s is intended to be picked up, add it/them
        # to the target sets. Note that add_stmts_to_target_set mutates the
        # enclosing target_sets list in place, so no rebinding is needed.
        if is_target:
            add_stmts_to_target_set(stmts[-copies:])
        return

    # We produce statements a couple of times with an old reader version.
    # Arguments to add_content, in order:
    #   text ref id, source, text content id, reader, reader version index,
    #   reading id, agent A, agent B, distinct evidence id, number of copies,
    #   is it a target?
    add_content(1, 'pubmed', 1, 'reach', 0, 1, 'A1', 'B1', 1, 2, False)
    add_content(1, 'pubmed', 1, 'reach', 0, 1, 'A1', 'B1', 2, 1)
    add_content(1, 'pubmed', 1, 'reach', 0, 1, 'A2', 'B2', 1, 1)

    # Do it again for a new reader version.
    add_content(1, 'pubmed', 1, 'reach', 1, 2, 'A1', 'B1', 1, 2)
    add_content(1, 'pubmed', 1, 'reach', 1, 2, 'A1', 'B1', 2, 1)

    # Add some for sparser.
    add_content(1, 'pubmed', 1, 'sparser', 1, 3, 'A1', 'B1', 1, 2)
    add_content(1, 'pubmed', 1, 'sparser', 1, 3, 'A2', 'B2', 1, 1)

    # Now add statements from another source.
    add_content(1, 'pmc_oa', 2, 'reach', 0, 4, 'A1', 'B1', 1, 2)
    add_content(1, 'pmc_oa', 2, 'reach', 0, 4, 'A1', 'B1', 2, 1)
    add_content(1, 'pmc_oa', 2, 'reach', 0, 4, 'A2', 'B2', 1, 1)

    # All the statements up until now will be skipped, if all goes well.
    bettered_sids |= set(range(len(stmts)))

    # ...and again for a new reader version.
    add_content(1, 'pmc_oa', 2, 'reach', 1, 4, 'A1', 'B1', 1, 2, True)
    add_content(1, 'pmc_oa', 2, 'reach', 1, 4, 'A1', 'B1', 2, 1, True)
    add_content(1, 'pmc_oa', 2, 'reach', 1, 4, 'A2', 'B2', 1, 1, True)
    add_content(1, 'pmc_oa', 2, 'reach', 1, 4, 'A3', 'B3', 1, 1, True)

    # Add some results from sparser.
    add_content(1, 'pmc_oa', 2, 'sparser', 1, 5, 'A1', 'B1', 1, 2, True)
    add_content(1, 'pmc_oa', 2, 'sparser', 1, 5, 'A2', 'B2', 1, 1, True)

    # Add some content for another text ref.
    add_content(2, 'pmc_oa', 3, 'sparser', 1, 6, 'A3', 'B3', 1, 1, True)
    add_content(2, 'manuscripts', 4, 'sparser', 1, 7, 'A3', 'B3', 1, 1)

    # This last statement should also be skipped, if all goes well.
    bettered_sids.add(len(stmts) - 1)

    return d, stmts, target_sets, bettered_sids

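# A hypothetical consumer of the fixture above, only to show the shape of the
# returned values: 'd' mirrors the provenance tree used elsewhere in this
# module, 'stmts' is the flat list of raw statements, 'target_sets' lists the
# acceptable (kept statements, duplicate indices) outcomes, and
# 'bettered_sids' holds indices expected to be superseded by newer reader
# versions. The checks here are illustrative, not the project's actual tests.
def _example_check_distillation_fixture():
    d, stmts, target_sets, bettered_sids = \
        make_raw_statement_set_for_distillation()
    assert all(0 <= idx < len(stmts) for idx in bettered_sids)
    for kept_stmts, duplicate_ids in target_sets:
        # Duplicate indices always refer back into the flat statement list.
        assert duplicate_ids <= set(range(len(stmts)))
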