Example 1
def get_s3_file_tree(s3, bucket, prefix):
    """Overcome s3 response limit and return NestedDict tree of paths.

    The NestedDict object also allows the user to search by the ends of a path.

    The tree mimics a file directory structure, with the leaf nodes being the
    full unbroken key. For example, 'path/to/file.txt' would be retrieved by

        ret['path']['to']['file.txt']['key']

    The NestedDict object returned also has the capability to get paths that
    lead to a certain value. So if you wanted all paths that lead to something
    called 'file.txt', you could use

        ret.get_paths('file.txt')

    For more details, see the NestedDict docs.
    """
    file_tree = NestedDict()
    pref_path = prefix.split('/')[:-1]  # avoid the trailing empty str.
    for key in iter_s3_keys(s3, bucket, prefix):
        full_path = key.split('/')
        relevant_path = full_path[len(pref_path):]
        curr = file_tree
        for step in relevant_path:
            curr = curr[step]
        curr['key'] = key
    return file_tree
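
A minimal usage sketch of the function above, assuming a configured boto3 client; the bucket name and prefix are placeholders, and the lookups just exercise what the docstring describes.

import boto3

s3 = boto3.client('s3')
# Hypothetical bucket and prefix, for illustration only.
tree = get_s3_file_tree(s3, 'my-bucket', 'path/')

# The 'path' component of the prefix is stripped from the tree, so a key
# like 'path/to/file.txt' is reached at the leaf's 'key' entry:
full_key = tree['to']['file.txt']['key']   # -> 'path/to/file.txt'

# Reverse lookup: every (path, value) pair whose final step is 'file.txt'.
matches = tree.get_paths('file.txt')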
Example 2
 def __init__(self):
     self.d = NestedDict()
     self.stmts = []
     self.target_sets = []
     self.bettered_sids = set()
     self.links = set()
     return
Example 3
File: aws.py Project: steppi/indra
def get_s3_file_tree(s3, bucket, prefix, date_cutoff=None, after=True,
                     with_dt=False):
    """Overcome s3 response limit and return NestedDict tree of paths.

    The NestedDict object also allows the user to search by the ends of a path.

    The tree mimics a file directory structure, with the leaf nodes being the
    full unbroken key. For example, 'path/to/file.txt' would be retrieved by

        ret['path']['to']['file.txt']['key']

    The NestedDict object returned also has the capability to get paths that
    lead to a certain value. So if you wanted all paths that lead to something
    called 'file.txt', you could use

        ret.get_paths('file.txt')

    For more details, see the NestedDict docs.

    Parameters
    ----------
    s3 : boto3.client.S3
        A boto3.client.S3 instance
    bucket : str
        The name of the bucket to list objects in
    prefix : str
        The prefix filtering of the objects for list
    date_cutoff : str|datetime.datetime
        A datestring of format %Y(-%m-%d-%H-%M-%S) or a datetime.datetime
        object. The date is assumed to be in UTC. By default no filtering
        is done. Default: None.
    after : bool
        If True, only return objects after the given date cutoff.
        Otherwise, return objects before. Default: True
    with_dt : bool
        If True, yield a tuple (key, datetime.datetime(LastModified)) of
        the s3 Key and the object's LastModified date as a
        datetime.datetime object, only yield s3 key otherwise.
        Default: False.

    Returns
    -------
    NestedDict
        A file tree represented as a NestedDict
    """
    file_tree = NestedDict()
    pref_path = prefix.split('/')[:-1]   # avoid the trailing empty str.
    for k in iter_s3_keys(s3, bucket, prefix, date_cutoff, after, with_dt):
        if with_dt:
            key, dt = k
        else:
            key, dt = k, None
        full_path = key.split('/')
        relevant_path = full_path[len(pref_path):]
        curr = file_tree
        for step in relevant_path:
            curr = curr[step]
        curr['key'] = k
    return file_tree
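
A hedged sketch of the date filtering and with_dt options; the bucket, prefix, and cutoff date are placeholders. With with_dt=True, note that the leaf stores the full (key, LastModified) tuple, since the code above assigns curr['key'] = k rather than the bare key.

from datetime import datetime
import boto3

s3 = boto3.client('s3')
# Placeholder cutoff: only keys modified after midnight UTC on 2020-01-01.
cutoff = datetime(2020, 1, 1)
tree = get_s3_file_tree(s3, 'my-bucket', 'path/', date_cutoff=cutoff,
                        after=True, with_dt=True)

# Each leaf's 'key' entry is then a (key, datetime) tuple.
s3_key, last_modified = tree['to']['file.txt']['key']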
Example 4
def _get_reading_statement_dict(db, clauses=None, get_full_stmts=True):
    """Get a nested dict of statements, keyed by ref, content, and reading."""
    # Construct the query for metadata from the database.
    q = (db.session.query(db.TextRef, db.TextContent.id,
                          db.TextContent.source, db.Reading.id,
                          db.Reading.reader_version, db.RawStatements.id,
                          db.RawStatements.json)
         .filter(db.RawStatements.reading_id == db.Reading.id,
                 db.Reading.text_content_id == db.TextContent.id,
                 db.TextContent.text_ref_id == db.TextRef.id))
    if clauses:
        q = q.filter(*clauses)

    # Prime some counters.
    num_duplicate_evidence = 0
    num_unique_evidence = 0

    # Populate a dict with all the data.
    stmt_nd = NestedDict()
    for tr, tcid, src, rid, rv, sid, sjson in q.yield_per(1000):
        # Back out the reader name.
        for reader, rv_list in reader_versions.items():
            if rv in rv_list:
                break
        else:
            raise Exception("rv %s not recognized." % rv)

        # Get the json for comparison and/or storage
        stmt_json = json.loads(sjson.decode('utf8'))
        stmt = Statement._from_json(stmt_json)
        _set_evidence_text_ref(stmt, tr)

        # Hash the combined stmt and evidence matches key.
        stmt_hash = stmt.get_hash(shallow=False)

        # For convenience get the endpoint statement dict
        s_dict = stmt_nd[tr.id][src][tcid][reader][rv][rid]

        # Initialize the value to a set, and count duplicates
        if stmt_hash not in s_dict.keys():
            s_dict[stmt_hash] = set()
            num_unique_evidence += 1
        else:
            num_duplicate_evidence += 1

        # Either store the statement, or the statement id.
        if get_full_stmts:
            s_dict[stmt_hash].add((sid, stmt))
        else:
            s_dict[stmt_hash].add((sid, None))

    # Report on the results.
    print("Found %d relevant text refs with statements." % len(stmt_nd))
    print("number of statement exact duplicates: %d" % num_duplicate_evidence)
    print("number of unique statements: %d" % num_unique_evidence)
    return stmt_nd
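
For orientation, the nesting order of the returned dict (built by the line s_dict = stmt_nd[tr.id][src][tcid][reader][rv][rid] above) is sketched here; the index names are placeholders standing in for concrete database ids.

# Shape of the returned NestedDict (index names are placeholders):
#
#   stmt_nd[text_ref_id][source][text_content_id][reader][reader_version][reading_id]
#          [stmt_hash] -> {(raw_statement_id, Statement or None), ...}
#
# so the innermost value is the set of (raw statement id, statement) pairs that
# share one evidence-level hash for one reading of one piece of text content.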
Example 5
def test_nested_dict():
    d = NestedDict()
    print(d)
    d['A']['B']['C'] = 3
    d['B']['C'] = {'D': 2}
    print(d)
    assert d['A']['B']['C'] == 3
    assert d.get('A') == d['A']
    assert d.gets('A') == [d['A']]
    assert d.get('C') in [3, {'D': 2}]
    assert d.get('D') == 2
    assert d.get_path('C') in [(('A', 'B', 'C'), 3), (('B', 'C'), {'D': 2})]
    assert_contents_equal([str(v) for v in d.gets('C')],
                          ['3', str(d['B']['C'])])
    d.export_dict()  # Should probably test for matching contents
    assert_contents_equal(
        [str(v) for v in d.get_paths('C')],
        [str((('A', 'B', 'C'), 3)),
         str((('B', 'C'), d['B']['C']))])
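
NestedDict here is INDRA's utility class; the auto-vivification the test relies on (d['A']['B']['C'] = 3 creating the intermediate levels) can be approximated with a small dict subclass. This is a simplified stand-in, not the actual INDRA implementation, and it omits get, gets, get_path(s), and export_dict.

class AutoNestingDict(dict):
    """Simplified stand-in: missing keys create nested dicts on access."""
    def __missing__(self, key):
        value = self[key] = AutoNestingDict()
        return value

d = AutoNestingDict()
d['A']['B']['C'] = 3  # intermediate levels are created on the fly
assert d['A']['B']['C'] == 3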
Example 6
 def _get_results_file_tree(self, s3, s3_prefix):
     relevant_files = s3.list_objects(Bucket=bucket_name, Prefix=s3_prefix)
     file_tree = NestedDict()
     file_keys = [entry['Key'] for entry in relevant_files['Contents']]
     pref_path = s3_prefix.split('/')[:-1]  # avoid the trailing empty str.
     for key in file_keys:
         full_path = key.split('/')
         relevant_path = full_path[len(pref_path):]
         curr = file_tree
         for step in relevant_path:
             curr = curr[step]
         curr['key'] = key
     return file_tree
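
Note that this variant makes a single list_objects call, which S3 caps at 1,000 keys per response, so larger result sets are silently truncated; the paginated versions in Examples 1, 3, and 7 exist to work around that limit. (bucket_name here is presumably a module-level constant in the original code.)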
Example 7
def get_s3_file_tree(s3, bucket, prefix):
    """Overcome s3 response limit and return NestedDict tree of paths.

    The NestedDict object also allows the user to search by the ends of a path.

    The tree mimics a file directory structure, with the leaf nodes being the
    full unbroken key. For example, 'path/to/file.txt' would be retrieved by

        ret['path']['to']['file.txt']['key']

    The NestedDict object returned also has the capability to get paths that
    lead to a certain value. So if you wanted all paths that lead to something
    called 'file.txt', you could use

        ret.get_paths('file.txt')

    For more details, see the NestedDict docs.
    """
    def get_some_keys(keys, marker=None):
        if marker:
            relevant_files = s3.list_objects(Bucket=bucket,
                                             Prefix=prefix,
                                             Marker=marker)
        else:
            relevant_files = s3.list_objects(Bucket=bucket, Prefix=prefix)
        keys.extend([
            entry['Key'] for entry in relevant_files['Contents']
            if entry['Key'] != marker
        ])
        return relevant_files['IsTruncated']

    file_keys = []
    marker = None
    while get_some_keys(file_keys, marker):
        marker = file_keys[-1]

    file_tree = NestedDict()
    pref_path = prefix.split('/')[:-1]  # avoid the trailing empty str.
    for key in file_keys:
        full_path = key.split('/')
        relevant_path = full_path[len(pref_path):]
        curr = file_tree
        for step in relevant_path:
            curr = curr[step]
        curr['key'] = key
    return file_tree
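
The manual Marker loop above can also be written with boto3's built-in paginator, which handles the truncation bookkeeping itself. A minimal sketch, with the bucket and prefix as placeholders:

import boto3

s3 = boto3.client('s3')
paginator = s3.get_paginator('list_objects')

file_keys = []
for page in paginator.paginate(Bucket='my-bucket', Prefix='path/'):
    file_keys.extend(entry['Key'] for entry in page.get('Contents', []))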
Example 8
    def _report_timing(self, timing_info):
        # Pivot the timing info.
        idx_patt = re.compile(r'%s_(\d+)_(\d+)' % self.basename)
        job_segs = NestedDict()
        plot_set = set()
        for stage, stage_d in timing_info.items():
            # e.g. reading, statement production...
            for metric, metric_d in stage_d.items():
                # e.g. start, end, ...
                for job_name, t in metric_d.items():
                    # e.g. job_basename_startIx_endIx
                    job_segs[job_name][stage][metric] = t
                    m = idx_patt.match(job_name)
                    if m is None:
                        logger.error("Unexpectedly formatted name: %s." %
                                     job_name)
                        continue
                    key = tuple([int(n) for n in m.groups()] + [job_name])
                    plot_set.add(key)
        plot_list = list(plot_set)
        plot_list.sort()

        # Use this for getting the minimum and maximum.
        all_times = [
            dt for job in job_segs.values() for stage in job.values()
            for metric, dt in stage.items() if metric != 'duration'
        ]
        all_start = min(all_times)
        all_end = max(all_times)

        def get_time_tuple(stage_data):
            start_seconds = (stage_data['start'] - all_start).total_seconds()
            return start_seconds, stage_data['duration'].total_seconds()

        # Make the broken barh plots.
        w = 6.5
        h = 9
        fig = plt.figure(figsize=(w, h))
        gs = plt.GridSpec(2, 1, height_ratios=[10, 1])
        ax0 = plt.subplot(gs[0])
        ytick_pairs = []
        stages = ['reading', 'statement production', 'stats']
        t = arange((all_end - all_start).total_seconds())
        counts = dict.fromkeys(['jobs'] + stages)
        for k in counts.keys():
            counts[k] = array([0 for _ in t])
        for i, job_tpl in enumerate(plot_list):
            s_ix, e_ix, job_name = job_tpl
            job_d = job_segs[job_name]
            xs = [get_time_tuple(job_d[stg]) for stg in stages]
            ys = (s_ix, (e_ix - s_ix) * 0.9)
            ytick_pairs.append(((s_ix + e_ix) / 2, '%s_%s' % (s_ix, e_ix)))
            logger.debug("Making plot for: %s" % str((job_name, xs, ys)))
            ax0.broken_barh(xs, ys, facecolors=('red', 'green', 'blue'))

            for n, stg in enumerate(stages):
                cs = counts[stg]
                start = xs[n][0]
                dur = xs[n][1]
                cs[(t > start) & (t < (start + dur))] += 1
            cs = counts['jobs']
            cs[(t > xs[0][0]) & (t < (xs[-1][0] + xs[-1][1]))] += 1

        # Format the plot
        ax0.tick_params(top='off',
                        left='off',
                        right='off',
                        bottom='off',
                        labelleft='on',
                        labelbottom='off')
        for spine in ax0.spines.values():
            spine.set_visible(False)
        total_time = (all_end - all_start).total_seconds()
        ax0.set_xlim(0, total_time)
        ax0.set_ylabel(self.basename + '_ ...')
        print(ytick_pairs)
        yticks, ylabels = zip(*ytick_pairs)
        print(yticks)
        if not self.ids_per_job:
            print([yticks[i + 1] - yticks[i] for i in range(len(yticks) - 1)])
            # Infer if we don't have it.
            spacing = median(
                [yticks[i + 1] - yticks[i] for i in range(len(yticks) - 1)])
            spacing = max(1, spacing)
        else:
            spacing = self.ids_per_job
        print(spacing)
        print(yticks[0], yticks[-1])
        ytick_range = list(arange(yticks[0], yticks[-1] + spacing, spacing))
        ylabel_filled = []
        for ytick in ytick_range:
            if ytick in yticks:
                ylabel_filled.append(ylabels[yticks.index(ytick)])
            else:
                ylabel_filled.append('FAILED')
        ax0.set_ylim(0, max(ytick_range) + spacing)
        ax0.set_yticks(ytick_range)
        ax0.set_yticklabels(ylabel_filled)

        # Plot the lower axis.
        legend_list = []
        color_map = {
            'jobs': 'k',
            'reading': 'r',
            'statement production': 'g',
            'stats': 'b'
        }
        ax1 = plt.subplot(gs[1], sharex=ax0)
        for k, cs in counts.items():
            legend_list.append(k)
            ax1.plot(t, cs, color=color_map[k])
        for lbl, spine in ax1.spines.items():
            spine.set_visible(False)
        max_n = max(counts['jobs'])
        ax1.set_ylim(0, max_n + 1)
        ax1.set_xlim(0, total_time)
        yticks = list(range(0, max_n - max_n // 5, max(1, max_n // 5)))
        ax1.set_yticks(yticks + [max_n])
        ax1.set_yticklabels([str(n) for n in yticks] + ['max=%d' % max_n])
        ax1.set_ylabel('N_jobs')
        ax1.set_xlabel('Time since beginning [seconds]')

        # Make the figure borders more sensible.
        fig.tight_layout()
        img_path = 'time_figure.png'
        fig.savefig(img_path)
        self.reporter.add_image(img_path, width=w, height=h, section='Plots')
        return
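
The core plotting call here is matplotlib's broken_barh, which takes a list of (start, duration) pairs plus a single (ymin, height) pair. A tiny, self-contained sketch of that input format, with made-up times:

import matplotlib.pyplot as plt

fig, ax = plt.subplots()
# xranges: (start, duration) pairs; yrange: (ymin, height).
ax.broken_barh([(0, 30), (40, 20), (65, 10)], (10, 8),
               facecolors=('red', 'green', 'blue'))
ax.set_xlabel('Time since beginning [seconds]')
fig.savefig('broken_barh_example.png')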
Example 9
def make_raw_statement_set_for_distillation():
    d = NestedDict()
    stmts = []
    target_sets = []
    bettered_sids = set()

    # Create a function which will update all possible outcome scenarios given a
    # set of some_stmts.
    def add_stmts_to_target_set(some_stmts):
        # If we don't have any target sets of statements, initialize with the
        # input statements.
        if not target_sets:
            for stmt in some_stmts:
                target_sets.append(({stmt},
                                    {stmts.index(s) for s in some_stmts
                                     if s is not stmt}))
        else:
            # Make a copy and empty the current list.
            old_target_sets = target_sets[:]
            try: # Python 3
                target_sets.clear()
            except AttributeError: # Python 2
                del target_sets[:]

            # Now for every previous scenario, pick a random possible "good"
            # statement, update the corresponding duplicate trace.
            for stmt_set, dup_set in old_target_sets:
                for stmt in some_stmts:
                    # Here we consider the possibility that each of the
                    # potential valid statements may be chosen, and record that
                    # possible alteration to the set of possible histories.
                    new_set = stmt_set.copy()
                    new_set.add(stmt)
                    new_dups = dup_set.copy()
                    new_dups |= {stmts.index(s) for s in some_stmts
                                 if s is not stmt}
                    target_sets.append((new_set, new_dups))
        return target_sets

    # Create a function to handle the creation of the metadata.
    def add_content(trid, src, tcid, reader, rv_idx, rid, a, b, ev_num, copies,
                    is_target=False):
        # Add the new statements to the over-all list.
        stmts.extend(__make_test_statements(a, b, reader, ev_num, copies))

        # If we are making multiple copies, the latest copies should have the
        # same overall hash. If it's not a copy, the hashes should be different.
        if copies > 1:
            # The above only applies if the evidence was specified to be the
            # same, otherwise it assumed the evidence, and therefore the hash,
            # is different.
            last_hash = stmts[-1].get_hash(shallow=False)
            sec_last_hash = stmts[-2].get_hash(shallow=False)
            if ev_num is not None:
                assert last_hash == sec_last_hash
            else:
                assert last_hash != sec_last_hash

        # Populate the provenance for the dict.
        rv = db_util.reader_versions[reader][rv_idx]
        r_dict = d[trid][src][tcid][reader][rv][rid]

        # If the evidence variation was specified, the evidence in any copies is
        # identical, and they will all have the same hash. Else, the hash is
        # different and the statements need to be iterated over.
        if ev_num is not None:
            s_hash = stmts[-1].get_hash(shallow=False)

            # Check to see if we have a matching statement yet.
            if r_dict.get(s_hash) is None:
                r_dict[s_hash] = set()

            # Set the value
            last_hash = stmts[-1].get_hash(shallow=False)
            d[trid][src][tcid][reader][rv][rid][last_hash] |= \
                {(stmts.index(s), s) for s in stmts[-copies:]}
        else:
            for s in stmts[-copies:]:
                s_hash = s.get_hash(shallow=False)
                if r_dict.get(s_hash) is None:
                    r_dict[s_hash] = set()
                d[trid][src][tcid][reader][rv][rid][s_hash].add(
                    (stmts.index(s), s)
                    )

        # If this/these statement/s is intended to be picked up, add it/them to
        # the target sets.
        if is_target:
            # add_stmts_to_target_set updates the enclosing target_sets in place.
            add_stmts_to_target_set(stmts[-copies:])
        return

    # We produce statements a couple of times with an old reader version.
    #           trid         tcid        reader vrsn idx   distinct evidence id
    #           |  source    |  reader   |  reading id     |  number of copies
    #           |  |         |  |        |  |  Agents      |  |  Is it a target?
    add_content(1, 'pubmed', 1, 'reach', 0, 1, 'A1', 'B1', 1, 2, False)
    add_content(1, 'pubmed', 1, 'reach', 0, 1, 'A1', 'B1', 2, 1)
    add_content(1, 'pubmed', 1, 'reach', 0, 1, 'A2', 'B2', 1, 1)

    # Do it again for a new reader version.
    add_content(1, 'pubmed', 1, 'reach', 1, 2, 'A1', 'B1', 1, 2)
    add_content(1, 'pubmed', 1, 'reach', 1, 2, 'A1', 'B1', 2, 1)

    # Add some for sparser.
    add_content(1, 'pubmed', 1, 'sparser', 1, 3, 'A1', 'B1', 1, 2)
    add_content(1, 'pubmed', 1, 'sparser', 1, 3, 'A2', 'B2', 1, 1)

    # Now add statements from another source.
    add_content(1, 'pmc_oa', 2, 'reach', 0, 4, 'A1', 'B1', 1, 2)
    add_content(1, 'pmc_oa', 2, 'reach', 0, 4, 'A1', 'B1', 2, 1)
    add_content(1, 'pmc_oa', 2, 'reach', 0, 4, 'A2', 'B2', 1, 1)

    # All the statements up until now will be skipped, if all goes well.
    bettered_sids |= set(range(len(stmts)))

    # ...and again for a new reader version.
    add_content(1, 'pmc_oa', 2, 'reach', 1, 4, 'A1', 'B1', 1, 2, True)
    add_content(1, 'pmc_oa', 2, 'reach', 1, 4, 'A1', 'B1', 2, 1, True)
    add_content(1, 'pmc_oa', 2, 'reach', 1, 4, 'A2', 'B2', 1, 1, True)
    add_content(1, 'pmc_oa', 2, 'reach', 1, 4, 'A3', 'B3', 1, 1, True)

    # Add some results from sparser
    add_content(1, 'pmc_oa', 2, 'sparser', 1, 5, 'A1', 'B1', 1, 2, True)
    add_content(1, 'pmc_oa', 2, 'sparser', 1, 5, 'A2', 'B2', 1, 1, True)

    # Add some content for another text ref.
    add_content(2, 'pmc_oa', 3, 'sparser', 1, 6, 'A3', 'B3', 1, 1, True)
    add_content(2, 'manuscripts', 4, 'sparser', 1, 7, 'A3', 'B3', 1, 1)

    # This last statement should also be skipped, if all goes well.
    bettered_sids.add(len(stmts) - 1)

    return d, stmts, target_sets, bettered_sids
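
A sketch of how this fixture might be unpacked in a test; the distillation step itself is not shown here, so only the fixture's own structure is checked.

d, stmts, target_sets, bettered_sids = make_raw_statement_set_for_distillation()

# d mirrors the provenance nesting used elsewhere:
#   d[trid][src][tcid][reader][rv][rid][stmt_hash] -> {(stmt_index, Statement), ...}
# target_sets enumerates acceptable (kept statements, duplicate indices) outcomes,
# and bettered_sids are indices expected to be superseded by newer reader versions.
assert all(len(kept) > 0 for kept, _ in target_sets)
assert bettered_sids <= set(range(len(stmts)))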