Example #1
from datetime import datetime

def all_revisions(revisions):
    # TODO: stats by editor (top %, 5+ edits), by date (last 30 days), length stats
    if revisions:
        now = datetime.utcnow()
        first_edit_age = now - revisions[0]['rev_parsed_date']
        latest_age = now - revisions[-1]['rev_parsed_date']
        ret = {
            'all': set_info(revisions),
            '2_days': set_info(newer_than(2, revisions)),
            '30_days': set_info(newer_than(30, revisions)),
            '90_days': set_info(newer_than(90, revisions)),
            '365_days': set_info(newer_than(365, revisions)),
            'latest_date': revisions[-1]['rev_parsed_date'].isoformat(),
            'latest_age': latest_age.total_seconds(),
            'first_date': revisions[0]['rev_parsed_date'].isoformat(),
            'first_age': first_edit_age.total_seconds(),
            'interval': dist_stats(get_time_diffs(revisions))
        }
    else:
        # no revisions: return the same keys with empty values
        ret = dict.fromkeys(['all', '2_days', '30_days', '90_days', '365_days',
                             'latest_date', 'latest_age', 'first_date',
                             'first_age', 'interval'], '')
    return ret
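The newer_than() helper is not shown in this example; a minimal sketch consistent with how it is called above, assuming 'rev_parsed_date' is a naive UTC datetime:

from datetime import datetime, timedelta

def newer_than(days, revisions):
    # keep only revisions made within the last `days` days
    cutoff = datetime.utcnow() - timedelta(days=days)
    return [rev for rev in revisions if rev['rev_parsed_date'] > cutoff]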
Example #2
def section_stats(headers):
    hs = [h for h in headers if get_text(h) != 'Contents']
    # how not to write Python: ['h'+str(i) for i in range(1, 8)]
    # (note: HTML only defines h1-h6, so the 'h7' entry never matches)
    all_headers = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'h7']
    totals = []
    for header in hs:
        if header.getnext() is not None:  # TODO: the next item after an h1 is #bodyContents div
            pos = header.getnext()
            text = ''
            while pos.tag not in all_headers:
                text += ' ' + get_text(pos)  # TODO: the references section may skew the no. words under an h2
                if pos.getnext() is not None:
                    pos = pos.getnext()
                else:
                    break
            totals.append((get_text(header).replace('[edit] ', ''), len(text.split())))
    # totals holds (section title, word count) pairs
    return {
        'header': dist_stats([len(title.split()) for title, _ in totals]),
        'text': dist_stats([count for _, count in totals]),
    }
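The headers handled here behave like lxml elements (.tag, getnext()); get_text() is defined elsewhere in the project. A minimal sketch of it, assuming lxml.html elements:

def get_text(elem):
    # collapse the element's text content into one whitespace-normalized string
    return ' '.join(elem.text_content().split())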
Example #3
def set_info(revisions):
    editor_counts = get_editor_counts(revisions)
    sorted_editor_counts = sorted(editor_counts.items(), key=lambda kv: kv[1], reverse=True)
    sorted_editor_bytes = sorted(get_editor_bytes(revisions).items(), key=lambda kv: kv[1], reverse=True)
    abs_byte_sum = sum(abs(x['rev_diff']) for x in revisions)

    return {
        'count': len(revisions),
        'minor_count': int(sum(rev['rev_minor_edit'] for rev in revisions)),
        'byte_count': sum(rev['rev_diff'] for rev in revisions),
        'by_day': dist_stats(edits_by_day(revisions)),
        'ip_edit_count': len([rev for rev in revisions if rev['rev_user'] == 0]),
        'est_revert_count': len([rev for rev in revisions if 'revert' in rev['rev_comment'].lower()]),
        'blank_count': len([x for x in revisions if x['rev_len'] == 0]),
        'deleted_count': len([x for x in revisions if x['rev_deleted'] > 0]),
        'abs_byte': dist_stats([abs(rev['rev_diff']) for rev in revisions]) if revisions else {},
        'ed_returning': len([c for c in editor_counts.values() if c > RETURNING_ED_THRESHOLD]),
        'ed_unique': len(editor_counts),
        'ed_top_20': get_top_percent_editors(.20, sorted_editor_counts, len(revisions)),
        'ed_top_5': get_top_percent_editors(.05, sorted_editor_counts, len(revisions)),
        'ed_top_20_bytes': get_top_percent_editors(.20, sorted_editor_bytes, abs_byte_sum),
        'ed_top_5_bytes': get_top_percent_editors(.05, sorted_editor_bytes, abs_byte_sum),
    }
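get_top_percent_editors() is not shown; a hypothetical reconstruction consistent with the calls above, returning the share of `total` (edits or bytes) contributed by the top `fraction` of editors in the descending (editor, count) list:

import math

def get_top_percent_editors(fraction, sorted_pairs, total):
    if not sorted_pairs or not total:
        return 0.0
    # at least one editor always counts as the "top" group
    top_n = max(1, int(math.ceil(len(sorted_pairs) * fraction)))
    return sum(count for _, count in sorted_pairs[:top_n]) / float(total)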
Example #4
dom_data = cast_table(in_data, attr_selector='d_')
dom_stats = Orange.statistics.basic.Domain(dom_data)
new_attrs = []
for attr in dom_data.domain.features:
    attr_c = Orange.feature.Continuous(attr.name + "_n")
    attr_c.getValueFrom = Orange.classification.ClassifierFromVar(whichVar=attr)
    transformer = Orange.data.utils.NormalizeContinuous()
    attr_c.getValueFrom.transformer = transformer
    transformer.average = dom_stats[attr].avg
    transformer.span = dom_stats[attr].dev
    new_attrs.append(attr_c)

new_domain = Orange.data.Domain(new_attrs, dom_data.domain.classVar)
norm_dom_data = Orange.data.Table(new_domain, dom_data)

fa_res = fa_node.execute(norm_dom_data.to_numpy()[0])
out_data = Table(fa_node.A)  # Table is assumed to be Orange.data.Table, imported elsewhere

from stats import dist_stats
in_domain = norm_dom_data.domain
LATENT_COUNT = min(len(in_domain.attributes) // 2, len(fa_node.A))
latent_attrs = []
weights = fa_node.A.transpose()
for i in range(LATENT_COUNT):
    cur_weights = weights[i]
    abs_stats = dist_stats([abs(x) for x in cur_weights])
    mean_abs = abs_stats['mean']
    dev_cutoff = abs_stats['std_dev']
    # keep features whose |weight| is more than one std. dev. above the mean |weight|
    latent_attrs.append([(in_domain[j], w) for j, w in enumerate(cur_weights)
                         if abs(w) > mean_abs + dev_cutoff])
    
#sorted([(a,b) for a,b in zip(in_domain.features, fa_node.sigma)], key=lambda x: x[1], reverse=True)
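fa_node is never constructed in this snippet. With MDP's factor-analysis node, the missing setup would plausibly look like this (an assumption, not shown in the source):

import mdp

# train a factor-analysis node on the normalized table's numpy view so that
# fa_node.execute() and the mixing matrix fa_node.A are available above
fa_node = mdp.nodes.FANode()
fa_node.train(norm_dom_data.to_numpy()[0])
fa_node.stop_training()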
Example #5
def element_words_dist(elem):
    # returns a function that, given a selector f, computes word-count
    # distribution stats over the elements f(elem) yields
    return lambda f: dist_stats([len(get_text(navbox).split()) for navbox in f(elem)])
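A hypothetical call site (page_root and the CSS selector below are assumptions, not from the source):

# distribution of words per navbox on one parsed page
words_dist = element_words_dist(page_root)
navbox_stats = words_dist(lambda elem: elem.cssselect('table.navbox'))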