Esempio n. 1
0
def partition_ids(old_true_positive,
                  old_true_negative,
                  classifier=event_classifier.classified_event_from_fb_event):
    success = set()
    fail = set()
    a = time.time()
    for i, (id, fb_event) in enumerate(processing.all_fb_data(combined_ids)):
        if not i % 10000: print 'Processing ', i
        if i < START_EVENT:
            continue
        if END_EVENT and i > END_EVENT:
            break
        result = classifier(fb_event)
        result.classify()
        if result.is_dance_event():
            success.add(id)
            if id in old_true_negative:
                handle_new_false_positive(id, result)
        else:
            # To print out failures, to see if there's any way we can better detect them
            #if id in good_ids:
            #    print id, fb_event['info'].get('name')
            #    print result.found_dance_matches, result.found_event_matches, result.found_wrong_matches
            fail.add(id)
            if id in old_true_positive:
                handle_new_false_negative(id, result)
    print 'Time per event: %s' % (1.0 * (time.time() - a) /
                                  (max(END_EVENT, i) - START_EVENT))
    return fail, success
def partition_ids(old_true_positive, old_true_negative, classifier=event_classifier.classified_event_from_fb_event):
    success = set()
    fail = set()
    a = time.time()
    for i, (id, fb_event) in enumerate(processing.all_fb_data(combined_ids)):
        if not i % 10000: print 'Processing ', i
        if i < START_EVENT:
            continue
        if END_EVENT and i > END_EVENT:
            break
        result = classifier(fb_event)
        result.classify()
        if result.is_dance_event():
            success.add(id)
            if id in old_true_negative:
                handle_new_false_positive(id, result)
        else:
            # To print out failures, to see if there's any way we can better detect them
            #if id in good_ids:
            #    print id, fb_event['info'].get('name')
            #    print result.found_dance_matches, result.found_event_matches, result.found_wrong_matches
            fail.add(id)
            if id in old_true_positive:
                handle_new_false_negative(id, result)
    print 'Time per event: %s' % (1.0 * (time.time() - a) / (max(END_EVENT, i) - START_EVENT))
    return fail, success
def partition_ids():
    """Run the battle auto-classifier over the potential-events CSV.

    Every event read from 'local_data/PotentialFBEvents.csv' is classified and
    passed, together with its raw data and the ``is_battle`` result, to
    ``success`` when ``result[0]`` is truthy and to ``failure`` otherwise.
    Nothing is returned.
    """
    events = processing.all_fb_data([], filename='local_data/PotentialFBEvents.csv')
    for _event_id, fb_event in events:
        classified = event_classifier.get_classified_event(fb_event)
        verdict = event_auto_classifier.is_battle(classified)
        report = success if verdict[0] else failure
        report(classified, fb_event, verdict)
Esempio n. 4
0
def mp_partition_ids(ids, classifier=lambda x: False):
    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count(), initializer=init_worker)
    print "Generating data..."
    data = [(classifier, x) for x in processing.all_fb_data(ids)]
    print "Running multiprocessing classifier..."
    async_results = pool.map_async(mp_classify, data, chunksize=100)
    # We need to specify a timeout to get(), so that KeyboardInterrupt gets delivered properly.
    results = async_results.get(9999999)
    print "Multiprocessing classifier completed."
    successes = set(x[1] for x in results if x[0])
    fails = set(x[1] for x in results if not x[0])
    return successes, fails
Esempio n. 5
0
def partition_ids(ids, classifier=lambda x: False):
    successes = set()
    fails = set()
    for i, (id, fb_event) in enumerate(processing.all_fb_data(ids)):
        if not i % 10000: print 'Processing ', i
        result = classifier(fb_event)
        if result:
            successes.add(id)
            #if id not in good_ids:
            #    # false positive
            #    add_counts(false_positive_counts, fb_event)
        else:
            fails.add(id)
            #if id not in bad_ids:
            #    # false negative
            #    add_counts(false_negative_counts, fb_event)
        #if id in good_ids:
        #    add_counts(good_counts, fb_event)
        #else:
        #    add_counts(bad_counts, fb_event)
    return successes, fails
Esempio n. 6
0
                rules[name] = var_value
    return rules


# These are the regexes that will be our feature detectors
named_rules = {}
named_rules.update(get_magic_rules(rules))
named_rules.update(get_magic_rules(keywords))
named_rules['nlp.rules.MANUAL_DANCE[grammar.STRONG]'] = rules.MANUAL_DANCE[grammar.STRONG]
named_rules['nlp.rules.MANUAL_DANCE[grammar.STRONG_WEAK]'] = rules.MANUAL_DANCE[grammar.STRONG_WEAK]
named_rules['nlp.rules.MANUAL_DANCER[grammar.STRONG]'] = rules.MANUAL_DANCER[grammar.STRONG]
named_rules['nlp.rules.MANUAL_DANCER[grammar.STRONG_WEAK]'] = rules.MANUAL_DANCER[grammar.STRONG_WEAK]
named_rules_list = sorted(named_rules.items())
all_ids = processing.load_all_ids()
training_data = processing.load_classified_ids(all_ids)
loaded_data = processing.all_fb_data(all_ids)

print 'loaded ids'


class Bunch(object):
    """Empty attribute container; callers attach whatever fields they need."""


# Module-level attribute bag; the fields attached to it are set elsewhere and
# are not visible in this excerpt.
train = Bunch()


def process(fb_event):
    """Return the event's name and description separated by a blank line.

    Both values are read from ``fb_event['info']`` with ``.get``, so a missing
    key is rendered as the text 'None'.
    """
    info = fb_event['info']
    name = info.get('name')
    description = info.get('description')
    return '%s\n\n%s' % (name, description)

Esempio n. 7
0
        c = event_auto_classifier.is_auto_add_event(e)
        result = c.result
    else:
        result = event_auto_classifier.is_auto_notadd_event(e)
    # classified as good, but not supposed to be in the good set of ids:
    if result[0] and fb_event['info']['id'] not in training_data.good_ids:
        # false positive
        print fb_event['info']['id'], result
    if not full_run:
        print fb_event['info']['id'], result
    return result[0]


# Script body: run the auto classifier over the trial ids, time it, and score
# the outcome against the training data.
a = time.time()  # start of the timed section
print "Running auto classifier..."
fb_data = processing.all_fb_data(trial_ids)
# Input fb_data is [(id, fb_event), (id, fb_event)]
# Result will be positive ids and negative ids
classifier_data = processing.partition_data(fb_data, classifier=basic_match)
print "done, %d seconds" % (time.time() - a)

# Compare the classifier output with the known classifications.
score_card = processing.ClassifierScoreCard(training_data, classifier_data, positive_classifier)

print "Found %s true-positives, %s false-positives" % (len(score_card.true_positives), len(score_card.false_positives))
print "Leaves %s to be manually-classified" % (len(score_card.false_negatives))

# The score card is written to disk only on a full run.
if full_run:
    score_card.write_to_disk('scratch/')

# List every false-positive id, prefixed with 'F'.
# NOTE(review): the loop variable rebinds the module-level name `id`,
# shadowing the builtin for any code that runs after this point.
for id in score_card.false_positives:
    print 'F', id
Esempio n. 8
0
named_rules.update(get_magic_rules(rules))
named_rules.update(get_magic_rules(keywords))
named_rules['nlp.rules.MANUAL_DANCE[grammar.STRONG]'] = rules.MANUAL_DANCE[
    grammar.STRONG]
named_rules[
    'nlp.rules.MANUAL_DANCE[grammar.STRONG_WEAK]'] = rules.MANUAL_DANCE[
        grammar.STRONG_WEAK]
named_rules['nlp.rules.MANUAL_DANCER[grammar.STRONG]'] = rules.MANUAL_DANCER[
    grammar.STRONG]
named_rules[
    'nlp.rules.MANUAL_DANCER[grammar.STRONG_WEAK]'] = rules.MANUAL_DANCER[
        grammar.STRONG_WEAK]
named_rules_list = sorted(named_rules.items())
all_ids = processing.load_all_ids()
training_data = processing.load_classified_ids(all_ids)
loaded_data = processing.all_fb_data(all_ids)

print 'loaded ids'


class Bunch(object):
    """Bare namespace object: holds arbitrary attributes assigned after creation."""


# Module-level attribute bag; the fields attached to it are set elsewhere and
# are not visible in this excerpt.
train = Bunch()


def process(fb_event):
    """Build the text for an event: its name, a blank line, then its description.

    Values come from ``fb_event['info']`` via ``.get``; an absent key shows up
    as the text 'None'.
    """
    info = fb_event['info']
    return '%s\n\n%s' % tuple(info.get(field) for field in ('name', 'description'))