def partition_ids(old_true_positive, old_true_negative, classifier=event_classifier.classified_event_from_fb_event): success = set() fail = set() a = time.time() for i, (id, fb_event) in enumerate(processing.all_fb_data(combined_ids)): if not i % 10000: print 'Processing ', i if i < START_EVENT: continue if END_EVENT and i > END_EVENT: break result = classifier(fb_event) result.classify() if result.is_dance_event(): success.add(id) if id in old_true_negative: handle_new_false_positive(id, result) else: # To print out failures, to see if there's any way we can better detect them #if id in good_ids: # print id, fb_event['info'].get('name') # print result.found_dance_matches, result.found_event_matches, result.found_wrong_matches fail.add(id) if id in old_true_positive: handle_new_false_negative(id, result) print 'Time per event: %s' % (1.0 * (time.time() - a) / (max(END_EVENT, i) - START_EVENT)) return fail, success
def partition_ids():
    """Run the battle auto-classifier over every potential FB event on disk.

    Streams events from local_data/PotentialFBEvents.csv, classifies each one,
    and dispatches to the module-level success() or failure() handler
    depending on whether is_battle() matched.
    """
    event_stream = processing.all_fb_data([], filename='local_data/PotentialFBEvents.csv')
    for index, (event_id, fb_event) in enumerate(event_stream):
        classified = event_classifier.get_classified_event(fb_event)
        verdict = event_auto_classifier.is_battle(classified)
        # verdict[0] is the boolean match flag; route to the right handler.
        handler = success if verdict[0] else failure
        handler(classified, fb_event, verdict)
def mp_partition_ids(ids, classifier=lambda x: False): pool = multiprocessing.Pool(processes=multiprocessing.cpu_count(), initializer=init_worker) print "Generating data..." data = [(classifier, x) for x in processing.all_fb_data(ids)] print "Running multiprocessing classifier..." async_results = pool.map_async(mp_classify, data, chunksize=100) # We need to specify a timeout to get(), so that KeyboardInterrupt gets delivered properly. results = async_results.get(9999999) print "Multiprocessing classifier completed." successes = set(x[1] for x in results if x[0]) fails = set(x[1] for x in results if not x[0]) return successes, fails
def partition_ids(ids, classifier=lambda x: False): successes = set() fails = set() for i, (id, fb_event) in enumerate(processing.all_fb_data(ids)): if not i % 10000: print 'Processing ', i result = classifier(fb_event) if result: successes.add(id) #if id not in good_ids: # # false positive # add_counts(false_positive_counts, fb_event) else: fails.add(id) #if id not in bad_ids: # # false negative # add_counts(false_negative_counts, fb_event) #if id in good_ids: # add_counts(good_counts, fb_event) #else: # add_counts(bad_counts, fb_event) return successes, fails
# NOTE(review): this chunk begins mid-function — the `def` header that owns
# the two lines below is outside the visible source, so they are kept
# verbatim. Presumably the function accumulates rule objects into `rules`.
    rules[name] = var_value
    return rules


# These are the regexes that will be our feature detectors
named_rules = {}
named_rules.update(get_magic_rules(rules))
named_rules.update(get_magic_rules(keywords))
# Register the MANUAL_DANCE/MANUAL_DANCER rule objects under their
# fully-qualified names, at both grammar strength levels.
named_rules['nlp.rules.MANUAL_DANCE[grammar.STRONG]'] = rules.MANUAL_DANCE[grammar.STRONG]
named_rules['nlp.rules.MANUAL_DANCE[grammar.STRONG_WEAK]'] = rules.MANUAL_DANCE[grammar.STRONG_WEAK]
named_rules['nlp.rules.MANUAL_DANCER[grammar.STRONG]'] = rules.MANUAL_DANCER[grammar.STRONG]
named_rules['nlp.rules.MANUAL_DANCER[grammar.STRONG_WEAK]'] = rules.MANUAL_DANCER[grammar.STRONG_WEAK]
# sorted() gives a deterministic (name-ordered) list of (name, rule) pairs.
named_rules_list = sorted(named_rules.items())

# Load every known event id plus its manual classification labels.
all_ids = processing.load_all_ids()
training_data = processing.load_classified_ids(all_ids)
loaded_data = processing.all_fb_data(all_ids)
print 'loaded ids'


class Bunch(object):
    # Bare attribute container used as a namespace for the training set.
    pass


train = Bunch()


def process(fb_event):
    # Concatenate an event's name and description into one text blob for
    # feature extraction; either field may be missing (.get returns None).
    return '%s\n\n%s' % (fb_event['info'].get('name'), fb_event['info'].get('description'))
# NOTE(review): this chunk begins mid-function — apparently the tail of the
# `basic_match` referenced below as `classifier=basic_match`. The `def`
# header and the `if` matching the `else:` are outside the visible source,
# and the original was flattened onto one line, so the indentation chosen
# here for the print/return statements is a best-effort reconstruction —
# TODO confirm against the real file.
        c = event_auto_classifier.is_auto_add_event(e)
        result = c.result
    else:
        result = event_auto_classifier.is_auto_notadd_event(e)
    # classified as good, but not supposed to be in the good set of ids:
    if result[0] and fb_event['info']['id'] not in training_data.good_ids:
        # false positive
        print fb_event['info']['id'], result
        if not full_run:
            print fb_event['info']['id'], result
    return result[0]


a = time.time()
print "Running auto classifier..."
fb_data = processing.all_fb_data(trial_ids)
# Input fb_data is [(id, fb_event), (id, fb_event)]
# Result will be positive ids and negative ids
classifier_data = processing.partition_data(fb_data, classifier=basic_match)
print "done, %d seconds" % (time.time() - a)
# Score the auto-classifier's partition against the manual training labels.
score_card = processing.ClassifierScoreCard(training_data, classifier_data, positive_classifier)
print "Found %s true-positives, %s false-positives" % (len(score_card.true_positives), len(score_card.false_positives))
print "Leaves %s to be manually-classified" % (len(score_card.false_negatives))
# Persist results only on a full (non-debug) run.
if full_run:
    score_card.write_to_disk('scratch/')
for id in score_card.false_positives:
    print 'F', id
named_rules.update(get_magic_rules(rules)) named_rules.update(get_magic_rules(keywords)) named_rules['nlp.rules.MANUAL_DANCE[grammar.STRONG]'] = rules.MANUAL_DANCE[ grammar.STRONG] named_rules[ 'nlp.rules.MANUAL_DANCE[grammar.STRONG_WEAK]'] = rules.MANUAL_DANCE[ grammar.STRONG_WEAK] named_rules['nlp.rules.MANUAL_DANCER[grammar.STRONG]'] = rules.MANUAL_DANCER[ grammar.STRONG] named_rules[ 'nlp.rules.MANUAL_DANCER[grammar.STRONG_WEAK]'] = rules.MANUAL_DANCER[ grammar.STRONG_WEAK] named_rules_list = sorted(named_rules.items()) all_ids = processing.load_all_ids() training_data = processing.load_classified_ids(all_ids) loaded_data = processing.all_fb_data(all_ids) print 'loaded ids' class Bunch(object): pass train = Bunch() def process(fb_event): return '%s\n\n%s' % (fb_event['info'].get('name'), fb_event['info'].get('description'))