def _collect_timex_tte(event_instances, train_events):
    """Map every "timex_" feature of the training tweets to its integer labels.

    Reads ``event_instances[1][ev]`` for each training event; every item there
    must provide "features" (iterable of strings) and "label".
    """
    feature_tte = defaultdict(list)
    for ev in train_events:
        for tweet in event_instances[1][ev]:
            for feature in tweet["features"]:
                if not re.search(r"timex_", feature):
                    continue
                # Touch the entry *before* parsing the label: the original
                # code registered the feature even when the label was not an
                # integer, and the conversion step relies on the key existing.
                values = feature_tte[feature]
                try:
                    values.append(int(tweet["label"]))
                except (KeyError, TypeError, ValueError):
                    # Label missing or not an integer; nothing to record.
                    continue
    return feature_tte


def _median_replacements(feature_tte):
    """Return a dict mapping each feature to "<median>_days" or to itself.

    A feature is replaced when it has at least two recorded labels and
    ``gen_functions.return_standard_deviation`` of them is below 2; otherwise
    it maps to itself.  Replacements are also written to the module-level
    ``median_out`` handle when ``args.median_out`` is set.
    """
    feature_new = {}
    for feature, ttes in feature_tte.items():
        # Length guard first, so the deviation helper is never called on the
        # empty or single-value lists that can occur here.
        if len(ttes) >= 2 and gen_functions.return_standard_deviation(ttes) < 2:
            replacement = str(int(numpy.median(ttes))) + "_days"
            feature_new[feature] = replacement
            if args.median_out:
                median_out.write(feature + "\t" + replacement + "\n")
        else:
            feature_new[feature] = feature
    return feature_new


def _convert_timex_features(features, feature_new, skip_unresolvable):
    """Return ``features`` with "timex_" entries rewritten as day estimates.

    Each timex feature is split at its last underscore into a stem and a
    trailing integer.  When ``feature_new[stem]`` is a "<n>_days" string the
    feature becomes "<n + trailing>_days"; when the stem maps to a timex
    string (no replacement) the feature is dropped.  Non-timex features are
    kept unchanged.

    With ``skip_unresolvable`` true, a stem missing from ``feature_new`` or a
    non-integer part makes the feature be dropped; otherwise the KeyError or
    ValueError propagates.
    """
    converted = []
    for feature in features:
        if not re.search(r"timex_", feature):
            converted.append(feature)
            continue
        stem, _, offset = feature.rpartition("_")
        try:
            replacement = feature_new[stem]
            if re.search(r"timex_", replacement):
                # No median replacement for this expression: drop it.
                continue
            days = int(replacement.split("_")[0]) + int(offset)
        except (KeyError, ValueError):
            if skip_unresolvable:
                continue
            raise
        converted.append(str(days) + "_days")
    return converted


def _prepare_test_sets(event_instances, test_events):
    """Create the output directory of every test event and describe its test set.

    Returns one dict per event with "out" (output file path) and "instances"
    (``event_instances[0][event]``).
    """
    test = []
    for event in test_events:
        print(event)
        eventdir = args.d
        for part in event.split("/") + [args.scaling]:
            eventdir = eventdir + part + "/"
            if not os.path.exists(eventdir):
                # os.makedirs instead of a shell "mkdir" string: safe for
                # names containing spaces or shell metacharacters.
                os.makedirs(eventdir)
        print(eventdir)
        if args.majority:
            eventout = eventdir + "tweet.txt"
        else:
            eventout = eventdir + str(args.window) + "_" + str(args.step) + ".txt"
        test.append({"out": eventout, "instances": event_instances[0][event]})
    return test


def _write_median_baseline(test):
    """Write "<label> <estimate>" lines for every instance of every test set.

    The estimate is the number of the most frequent "<n>_days" feature of the
    instance, or "during" when the instance has no such feature.
    """
    for td in test:
        with open(td["out"], "w") as outfile:
            for instance in td["instances"]:
                labelcount = defaultdict(int)
                for feature in instance["features"]:
                    # Select with the same pattern that is used for
                    # extraction, so a feature that merely contains "days"
                    # can never win the vote and fail to parse.
                    if re.search(r"(-?\d+)_days", feature):
                        labelcount[feature] += 1
                if labelcount:
                    # max() returns the first most-frequent feature, like the
                    # stable descending sort used previously.
                    topest = max(labelcount, key=labelcount.get)
                    num = re.search(r"(-?\d+)_days", topest).group(1)
                else:
                    num = "during"
                outfile.write(instance["label"] + " " + str(num) + "\n")


def _run_classifier(train, test):
    """Build a ``Classifier`` from train/test and run the steps selected in ``args``."""
    init_kwargs = {"scaling": args.scaling}
    if args.jobs:
        init_kwargs["jobs"] = args.jobs
    cl = Classifier(train, test, **init_kwargs)
    if args.stdev:
        cl.filter_stdev(args.stdev, "timex_")
    if args.balance:
        print("balancing...")
        cl.balance_data()
    print("counting...")
    cl.count_feature_frequency()
    if args.f:
        print("pruning...")
        cl.prune_features_topfrequency(args.f)
    # generate sparse input
    print("indexing...")
    cl.index_features()
    # generate classifiers
    print("classifying...")
    if args.c in ("svm", "svr"):
        svm_kwargs = {}
        if args.c == "svr":
            print("svr")
            svm_kwargs["t"] = "continuous"
        if args.cw:
            svm_kwargs["classweight"] = "auto"
        cl.classify_svm(**svm_kwargs)


def classify(event_instances, train_events, test_events):
    """Train on ``train_events`` and write or compute estimates for ``test_events``.

    Args:
        event_instances: indexable with 0 and 1.  Index 0 maps an event name
            to the list of instances used for training and testing; index 1
            maps an event name to the items scanned for the median step.
            Every instance/item is a dict with "features" and "label".
        train_events: event names used for training.
        test_events: event names to test; "/" in a name yields nested output
            directories below ``args.d``.

    Behaviour is driven by the module-level ``args``:
        * ``args.median``: rewrite "timex_" features into "<n>_days" features
          (the "features" lists of the instances are replaced in place).
        * ``args.c == "median_baseline"``: write one "<label> <estimate>" line
          per test instance to the event's output file.
        * any other ``args.c``: hand the data to ``Classifier``.

    Single-argument ``print(...)`` calls are used so the output is the same
    under Python 2's print statement and Python 3's print function.
    """
    if args.median:
        print("generating feature_tte list")
        feature_tte = _collect_timex_tte(event_instances, train_events)
        print("calculating median")
        feature_new = _median_replacements(feature_tte)
        print("converting features")
        # Training events are converted strictly (an unknown timex stem
        # raises), test events leniently (the feature is skipped).
        for events, lenient in ((train_events, False), (test_events, True)):
            for ev in events:
                for instance in event_instances[0][ev]:
                    instance["features"] = _convert_timex_features(
                        instance["features"], feature_new, lenient
                    )

    # Flatten the per-event instance lists in linear time.
    train = [inst for ev in train_events for inst in event_instances[0][ev]]
    test = _prepare_test_sets(event_instances, test_events)

    if args.c == "median_baseline":
        _write_median_baseline(test)
    else:
        _run_classifier(train, test)