예제 #1
0
def _learn_timex_day_estimates(event_instances, train_events):
    """Build the timex-feature -> replacement map used by ``classify``.

    Every feature containing "timex_" that occurs in
    ``event_instances[1][ev]`` for a training event collects the integer
    labels of the instances it occurs in. A feature seen at least twice
    whose labels have a standard deviation below 2 is mapped to
    "<median>_days"; every other collected feature is mapped to itself.

    Reads the module-level ``args``, ``gen_functions``, ``numpy`` and (when
    ``args.median_out`` is set) the module-level ``median_out`` file object.
    """
    print("generating feature_tte list")
    feature_tte = defaultdict(list)
    for ev in train_events:
        for tweet in event_instances[1][ev]:
            for feature in tweet["features"]:
                if re.search(r"timex_", feature):
                    try:
                        # The defaultdict lookup happens before int() is
                        # evaluated, so a feature whose label is not numeric
                        # is still registered (with no values) and ends up
                        # mapped to itself below.
                        feature_tte[feature].append(int(tweet["label"]))
                    except (KeyError, TypeError, ValueError):
                        continue
    print("calculating median")
    feature_new = {}
    for feature, ttes in feature_tte.items():
        # Length is checked first so the deviation is only computed for
        # features that were observed at least twice.
        if len(ttes) >= 2 and gen_functions.return_standard_deviation(ttes) < 2:
            feature_new[feature] = str(int(numpy.median(ttes))) + "_days"
            if args.median_out:
                median_out.write(feature + "\t" + feature_new[feature] + "\n")
        else:
            feature_new[feature] = feature
    return feature_new


def _timex_to_days(feature, feature_new):
    """Return the "<n>_days" replacement for one timex feature, or None.

    The text after the last underscore of ``feature`` is parsed as an
    integer offset; the text before it is looked up in ``feature_new``.
    None is returned when the mapped value still contains "timex_" (no
    median was learned for it).

    Raises KeyError when the base feature is not in ``feature_new`` and
    ValueError when the offset is not an integer.
    """
    base, offset = feature.rsplit("_", 1)
    mapped = feature_new[base]
    if re.search(r"timex_", mapped):
        return None
    return str(int(mapped.split("_")[0]) + int(offset)) + "_days"


def _rewrite_timex_features(instances, feature_new, tolerant):
    """Replace timex features of ``instances`` in place with day estimates.

    Features without "timex_" are kept unchanged. Timex features are
    replaced by their "<n>_days" estimate, or dropped when no estimate is
    available. With ``tolerant`` false, an unknown or malformed timex
    feature raises KeyError/ValueError; with ``tolerant`` true it is skipped.
    """
    for instance in instances:
        new_features = []
        for feature in instance["features"]:
            if not re.search(r"timex_", feature):
                new_features.append(feature)
                continue
            try:
                converted = _timex_to_days(feature, feature_new)
            except (KeyError, ValueError):
                if tolerant:
                    continue
                raise
            if converted is not None:
                new_features.append(converted)
        instance["features"] = new_features


def _ensure_event_dir(event):
    """Create (if missing) and return the output directory for ``event``.

    The path is ``args.d`` followed by each "/"-separated part of ``event``
    and finally ``args.scaling``, each terminated by "/".
    """
    eventdir = args.d
    for part in event.split("/") + [args.scaling]:
        eventdir = eventdir + part + "/"
    # os.makedirs instead of a shell "mkdir": no shell quoting problems and
    # missing parent directories are created as well.
    if not os.path.isdir(eventdir):
        os.makedirs(eventdir)
    return eventdir


def _majority_day_estimate(features):
    """Return the most frequent "<n>_days" estimate in ``features``.

    Only features that actually contain an "<integer>_days" token take part
    in the vote, so tokens that merely contain "days" cannot win it and
    break the number extraction. Returns the integer as a string, or
    "during" when no feature carries an estimate.
    """
    votes = defaultdict(int)
    for feature in features:
        if re.search(r"-?\d+_days", feature):
            votes[feature] += 1
    if not votes:
        return "during"
    # max() returns the first maximal key in iteration order.
    top = max(votes, key=votes.get)
    return re.search(r"(-?\d+)_days", top).group(1)


def classify(event_instances, train_events, test_events):
    """Train on ``train_events`` and write predictions for ``test_events``.

    ``event_instances`` is indexed by position: ``event_instances[0]`` maps an
    event name to the list of instance dicts (keys "features" and "label")
    that are trained and tested on, and ``event_instances[1]`` maps an event
    name to the instance dicts from which timex medians are learned.
    NOTE(review): index 1 is used for learning the medians and index 0 for
    everything else -- presumably tweet-level versus windowed instances;
    confirm with the caller.

    Behaviour is driven by the module-level ``args``:

    * ``args.median``: timex features are first converted to "<n>_days"
      estimates (the instances' "features" lists are replaced in place).
    * ``args.d`` / ``args.scaling`` / ``args.majority`` / ``args.window`` /
      ``args.step``: determine the output file of every test event.
    * ``args.c == "median_baseline"``: for every test instance one line
      "<label> <estimate>" is written, where the estimate is the most
      frequent "<n>_days" feature or "during".
    * any other ``args.c``: a ``Classifier`` is set up and, for "svm" and
      "svr", run.
    """
    if args.median:
        feature_new = _learn_timex_day_estimates(event_instances, train_events)
        print("converting features")
        # Training instances must all be convertible (errors propagate);
        # unseen or malformed timex features in test instances are skipped.
        for ev in train_events:
            _rewrite_timex_features(event_instances[0][ev], feature_new, False)
        for ev in test_events:
            _rewrite_timex_features(event_instances[0][ev], feature_new, True)

    train = [inst for ev in train_events for inst in event_instances[0][ev]]
    test = []
    for event in test_events:
        print(event)
        eventdir = _ensure_event_dir(event)
        print(eventdir)
        if args.majority:
            eventout = eventdir + "tweet.txt"
        else:
            eventout = eventdir + str(args.window) + "_" + str(args.step) + ".txt"
        test.append({"out": eventout, "instances": event_instances[0][event]})
    if args.c == "median_baseline":
        for td in test:
            with open(td["out"], "w") as outfile:
                for instance in td["instances"]:
                    estimate = _majority_day_estimate(instance["features"])
                    outfile.write(instance["label"] + " " + estimate + "\n")
    else:
        #set up classifier object
        if args.jobs:
            cl = Classifier(train, test, jobs=args.jobs, scaling=args.scaling)
        else:
            cl = Classifier(train, test, scaling=args.scaling)
        if args.stdev:
            cl.filter_stdev(args.stdev, "timex_")
        if args.balance:
            print("balancing...")
            cl.balance_data()
        print("counting...")
        cl.count_feature_frequency()
        if args.f:
            print("pruning...")
            cl.prune_features_topfrequency(args.f)
        #generate sparse input
        print("indexing...")
        cl.index_features()
        #generate classifiers
        print("classifying...")
        if args.c == "svm":
            if args.cw:
                cl.classify_svm(classweight="auto")
            else:
                cl.classify_svm()
        elif args.c == "svr":
            print("svr")
            if args.cw:
                cl.classify_svm(t="continuous", classweight="auto")
            else:
                cl.classify_svm(t="continuous")