Example #1
    def __init__(self, name, epsilon=0.05, gamma=0.8, alpha=0.2, numTraining=0):
        self.name = name
        self.cards = [] #(cardValue, cardElement)
        self.accumulatedCards = {"Fire": 0, "Water": 0, "Ice": 0}
        self.playedCard = None

        self.args = {}
        self.args['epsilon'] = epsilon
        self.args['gamma'] = gamma
        self.args['alpha'] = alpha
        self.args['numTraining'] = numTraining
        self.weights = Counter()

        # self.weights["enemy-distance-to-closest-win"] = 1.3999995454998298e-06 
        # self.weights["agent-distance-to-closest-win"] = 1.299999463999758e-06

        self.weights["enemy-distance-to-closest-win"] = -4.120535635213156 
        self.weights["agent-distance-to-closest-win"] = 9.586679017815417 
        self.weights["agent-went-closer-to-win"] = -0.9656494587969497 
        self.weights["agent-can-block-enemy-advancement"] = 15.147299275663869 


        self.featExtractor = FeatureExtractor()
        self.lastState = None
        self.lastAction = None
        self.lastScore = 0
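Note: the hand-set weights above only matter in combination with the extracted feature values; the agent scores a state-action pair as a linear function of them. A minimal sketch of that dot product, mirroring the getQValue method shown in full in Example #14 (the numeric values here are rounded and purely illustrative):

from collections import Counter

def q_value(weights, features):
    # Q(s, a) = w . f(s, a): sum of weight * feature value over all extracted features
    return sum(weights[name] * value for name, value in features.items())

weights = Counter()
weights["agent-distance-to-closest-win"] = 9.59   # rounded from the weights above
weights["enemy-distance-to-closest-win"] = -4.12
features = {"agent-distance-to-closest-win": 3, "enemy-distance-to-closest-win": 1}
print(q_value(weights, features))                 # 3 * 9.59 + 1 * (-4.12) = 24.65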
Example #2
 def __init__(self, vehicle):
     print("Trip Starts")
     self.vehicle = vehicle
     self.timer = 0
     self.data = []
     self.featureExtractor = FeatureExtractor()
     self.tripNo = 1
    def load_datasets(balance=False, viral_threshold=0):
        """
        Return the training and testing datasets containing
        the tweets featured followed by the retweet count
        """
        # Import data
        _, features, virality = FeatureExtractor.load(force=True)
        print "Building datasets..."
        # Concatenate the arrays into one along the second axis
        data = np.c_[features, np.array(virality)[:, 0]]

        RegressionModel.__dataset_range(data)
        # Duplicate viral tweets to balance the dataset
        if balance:
            data = RegressionModel.__balance_virality(
                dataset=data, threshold=viral_threshold)
        # Shuffle data
        np.random.shuffle(data)
        # Split dataset into training and testing sets
        size = int(len(data) * RegressionModel.TRAINING_SIZE)

        # Why was the test set overlapping with the training set earlier?
        training_set = data[:size]
        testing_set = data[size:]

        return training_set, testing_set
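A hedged usage sketch for the function above, assuming it is exposed as a static method on RegressionModel (the references to RegressionModel.TRAINING_SIZE and RegressionModel.__dataset_range suggest that); the threshold value is purely illustrative:

train, test = RegressionModel.load_datasets(balance=True, viral_threshold=10000)
# Each row holds the feature columns followed by the retweet count; the split point
# is int(len(data) * RegressionModel.TRAINING_SIZE), applied after shuffling.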
Example #5
    def testExtraction(self):
        featureExt = FeatureExtractor()
        agent = Player("aql agent")
        enemy = Player("greedy agent")
        gameState = GameState(agent, enemy)
        enemy.accumulatedCards["Water"] += 1
        enemy.accumulatedCards["Fire"] += 1
        features = featureExt.getFeatures(gameState, "action", agent.name)
        self.assertEqual(features["enemy-distance-to-closest-win"], 1)
        self.assertEqual(features["agent-distance-to-closest-win"], 4)

        agent.cards.append((1, "Water"))
        enemy.accumulatedCards["Fire"] -= 1
        enemy.accumulatedCards["Water"] += 1

        features = featureExt.getFeatures(gameState, "action", agent.name)
        self.assertEqual(features["agent-distance-to-closest-win"], 3)
        self.assertEqual(features["enemy-distance-to-closest-win"], 1)
Example #6
 def load_datasets(balance=False, viral_threshold=0):
     """
     Return the datasets containing the tweets featured followed by the retweet count
     """
     # Import data
     _, features, virality = FeatureExtractor.load(force=True, keepTweetWithoutHashtags=False)
     print "Building datasets..."
     # Concatenate the arrays into one along the second axis
     data = np.c_[features, np.array(virality)[:, 0]]
     return pd.DataFrame(data, columns=(FeatureExtractor.FEATURE_LABEL + FeatureExtractor.VIRALITY_LABEL))
Example #8
def create_generators():
    # creating training generator
    preloader = MatrixPreLoader(dataset_directory=training_filepath,
                                patients_to_use="ALL",
                                activity_types=activities_to_load,
                                print_loading_progress=False)
    matrix_data_generator = MatrixDataGenerator(preloader,
                                                matrix_dimensions=(224, 224),
                                                rgb=True,
                                                twoD=False,
                                                add_gaussian_noise=0,
                                                zero_sensors=0,
                                                batch_size=32,
                                                grab_data_from=(0, 1),
                                                overflow="BEFORE",
                                                print_loading_progress=False)
    training_generator = FeatureExtractor(matrix_data_generator,
                                          patient_fall_filepath,
                                          weigths_filepath,
                                          test=True)

    # create testing generator
    preloader = MatrixPreLoader(dataset_directory=testing_filepath,
                                patients_to_use="ALL",
                                activity_types=activities_to_load,
                                print_loading_progress=False)
    matrix_data_generator = MatrixDataGenerator(preloader,
                                                matrix_dimensions=(224, 224),
                                                rgb=True,
                                                twoD=False,
                                                add_gaussian_noise=0,
                                                zero_sensors=3,
                                                batch_size=50,
                                                grab_data_from=(0, 1),
                                                overflow="BEFORE",
                                                print_loading_progress=False)
    testing_generator = FeatureExtractor(matrix_data_generator,
                                         patient_fall_filepath,
                                         weigths_filepath,
                                         test=False)

    return training_generator, testing_generator
Example #9
 def extractFeatures(self, train_data, test_data):
     # Construct Feature Extractor
     fe = FeatureExtractor()
     fe.buildVectorizer(train_data, self.config['featureKwargs'])
      # Make the feature path if it doesn't exist
     if not os.path.exists(self.feature_path):
         os.mkdir(self.feature_path)
     # Check if train vectors already exist
     if os.path.exists(os.path.join(self.feature_path, 'train_vectors.npz')):
         # If it does, load them
         train_vectors = load_npz(os.path.join(self.feature_path, 'train_vectors.npz'))
     else:
         # Make the train vectors
         train_vectors = [fe.process(feature, train_data) for feature in self.config['features']]
         if len(train_vectors) > 1:
             train_vectors = numpy.concatenate(train_vectors, axis=1)
         else:
             train_vectors = train_vectors[0]
         # Save the train vectors
         save_npz(os.path.join(self.feature_path, 'train_vectors.npz'), train_vectors)
     # Check if test vectors already exist
     if os.path.exists(os.path.join(self.feature_path, 'test_vectors.npz')):
         # If it does, load them
         test_vectors = load_npz(os.path.join(self.feature_path, 'test_vectors.npz'))
     else:
         # Make the test vectors
         test_vectors = [fe.process(feature, test_data) for feature in self.config['features']]
         if len(test_vectors) > 1:
             test_vectors = numpy.concatenate(test_vectors, axis=1)
         else:
             test_vectors = test_vectors[0]
         # Save the test vectors
         save_npz(os.path.join(self.feature_path, 'test_vectors.npz'), test_vectors)
     return train_vectors, test_vectors
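The branches above cache the computed vectors with save_npz and reload them on later runs. A minimal standalone sketch of that caching pattern, assuming the vectors are SciPy sparse matrices (which is what save_npz/load_npz handle); the path and stand-in matrix are illustrative only:

import os
from scipy.sparse import csr_matrix, save_npz, load_npz

path = "features/train_vectors.npz"                  # illustrative cache location
if os.path.exists(path):
    vectors = load_npz(path)                         # reuse vectors computed on a previous run
else:
    vectors = csr_matrix([[1.0, 0.0], [0.0, 2.0]])   # stand-in for the fe.process(...) output
    os.makedirs(os.path.dirname(path), exist_ok=True)
    save_npz(path, vectors)
print(vectors.shape)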
Example #10
    def func4(self):

        vp = ViralityPrediction(normalize=True,
                                balance=True,
                                tweet_threshold=50000,
                                score=False,
                                dump_model=False)
        hashtagIndex = HashtagIndex()

        virality = {}
        hashtags_features = {}
        hashtags_virality = {}

        hashtags = [
            k for (k, v) in hashtagIndex.items(
                sort=True, descending=True, min_values=100)
        ]
        print "Extracting features..."
        for hashtag in hashtags:
            _, featureList, vir = FeatureExtractor.loadFromDB(
                tweets_id=hashtagIndex.find(hashtag))
            hashtags_features[hashtag] = featureList
            hashtags_virality[hashtag] = vir
            virality[hashtag] = sum(np.array(vir)[:, 0])

        # Sort predictions by value and print top-K results
        predictions = vp.predict(hashtags_features)
        sorted_predictions = sorted(predictions.items(),
                                    key=operator.itemgetter(1),
                                    reverse=True)
        print "\nTop " + str(ViralityPrediction.K) + " virality predictions:"
        print sorted_predictions
        for i in range(0, min(ViralityPrediction.K, len(sorted_predictions))):
            print sorted_predictions[i]
            listbox.insert(END, sorted_predictions[i])
        listbox.pack()
Example #11
 def extractFeatures(self, train_data, test_data):
     #Extract Features and pass them as concatenated arrays
     fe = FeatureExtractor(self.config['features'],
                           self.config['featurePath'],
                           self.config['featureKwargs'])
     fe.buildVectorizer(train_data)
      # Check for already done work
     if path.exists(self.config['featurePath'] + "train_data.pickle"):
         print("here's the error?")
         with open(self.config['featurePath'] + "train_data.pickle",
                   "rb") as file:
             train_vectors = pickle.load(file)
     else:
         train_vectors = fe.process(train_data)
         with open(self.config['featurePath'] + "train_data.pickle",
                   "wb+") as file:
             pickle.dump(train_vectors, file)
     if len(train_vectors) > 1:
         print("took option A")
         train_vectors = numpy.concatenate(train_vectors, axis=1)
     else:
         print("took option B")
         train_vectors = train_vectors[0]
     print(train_vectors.shape)
     print(train_vectors[1, :])
      # Check for already done work
     if path.exists(self.config['featurePath'] + "test_data.pickle"):
         with open(self.config['featurePath'] + "test_data.pickle",
                   "rb") as file:
             test_vectors = pickle.load(file)
     else:
         test_vectors = fe.process(test_data)
         with open(self.config['featurePath'] + "test_data.pickle",
                   "wb+") as file:
             pickle.dump(test_vectors, file)
     if len(test_vectors) > 1:
         test_vectors = numpy.concatenate(test_vectors, axis=1)
     else:
         test_vectors = test_vectors[0]
     return train_vectors.toarray(), test_vectors.toarray()
Example #12
# The pre-computed features can also be downloaded from http://iamai.nl/downloads/features.npy
if not isfile(featurePath):
    print("indexing images...")
    Steles = [
        join(stelePath, f) for f in listdir(stelePath)
        if isdir(join(stelePath, f))
    ]
    for stele in Steles:
        imagePaths = [
            join(stele, f) for f in listdir(stele) if isfile(join(stele, f))
        ]
        for path in imagePaths:
            image_paths.append(path)
            labels.append(path[(path.rfind("_") + 1):path.rfind(".")])

    featureExtractor = FeatureExtractor()
    features = []
    print("computing features...")
    for idx, (batch_images,
              _) in enumerate(batchGenerator(image_paths, labels, batch_size)):
        print("{}/{}".format((idx + 1) * batch_size, len(labels)))
        features_ = featureExtractor.get_features(batch_images)
        features.append(features_)
    features = np.vstack(features)

    labels = np.asarray(labels)
    print("saving features...")
    np.save(featurePath, features)
    np.save(labelsPath, labels)
else:
    print("loading precomputed features and labels from {} and {}".format(
Example #13
from NeuralNetwork import NeuralNetwork
from featureExtractor import FeatureExtractor
import numpy as np
from DataLoader import DataLoader
import configure

#best results:
#(0.05, 60, 30, 30)
#0.5428571428571428
#(0.1, 50, 60, 30)
#0.5714285714285714
#(0.1, 300, 100, 30)
#0.6
fe = FeatureExtractor("generatedData/eigenfaces.csv",
                      "generatedData/average_face.csv")
best = 0.0
for lr in range(4, 7, 1):
    for ne in range(380, 421, 5):
        for nhn in range(40, 91, 10):
            for nev in range(50, 91, 10):
                configure.setUpConfig(lr / 100, ne, nhn, nev)

                #prepare data for training:
                dl = DataLoader(configure.config_global.modeTrain)
                dl.load_all_images()
                datasetTrain = fe.generate_dataset(dl.images)

                #train NN:
                nn = NeuralNetwork(configure.config_global.noOfEigenValues,
                                   configure.config_global.noOfHidNeur)
                nn.trainNetwork(datasetTrain)
Example #14
class ApproximateQLearningAgent(Player):
    def __init__(self, name, epsilon=0.05, gamma=0.8, alpha=0.2, numTraining=0):
        self.name = name
        self.cards = [] #(cardValue, cardElement)
        self.accumulatedCards = {"Fire": 0, "Water": 0, "Ice": 0}
        self.playedCard = None

        self.args = {}
        self.args['epsilon'] = epsilon
        self.args['gamma'] = gamma
        self.args['alpha'] = alpha
        self.args['numTraining'] = numTraining
        self.weights = Counter()

        # self.weights["enemy-distance-to-closest-win"] = 1.3999995454998298e-06 
        # self.weights["agent-distance-to-closest-win"] = 1.299999463999758e-06

        self.weights["enemy-distance-to-closest-win"] = -4.120535635213156 
        self.weights["agent-distance-to-closest-win"] = 9.586679017815417 
        self.weights["agent-went-closer-to-win"] = -0.9656494587969497 
        self.weights["agent-can-block-enemy-advancement"] = 15.147299275663869 


        self.featExtractor = FeatureExtractor()
        self.lastState = None
        self.lastAction = None
        self.lastScore = 0
    
    def resetForNewGame(self):
        self.cards = []
        self.accumulatedCards = {"Fire": 0, "Water": 0, "Ice": 0}
        self.playedCard = None
        self.lastState = None
        self.lastAction = None
        self.lastScore = 0


    def pickCard(self, card):
        # Takes in the index of the card to be picked in self.cards
        if card not in self.cards:
            print(card)
            print(self.cards)
            raise ValueError('Picked card not in current cards!')

        self.playedCard = card
        self.cards.remove(card)
        return

    def getLegalActions(self, gameState):
        if self.name == gameState.p1.name:
            return gameState.p1.cards
        else:
            return gameState.p2.cards

        
    def getQValue(self, gameState, action):
        """
          Should return Q(gameState,action) = w * featureVector
          where * is the dotProduct operator
        """
        result = 0
        features = self.featExtractor.getFeatures(gameState, action, self.name)
        for feature in features:
          result += features[feature] * self.weights[feature]
        return result

    def flipCoin(self, prob):
        r = random.random()
        return r < prob

    def computeActionFromQValues(self, gameState):
        actions = self.getLegalActions(gameState)
        max_action = None
        max_q_val = float("-inf")
        if not actions:
            return None
        for a in actions:
            q_val = self.getQValue(gameState, a)
            # keep the action with the highest Q-value
            if q_val > max_q_val:
                max_q_val = q_val
                max_action = a
        if max_action is None:
            return random.choice(actions)
        else:
            return max_action

    def doAction(self, gameState):
        legalActions = self.getLegalActions(gameState)
        action = None
        "*** YOUR CODE HERE ***"
        # if not legalActions:
        #     action = None
        # else:
        coinflip = self.flipCoin(self.args["epsilon"])
        if coinflip:
            action = random.choice(legalActions)
        elif not coinflip:
            action = self.computeActionFromQValues(gameState)
        
        # print("setting lastState")
        self.lastState = copy.deepcopy(gameState) 
        # print("self.lastState",self.lastState)
        self.lastAction = copy.deepcopy(action)

        return action

    def update(self, gameState, score):
        """
           Should update your weights based on transition
        """
        # deleted action, nextState, reward from params
        # print("self.lastState",self.lastState)
        if self.lastState is not None:
            state, action, nextState, deltaReward = self.lastState, self.lastAction, gameState, score - self.lastScore

            actions = self.getLegalActions(nextState)
            max_qval_action = (float("-inf"), None)
            if not actions:
                max_qval_action = (0, None)
            else:
                for a in actions:
                    # go through all the actions to find the max Q-value in the next state
                    q_val = self.getQValue(nextState, a)
                    max_qval_action = max(max_qval_action, (q_val, a), key=lambda x: x[0])
            difference = deltaReward + self.args['gamma'] * max_qval_action[0] - self.getQValue(state, action)
            
            features = self.featExtractor.getFeatures(state, action, self.name)
            for feature in features:
            # print(feature, "weights:", self.weights[feature])
                self.weights[feature] = self.weights[feature] + self.args['alpha'] * difference * features[feature]
        
        self.lastScore = score

    def printEpisodeInfo(self):
        print(bcolors.OKBLUE + "AQL score:", str(self.lastScore))
        print(bcolors.OKBLUE + "AQL accumulated cards:", str(self.accumulatedCards))
        # print(bcolors.OKBLUE + "AQL weights:")

        # for key, value in self.weights.items():
        #     print("     ",key, value)

        print( bcolors.ENDC)
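For reference, update() above applies the standard approximate Q-learning rule: the TD error is deltaReward + gamma * max_a' Q(s', a') - Q(s, a), and each weight moves by alpha * error * feature value. A minimal worked sketch with toy numbers (all values illustrative, not taken from an actual game):

alpha, gamma = 0.2, 0.8                                     # same defaults as the constructor
weights = {"agent-distance-to-closest-win": 1.0}
features = {"agent-distance-to-closest-win": 3}

q_sa = sum(weights[f] * v for f, v in features.items())     # Q(s, a) = 1.0 * 3 = 3.0
max_next_q = 3.0                                            # stand-in for max_a' Q(s', a')
delta_reward = 1.0                                          # score difference between turns
difference = delta_reward + gamma * max_next_q - q_sa       # 1.0 + 2.4 - 3.0 = 0.4
for f, v in features.items():
    weights[f] += alpha * difference * v                    # 1.0 + 0.2 * 0.4 * 3 = 1.24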
Example #15
def main():
    """Preprocesses, extracts, learns, and tests."""

    # process flags
    do_retrain, do_rebuildValidation, do_test = False, False, False

    for arg in sys.argv[1:]:
        if ("--retrain" in arg):
            if ("yes" in arg):
                do_retrain = True
        if ("--rebuildValidation" in arg):
            if ("yes" in arg):
                do_rebuildValidation = True
        if ("--test" in arg):
            if ("yes" in arg):
                do_test = True

    # preprocessing
    do = DataOrganizer()

    # __________________________________ TRAINING ________________________ #

    # use BoG to convert to frequency vector

    fe = FeatureExtractor(FeatureExtractor.ModelType.BagOfClusters)

    clf = 0
    clf_file = ""

    # get the latest trained model
    filenames = os.listdir("models/")
    if len(filenames) > 0:
        clf_file = "models/" + filenames[-1]
    else:
        clf_file = None

    # get sets of tweets as training data
    # trainData0, trainData1, validation0, validation1 \
    #     = do.organizeTrainWithValidation("data/trainValidate/", do_rebuildValidation)

    trainData0, trainData1 = do.organizeTrain("data/train/")

    if do_retrain or not clf_file:
        # split training set into validation and training set
        X0, X1 = fe.extractTrainFeatureVectors((trainData0, trainData1))
        clf = learn(X0, X1)

        millis = int(round(time.time() * 1000))
        clf_file = "trainedModel" + str(millis)
        print "Saving model to file..."

        joblib.dump(clf, "models/" + clf_file, compress=1)
    else:
        print "Using trained model and BoG..."
        fe.bog = BagOfWords()
        fe.bog.getLatestBoG()
        clf = joblib.load(clf_file)

    # we're either validating or testing based on the passed flag

    # ____________________________________VALIDATION__________________________#
    if not do_test:
        # feed in the validation sets as one set
        validationData = do.organizeTest("data/validation/")
        validationFeatures, validationLabels = fe.extractTestFeatureVectors(
            validationData)
        test("Validation", clf, validationFeatures, validationLabels)
    else:
        # ____________________________________TESTING _______________________ #

        # extract test features and test
        print "Using testing"
        testData, testLabels = do.organizeTest("data/test/")
        testFeatures = fe.extractTestFeatureVectors(testData)
        test("Testing, Global Protests With Background Subtraction", clf,
             testFeatures, testLabels)
Example #16
def registrate(drone_img_ori, pcl_img_ori, mask_image, args):
    common_args = {
        'pcl_mask': args.pcl_mask,
        'drone_mask': args.drone_mask,
        'save_masked_pcl': args.save_masked_pcl,
        'save_masked_drone': args.save_masked_drone,
        'save_keypoints': args.save_keypoints,
        'save_csv': args.save_csv,
        'save_matching': args.save_matching
    }

    result = {}

    # Preprocess Images
    img_preprocessor = Preprocessor(drone_img_ori, pcl_img_ori, mask_image)
    img_preprocessor.preprocessing()
    imgs = img_preprocessor.get_processed_imgs()

    processed_drone_img = imgs['processed_drone_img']
    processed_pcl_img = imgs['processed_pcl_img']
    processed_drone_mask = imgs['processed_drone_mask']
    processed_pcl_mask = imgs['processed_pcl_mask']
    masked_drone_img = imgs['masked_drone_img']
    masked_pcl_img = imgs['masked_pcl_img']

    if common_args['save_masked_pcl'] is True:
        result.update({'masked_pcl': masked_pcl_img})

    if common_args['save_masked_drone'] is True:
        result.update({'masked_drone': masked_drone_img})

    # Extract Features
    drone_feature_extractor = FeatureExtractor(processed_drone_img, "SIFT",
                                               args)
    pcl_feature_extractor = FeatureExtractor(processed_pcl_img, "SIFT", args)

    if common_args['pcl_mask'] is True:
        print("pcl_mask: True")
        pcl_feature_extractor.compute(mask=processed_pcl_mask)
    else:
        print("No pcl_mask")
        pcl_feature_extractor.compute(mask=None)

    if common_args['drone_mask'] is True:
        print("drone_mask: True")
        drone_feature_extractor.compute(mask=processed_drone_mask)
    else:
        print('No drone_mask')
        drone_feature_extractor.compute(mask=None)

    drone_features, drone_descs = drone_feature_extractor.get_features_and_descriptors(
    )

    pcl_features, pcl_descs = pcl_feature_extractor.get_features_and_descriptors(
    )

    if common_args['save_keypoints'] is True:
        keypoints_lidar = cv2.drawKeypoints(
            pcl_img_ori,
            pcl_features,
            outImage=np.array([]),
            color=(0, 0, 255),
            flags=cv2.DRAW_MATCHES_FLAGS_DRAW_RICH_KEYPOINTS)
        keypoints_drone = cv2.drawKeypoints(
            drone_img_ori,
            drone_features,
            outImage=np.array([]),
            color=(0, 0, 255),
            flags=cv2.DRAW_MATCHES_FLAGS_DRAW_RICH_KEYPOINTS)
        result.update({'keypoints_lidar_image': keypoints_lidar})
        result.update({'keypoints_drone_image': keypoints_drone})

    # Find Matching
    matcher = Matcher(drone_features, drone_descs, pcl_features, pcl_descs,
                      args)
    matcher.extract_match()

    raw_matchs = matcher.get_matchs()
    good_matchs = matcher.get_good_matchs()

    # Find Homography
    homography, status = find_homography(drone_features, pcl_features,
                                         good_matchs, args)

    if common_args['save_csv'] is True:
        result.update({'drone_total_keypoints': len(drone_features)})
        result.update({'pcl_total_keypoints': len(pcl_features)})
        result.update({'num_inliers': (status.ravel().astype(int) == 1).sum()})
        result.update({'num_raw_matches': len(raw_matchs)})
        result.update({'num_good_matches': len(good_matchs)})
        result.update({'homography': homography})

    if common_args['save_matching'] is True:
        matching1 = matcher.draw_matches(processed_drone_img,
                                         processed_pcl_img, status, homography)
        matching2 = matcher.draw_matches(processed_drone_img,
                                         processed_pcl_img, None, homography)
        matching3 = matcher.draw_matches(processed_drone_img,
                                         processed_pcl_img)
        result.update({'matching1': matching1})
        result.update({'matching2': matching2})
        result.update({'matching3': matching3})

    registated_image = cv2.warpPerspective(
        drone_img_ori, homography,
        (processed_pcl_img.shape[1], processed_pcl_img.shape[0]))

    ret_image = cv2.add(registated_image,
                        cv2.cvtColor(processed_pcl_img, cv2.COLOR_GRAY2BGR))

    result.update({'image': ret_image})
    return result
        return np.mean(predicted - expected) ** 2


if __name__ == "__main__":
    vp = ViralityPrediction(normalize=True, balance=True, tweet_threshold=50000,
        score=False, dump_model=False)
    hashtagIndex = HashtagIndex()

    virality = {}
    hashtags_features = {}
    hashtags_virality = {}
    hashtags = [k for (k, v) in hashtagIndex.items(sort=True, descending=True, min_values=100)]
    print "Extracting features..."
    for hashtag in hashtags:
        _, featureList, vir = FeatureExtractor.loadFromDB(tweets_id=hashtagIndex.find(hashtag))
        hashtags_features[hashtag] = featureList
        hashtags_virality[hashtag] = vir
        virality[hashtag] = sum(np.array(vir)[:, 0])

    # Sort predictions by value and print top-K results
    predictions = vp.predict(hashtags_features)
    sorted_predictions = sorted(predictions.items(), key=operator.itemgetter(1), reverse=True)
    print "\nTop " + str(ViralityPrediction.K) + " virality predictions:"
    for i in range(0, min(ViralityPrediction.K, len(sorted_predictions))):
        print sorted_predictions[i]

    # Sort expected virality by value and print top-K results
    #sorted_virality = sorted(virality.items(), key=operator.itemgetter(1), reverse=True)
    #print "\nTop " + str(ViralityPrediction.K) + " virality expectations:"
    #for i in range(0, min(ViralityPrediction.K, len(sorted_virality))):
Example #18
        avg = float(sum(relavant_vals)) / len(points)
        if avg > 0: res[dimension] = avg
    return res


examples = []
laterexamples = []
vectorX = []
with open("Period 5 Rand.csv", 'rb') as file_reader:
    reader = csv.reader(file_reader, delimiter=",")
    counter = 0
    for line in reader:
        if counter == 5000: break  # cap at 5000 rows to keep the runtime manageable
        counter += 1
        vectorX.append(line[1])
        laterObj = (line[0], FeatureExtractor(line[1]).featureVector())
        examples.append(laterObj[1])
        laterexamples.append(laterObj)

extractor = sklearn.feature_extraction.text.CountVectorizer(
    input='content', ngram_range=(2, 3), max_df=.7, stop_words='english')
X = extractor.fit_transform(vectorX)

distArr = []
inerArr = []
for k in range(1, 20):
    clusterer = sklearn.cluster.KMeans(n_clusters=k)
    res = clusterer.fit(X)
    print "Inertia with %d clusters is %d" % (k, clusterer.inertia_)
    inerArr.append(clusterer.inertia_)
    #build the dict-style clusters so I can use my same distortion function consistently across them
Example #19
    def createDataset(self, pathRaw, pathLabels, pathVectors,
                      pathCorrespondence):

        print("Loading spacy")
        nlp = spacy.load('en_core_web_md')
        print("loaded")
        labelIdx = 0

        #minWords = 9999
        #maxWords = 0
        #avgWords = 0

        listFiles = os.listdir(pathRaw)
        nTexts = len(listFiles)
        fdOutLabels = open(pathLabels, "w")
        fdOutCorrespondence = open(pathCorrespondence, "w")

        for i, fname in enumerate(listFiles):
            label = fname.split("_")[1]
            if label not in self.labelDict:
                self.labelDict[label] = labelIdx
                labelIdx += 1

            numeric_label = self.labelDict[label]
            fdOutLabels.write(str(numeric_label) + "\n")

            fd = open(pathRaw + fname, "r")
            raw = fd.read()
            iF = FeatureExtractor(raw, nlp)
            #FIRST SENTENCE ONLY NOW
            instanceVectors = []
            for wordDict in iF.features[0]:
                instanceVectors.append(wordDict["vector"])
                #only include maxWords vectors
                if len(instanceVectors) == self.maxLen:
                    break

            if len(instanceVectors) < self.maxLen:
                while len(instanceVectors) < self.maxLen:
                    instanceVectors.append(list(np.zeros(268)))
            '''
			nWords = len(instanceVectors)
			if nWords > maxWords:
				maxWords = nWords
			if nWords < minWords:
				minWords = nWords

			avgWords += nWords
			
			'''

            self.dataset.append(instanceVectors)
            fd.close()
            print(i, "of", nTexts)

        fdOutLabels.close()
        self.dataset = np.array(self.dataset)
        #save feature vectors per text
        np.save(pathVectors, self.dataset)

        fdOutCorrespondence.write(str(self.labelDict))
        fdOutCorrespondence.close()
        '''
Example #20
    def __init__(self,
                 corpdb=fwc.DEF_CORPDB,
                 corptable=fwc.DEF_CORPTABLE,
                 correl_field=fwc.DEF_CORREL_FIELD,
                 mysql_host="localhost",
                 message_field=fwc.DEF_MESSAGE_FIELD,
                 messageid_field=fwc.DEF_MESSAGEID_FIELD,
                 encoding=fwc.DEF_ENCODING,
                 use_unicode=fwc.DEF_UNICODE_SWITCH,
                 lexicondb=fwc.DEF_LEXICON_DB,
                 featureTable=fwc.DEF_FEAT_TABLE,
                 featNames=fwc.DEF_FEAT_NAMES,
                 date_field=fwc.DEF_DATE_FIELD,
                 outcome_table=fwc.DEF_OUTCOME_TABLE,
                 outcome_value_fields=[fwc.DEF_OUTCOME_FIELD],
                 outcome_controls=fwc.DEF_OUTCOME_CONTROLS,
                 outcome_interaction=fwc.DEF_OUTCOME_CONTROLS,
                 group_freq_thresh=None,
                 featureMappingTable='',
                 featureMappingLex='',
                 output_name='',
                 wordTable=None,
                 model=fwc.DEF_MODEL,
                 feature_selection='',
                 feature_selection_string='',
                 init=None):

        if feature_selection_string or feature_selection:
            RegressionPredictor.featureSelectionString = feature_selection if feature_selection else feature_selection_string

        if init:
            self.fw = FeatureWorker(
                corpdb, corptable, correl_field, mysql_host, message_field,
                messageid_field, encoding, use_unicode, lexicondb, date_field,
                wordTable) if 'fw' in init else None
            self.fg = FeatureGetter(
                corpdb, corptable, correl_field, mysql_host, message_field,
                messageid_field, encoding, use_unicode, lexicondb,
                featureTable, featNames, wordTable) if 'fg' in init else None
            self.fe = FeatureExtractor(
                corpdb,
                corptable,
                correl_field,
                mysql_host,
                message_field,
                messageid_field,
                encoding,
                use_unicode,
                lexicondb,
                wordTable=wordTable) if 'fe' in init else None
            self.fr = FeatureRefiner(
                corpdb, corptable, correl_field, mysql_host, message_field,
                messageid_field, encoding, use_unicode, lexicondb,
                featureTable, featNames, wordTable) if 'fr' in init else None
            self.og = OutcomeGetter(
                corpdb, corptable, correl_field, mysql_host, message_field,
                messageid_field, encoding, use_unicode, lexicondb,
                outcome_table, outcome_value_fields, outcome_controls,
                outcome_interaction, group_freq_thresh, featureMappingTable,
                featureMappingLex, wordTable) if 'og' in init else None
            self.oa = OutcomeAnalyzer(
                corpdb, corptable, correl_field, mysql_host, message_field,
                messageid_field, encoding, use_unicode, lexicondb,
                outcome_table, outcome_value_fields, outcome_controls,
                outcome_interaction, group_freq_thresh, featureMappingTable,
                featureMappingLex, output_name,
                wordTable) if 'oa' in init else None
            self.rp = RegressionPredictor(self.og, self.fg,
                                          model) if 'rp' in init else None
            self.cp = ClassifyPredictor(self.og, self.fg,
                                        model) if 'cp' in init else None

        else:
            self.fw = FeatureWorker(corpdb, corptable, correl_field,
                                    mysql_host, message_field, messageid_field,
                                    encoding, use_unicode, lexicondb,
                                    date_field, wordTable)
            self.fg = FeatureGetter(corpdb, corptable, correl_field,
                                    mysql_host, message_field, messageid_field,
                                    encoding, use_unicode, lexicondb,
                                    featureTable, featNames, wordTable)
            self.fe = FeatureExtractor(corpdb,
                                       corptable,
                                       correl_field,
                                       mysql_host,
                                       message_field,
                                       messageid_field,
                                       encoding,
                                       use_unicode,
                                       lexicondb,
                                       wordTable=wordTable)
            self.fr = FeatureRefiner(corpdb, corptable, correl_field,
                                     mysql_host, message_field,
                                     messageid_field, encoding, use_unicode,
                                     lexicondb, featureTable, featNames,
                                     wordTable)
            self.og = OutcomeGetter(corpdb, corptable, correl_field,
                                    mysql_host, message_field, messageid_field,
                                    encoding, use_unicode, lexicondb,
                                    outcome_table, outcome_value_fields,
                                    outcome_controls, outcome_interaction,
                                    group_freq_thresh, featureMappingTable,
                                    featureMappingLex, wordTable)
            self.oa = OutcomeAnalyzer(
                corpdb, corptable, correl_field, mysql_host, message_field,
                messageid_field, encoding, use_unicode, lexicondb,
                outcome_table, outcome_value_fields, outcome_controls,
                outcome_interaction, group_freq_thresh, featureMappingTable,
                featureMappingLex, output_name, wordTable)
            self.rp = RegressionPredictor(self.og, self.fg, model)
            self.cp = ClassifyPredictor(self.og, self.fg, model)

        self.allFW = {
            "FeatureWorker": self.fw,
            "FeatureGetter": self.fg,
            "FeatureExtractor": self.fe,
            "FeatureRefiner": self.fr,
            "OutcomeGetter": self.og,
            "OutcomeAnalyzer": self.oa,
            "RegressionPredictor": self.rp,
            "ClassifyPredictor": self.cp,
        }
Example #21
	def __init__(self, n):
		self.data = DataManager('../data/train.csv','../data/test.csv', n)
		self.fe = FeatureExtractor(self.data)
		self.eval = Evaluate()
    vp = ViralityPrediction(normalize=True,
                            balance=True,
                            tweet_threshold=50000,
                            score=False,
                            dump_model=False)
    hashtagIndex = HashtagIndex()

    virality = {}
    hashtags_features = {}
    hashtags_virality = {}
    hashtags = [
        k for (k, v) in hashtagIndex.items(
            sort=True, descending=True, min_values=100)
    ]
    print "Extracting features..."
    for hashtag in hashtags:
        _, featureList, vir = FeatureExtractor.loadFromDB(
            tweets_id=hashtagIndex.find(hashtag))
        hashtags_features[hashtag] = featureList
        hashtags_virality[hashtag] = vir
        virality[hashtag] = sum(np.array(vir)[:, 0])

    # Sort predictions by value and print top-K results
    predictions = vp.predict(hashtags_features)
    sorted_predictions = sorted(predictions.items(),
                                key=operator.itemgetter(1),
                                reverse=True)
    print "\nTop " + str(ViralityPrediction.K) + " virality predictions:"
    for i in range(0, min(ViralityPrediction.K, len(sorted_predictions))):
        print sorted_predictions[i]

    # Sort expected virality by value and print top-K results
    #sorted_virality = sorted(virality.items(), key=operator.itemgetter(1), reverse=True)
Example #23
class Trip:
    def __init__(self, vehicle):
        print("Trip Starts")
        self.vehicle = vehicle
        self.timer = 0
        self.data = []
        self.featureExtractor = FeatureExtractor()
        self.tripNo = 1

    def end_trip(self, status, vehicles):
        ## Calculate the score
        score = self.getScore()
        print("Score", score)
        if status == "successful":
            ## Write data in a file
            np.save("trips/trip"+str(int(time.time())), self.data)
            #print("Data ", self.data)
        if status == "collided":
            ## Write data in a file
            np.save("trips/trip"+str(int(time.time())) + "_collided", self.data)
            #print("Data ", self.data)
        self.reset_trip(vehicles)
        return score

    def reset_trip(self, vehicles):
        print("Previous Trip was for ", self.timer)
        self.timer = 0
        self.tripNo += 1
        for vehicle in vehicles:
            vehicle.reset()
        self.data = []

    def getblob(self, x, y, w, h):
        buffer = (GLubyte * (3*w*h))(0)
        glReadPixels(x, y, w, h, GL_RGB, GL_UNSIGNED_BYTE, buffer)
        # Use PIL to convert raw RGB buffer and flip the right way up
        image = Image.frombytes(mode="RGB", size=(w, h), data=buffer)
        image = image.transpose(Image.FLIP_TOP_BOTTOM)
        img = np.array(image)
        #print(type(image), image.size)
        #image.save('jpap.png')
        return img

    def getFeatures(self, vehicles):
        #return self.vehicle.velocity
        blob = self.getblob(0, 0, 700, 850)
        return self.featureExtractor.sendFeaturesFrom(self.vehicle, blob, vehicles)

    def getScore(self):
        alpha1 = 0.1
        alpha2 = 0.1
        velocitiesX = []
        velocitiesY = []
        number_of_collisions = 0
        for dat in self.data:
            velocitiesX.append(dat[0])
            velocitiesY.append(dat[1])
            if dat[-2] != 0:
                number_of_collisions += 1
        #print("variance and number of collisions ", np.var(velocitiesX), np.var(velocitiesY), number_of_collisions)
        return alpha1*(np.var(velocitiesX) + np.var(velocitiesY))/2 + alpha2*number_of_collisions

    def check_trip_status_update(self, road, vehicles, control):
        ## If vehicle has finished movement or gone out of the screen end the trip and reset the trip
        is_in_collision = 0
        new_trip = False
        if self.out_of_road(road):
            print("Out of the Roads")
            score = self.end_trip("unsuccessful", vehicles)
            new_trip = True
        elif self.to_be_collided(vehicles, road):
            ## Change the color of the car or something like that.
            print("Collision")
            score = self.end_trip("collided", vehicles)
            new_trip = True
            is_in_collision = 1
        elif self.completed_trip():
            print("Completed Trip")
            score = self.end_trip("successful", vehicles)
            new_trip = True
        #if control != "No_Control":
        features = self.getFeatures(vehicles)
        if new_trip == False:
            score = self.getScore()

        features.append(score)
        features.append(is_in_collision)
        features.append(control)
        self.data.append(features)
        #print("features ", features)
        self.update_trip()
        return new_trip, score

    def completed_trip(self):
        #print("Completed Trip thing ", self.vehicle.pos)
        return self.vehicle.pos[0] > 10 and self.vehicle.pos[1] > -10 and self.vehicle.pos[1] < 10

    def to_be_collided(self, vehicles, road):
        ret = False
        for vehicle in vehicles:
            if vehicle.idno == self.vehicle.idno:
                continue
            v1_bottomx = self.vehicle.pos[0] - self.vehicle.size[0]/2
            v1_bottomy = self.vehicle.pos[1] - self.vehicle.size[1]/2
            v1_topx    = self.vehicle.pos[0] + self.vehicle.size[0]/2
            v1_topy    = self.vehicle.pos[1] + self.vehicle.size[1]/2
            v2_bottomx = vehicle.pos[0] - vehicle.size[0]/2
            v2_bottomy = vehicle.pos[1] - vehicle.size[1]/2
            v2_topx    = vehicle.pos[0] + vehicle.size[0]/2
            v2_topy    = vehicle.pos[1] + vehicle.size[1]/2
            if(not ((v1_topx < v2_bottomx or v2_topx < v1_bottomx) or (v1_topy + 1 < v2_bottomy or v2_topy + 1 < v1_bottomy))):
                ret = ret or True
                #print("Vehicle 1 and 2 ", v1_bottomx, v1_bottomy, v1_topx, v1_topy, v2_bottomx, v2_bottomy, v2_topx, v2_topy)
                #print("Colliding vehicles", vehicle.idno, self.vehicle.idno)
        return ret
    
    def out_of_road(self, road):
        ret = True
        for roadSegment in road.getBasicRoad():
            left_bottom = [roadSegment.x, roadSegment.y]
            right_top = [roadSegment.x + roadSegment.width, roadSegment.y + roadSegment.height]
            vehicle_posX = self.vehicle.pos[0]
            vehicle_posY = self.vehicle.pos[1]
            if ((vehicle_posX < left_bottom[0] or vehicle_posX > right_top[0]) or (vehicle_posY < left_bottom[1] or vehicle_posY > right_top[1])):
                ret = ret and True
            else:
                ret = ret and False
        return ret

    def update_trip(self):
        self.timer += 1
Example #24
## Adding the target vehicle
vehicles.append(
    Vehicle(4, (1, 2, 1), (-5, -40, 0), (1, 1, 0), 0, [0, 0.2, 0], False,
            "manual" if path_mode == 0 else "physics"))
#print("vehicles returned are ", vehicles)
actions = Actions()
road = Road()
time_units = 0
trip = Trip(vehicles[-1])
if path_constraints[path_mode] == "FREE":
    commander = nn_manual_commander()
else:
    commander = nn_manual_commander()
    #commander = physics_commander()

featureExtractor = FeatureExtractor()


def init_vehicles():
    ## Set of vehicles moving in the car.
    global level
    global trafficInjection
    global vehicles
    global time_units
    global trip
    global path_mode
    global commander
    if level == 0:
        trafficInjection = UniformTrafficInjection()
    else:
        trafficInjection = RandomTrafficInjection()
Example #25
        join(inputPath, f) for f in listdir(inputPath)
        if isfile(join(inputPath, f))
    ]
else:
    imagePaths = [
        inputPath,
    ]

print("loading images...")
Images = loadBatch(imagePaths)
print("loading SVM model...")
clf = joblib.load(svmPath)

print(
    "Extracting features, this may take a while for large collections of images..."
)
extractor = FeatureExtractor()
features = extractor.get_features(Images)

classes = clf.best_estimator_.classes_ if hasattr(
    clf, "best_estimator_") else clf.classes_
print("Predicting the Hieroglyph type...")
prob = np.array(clf.predict_proba(features))
top5_i = np.argsort(-prob)[:, 0:5]
top5_s = np.array(
    [prob[row, top5_i[row]] for row, top5_i_row in enumerate(top5_i)])
top5_n = classes[top5_i]

print("{:<25} ::: {}".format("image name", "top 5 best matching hieroglyphs"))
for idx, path in enumerate(imagePaths):
    print("{:<25} --> {}".format(os.path.basename(path), top5_n[idx]))
    else:
        featureSettings['polarity'] = set(
            i for i in args.polarityFeatures.split(','))

#elif args.featuresFile:
#    features = set(args.featuresFile.read().splitlines())
#else:
#    features = {}
#print(features)

if args.markersFile:
    featureSettings['markersFile'] = args.markersFile
    #hiddenFileName += '_markers'

print(featureSettings)
fe = FeatureExtractor(**featureSettings)

os.mkdir(args.session)
os.chdir(args.session)

if not args.dataDir:
    dataDir = '/local/nlp/chidey/social_meaning/aclImdb/'
else:
    dataDir = args.dataDir
main(dataDir, fe, '{}_{}'.format(args.session, 0), args.numtrain)

learnCommandTemplate = '/local/nlp/chidey/social_meaning/yessenalina/sle_movieReviews/bin/svm_sle_learn -v 3 -c {0} -l {1} {2}_{3}_{4} hidden_vars_{3}_{4}_{2} model_{3}_{4}'.format(
    args.c, args.l, '{0}', args.session, '{1}')
#2=train/validate/testfile
#3=session
#4=iteration
    def model_training(self):

        # Check whether the feature file is already present; if so, there is no need to recompute the features
        # The pre-computed features can also be downloaded from http://iamai.nl/downloads/features.npy
        if not isfile(self.featurePath):
            print("indexing images...")
            Steles = [
                join(self.stelePath, f) for f in listdir(self.stelePath)
                if isdir(join(self.stelePath, f))
            ]
            for stele in Steles:
                imagePaths = [
                    join(stele, f) for f in listdir(stele)
                    if isfile(join(stele, f))
                ]
                for path in imagePaths:
                    self.image_paths.append(path)
                    self.labels.append(path[(path.rfind("_") +
                                             1):path.rfind(".")])

            featureExtractor = FeatureExtractor()
            features = []
            print("computing features...")
            for idx, (batch_images, _) in enumerate(
                    batchGenerator(self.image_paths, self.labels,
                                   self.batch_size)):
                print("{}/{}".format((idx + 1) * self.batch_size,
                                     len(self.labels)))
                features_ = featureExtractor.get_features(batch_images)
                features.append(features_)
            features = np.vstack(features)

            labels = np.asarray(self.labels)
            print("saving features...")
            np.save(self.featurePath, features)
            np.save(self.labelsPath, labels)
        else:
            print("loading precomputed features and labels from {} and {}".
                  format(self.featurePath, self.labelsPath))
            features = np.load(self.featurePath)
            labels = np.load(self.labelsPath)

        # On to the SVM training phase
        tobeDeleted = np.nonzero(
            labels == "UNKNOWN")  # Remove the Unknown class from the database
        features = np.delete(features, tobeDeleted, 0)
        labels = np.delete(labels, tobeDeleted, 0)
        numImages = len(labels)
        trainSet, testSet, trainLabels, testLabels = train_test_split(
            features, labels, test_size=0.20, random_state=42)

        # Training the SVM; feel free to use a linear SVM (or another classifier for that matter) for faster training, but that will not give the confidence scores that can be used to rank hieroglyphs
        print("training SVM...")
        if 0:  # optional: either train one classifier fast, or search through the parameter space by training multiple classifiers to squeeze out that extra 2%
            clf = linear_model.LogisticRegression(C=10000)
        else:
            svr = linear_model.LogisticRegression()
            parameters = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]}
            clf = GridSearchCV(svr, parameters, n_jobs=8)
        clf.fit(trainSet, trainLabels)

        print(clf)
        print("finished training! saving...")
        joblib.dump(clf, self.svmPath, compress=1)

        prediction = clf.predict(testSet)
        accuracy = np.sum(testLabels == prediction) / float(len(prediction))

        # for idx, pred in enumerate(prediction):
        #     print("%-5s --> %s" % (testLabels[idx], pred))
        print("accuracy = {}%".format(accuracy * 100))
Example #28
		
		
	def predict(self, features):
		''' Predict importances '''
		features = self.norm.transform(features[:,0:self.n_comp])
		results = self.svr.predict(features)
		#print results[0:100:5]
		results = self.std_scaler_i.inverse_transform(results)
		#print results[0:100:5]
		return results

if __name__ == '__main__':
	import corpus, featureExtractor
	from featureExtractor import FeatureExtractor
	
	print 'Loading corpus ...'
	corpus = corpus.TwitterCorpus()
	tweets = corpus.all_tweets()
	importances = np.array([featureExtractor.tweet_importance(t) for t in tweets])
	
	# try to load feature vectors
	try: v = joblib.load('data/cache/vectors.joblib')
	except:
		print 'FeatureExtractor fit transform ...'
		feat = FeatureExtractor()
		v = feat.train(tweets, importances)
		joblib.dump(v, 'data/cache/vectors.joblib')

	print 'HotTweets train ...'
	ht = HotTweets()
	ht.train(v[0:1000], importances[0:1000])
Example #29
class StanceDetector:
	def __init__(self, n):
		self.data = DataManager('../data/train.csv','../data/test.csv', n)
		self.fe = FeatureExtractor(self.data)
		self.eval = Evaluate()

	def buildBaseline(self, model):
		print 'Training baseline',model
		feats = ['words']
		y_attribute = 'stance'
		X,y = self.fe.getFeaturesMatrix('train',feats,y_attribute)
		X_test,y_true = self.fe.getFeaturesMatrix('test',feats,y_attribute)
		for mode in ['simple','tfidf']:
			if model=='bayes':
				cl = MultinomialNB()
			elif model=='svm':
				cl = LinearSVC()

			if mode=='tfidf':
				cl = Pipeline([('tfidf', TfidfTransformer()),
					  ('clf', cl), ])

			clf = cl.fit(X, y)
			y_pred = clf.predict(X_test)
			print mode, accuracy_score(y_true, y_pred)
			pprint(self.eval.computeFscores(self.data.testTweets, self.fe.labelenc.inverse_transform(y_pred)))


	def buildSimple(self, model):
		feats = ['topicVecs','words2vec']
		print feats
		y_attribute = 'stance'
		X,y = self.fe.getFeaturesMatrix('train',feats,y_attribute)
		X_test,y_true = self.fe.getFeaturesMatrix('test',feats,y_attribute)
		for mode in ['simple']:#,'tfidf']:
			if model=='bayes':
				cl = MultinomialNB()
			elif model=='svm':
				# cl = LinearSVC()
				cl = LinearSVC()
				cl = GridSearchCV(cl, self.getGridSearchParams())

			if mode=='tfidf':
				cl = Pipeline([('tfidf', TfidfTransformer()),
					  ('clf', cl), ])
			
			clf = cl.fit(X, y)
			# print cl.best_params_
			y_pred = clf.predict(X_test)
			print mode, accuracy_score(y_true, y_pred)
			pprint(self.eval.computeFscores(self.data.testTweets, self.fe.labelenc.inverse_transform(y_pred)))

	#train in name means helper function
	def trainSVC(self, feats, y_attribute, proba=False):
		X,y = self.fe.getFeaturesMatrix('train',feats,y_attribute)
		X_test,y_true = self.fe.getFeaturesMatrix('test',feats,y_attribute)
		
		clf = SVC(probability=proba)
		clf = clf.fit(X,y)
		if proba:
			y_proba = clf.predict_proba(X_test)
			return clf, y_proba
		else:
			y_pr = clf.predict(X_test)
			return clf, y_pr
	
	def trainLinearSVC(self, feats, y_attribute, dec=False):
		X,y = self.fe.getFeaturesMatrix('train',feats,y_attribute)
		X_test,y_true = self.fe.getFeaturesMatrix('test',feats,y_attribute)
		
		clf = LinearSVC()
		clf = clf.fit(X,y)
		if dec:
			y_pr = clf.decision_function(X_test)
			return clf, y_pr
		else:
			y_pr = clf.predict(X_test)
			return clf, y_pr

	#TODO: revisit
	#check lable transform encodings of NONE, FAVOR, AGAINST
	# def buildTopicStanceSeparate(self):
	# 	feats = ['words']
	# 	y_attribute = 'stance'
	# 	X_test,y_true = self.fe.getFeaturesMatrix('test',feats,y_attribute)

	# 	#builds two separate for topic and stance
	# 	topic_clf, y_topic_proba = self.trainLinearSVC(feats = ['words','lexiconsbyword'],y_attribute = 'topic',dec=True)
		
	# 	#WRONR
	# 	#WRONG
	# 	#WRONG
	# 	boost_factors = np.ones_like(y_true)
	# 	#multiply by NONE (0) = 0
	# 	#multiply by FAVOR (1) = 1
	# 	#multiply by AGAINST (2) = 2

	# 	#has index of class with max prob for each sample
	# 	topic_preds = np.argmax(y_topic_proba,axis=1)
	# 	for ind,s in enumerate(y_topic_proba):
	# 		prob = y_topic_proba[ind][topic_preds[ind]]
	# 		if prob < 0.4:
	# 			boost_factors[ind] = 0 #corresponds to NONE
		
	# 	stance_clf,stance_pred = self.trainLinearSVC(feats = ['words','lexiconsbyword','topic'],y_attribute = 'stance')		
		
	# 	# for i in range(0, len(stance_pred)):
	# 	# 	if boost_factors[i] == 2:
	# 	# 		stance_pred[i] = self.fe.labelenc.transform("NONE")
		
	# 	#with numpy arrays now, above is equivalent to below , right?
	# 	stance_pred = np.multiply(stance_pred, boost_factors)
	# 	stance_pred_labels = self.fe.labelenc.inverse_transform(stance_pred)

	# 	# print [(self.data.testLabels[i], stance_pred_labels[i]) for i in range(len(stance_pred))]
	# 	score = accuracy_score(y_true, stance_pred)
	# 	print score
	# 	pprint(self.eval.computeFscores(self.data.testTweets, stance_pred_labels))

	def buildTopicOnlyMultiple(self):
		#one svm for each topic
		feats = ['words2vec']
		y_attribute = 'stance'
		clf_topic = {}
		for topic in list(self.fe.topicenc.classes_):
			X,y = self.fe.getFeaturesMatrix('train',feats,y_attribute, topic)
			Xt,yt = self.fe.getFeaturesMatrix('test',feats,y_attribute, topic)
			clf = LinearSVC()
			clf = clf.fit(X,y)
			clf_topic[topic] = clf
			print topic, clf.score(Xt,yt)

		# not useful. still less than single SVM. but not as much as avg of above

		# X_whole,y_whole = self.fe.getFeaturesMatrix('train',feats,y_attribute)
		# Xt,yt = self.fe.getFeaturesMatrix('test',feats,y_attribute)
		# newX = []
		# newXt = []
		# for topic in clf_topic:
		# 	newX.append(clf_topic[topic].transform(X_whole))
		# 	newXt.append(clf_topic[topic].transform(Xt))
		# newX = np.concatenate(tuple(newX),axis=1)
		# newXt = np.concatenate(tuple(newXt),axis=1)
		# newclf = LinearSVC()
		# newclf = newclf.fit(newX, y_whole)
		# print newclf.score(newXt, yt)

	def trainTopicSVM(self, topic):
		feats = ['words2vec','clusteredLexicons','topic1hot']
		y_attribute = 'stance'
		
		X,y = self.fe.getFeaturesTopicNontopic('train',feats,y_attribute, topic=topic)
		X_test,y_true = self.fe.getFeaturesTopicNontopic('test',feats,y_attribute, topic=topic)
		clf = LinearSVC()
		clf = GridSearchCV(clf,self.getGridSearchParams())
		clf = clf.fit(X,y)
		print clf.best_params_
		print topic #,clf.score(X_test, y_true)
		return clf
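	# GridSearchCV refits the best estimator on the whole training split by default (refit=True),
	# so the clf returned above predicts with the best parameters reported by best_params_.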
	
	#WRITE
	#WRITE
	#WRITE
	def buildTopicWise(self):
		#separate SVC for each topic, tested on that topic only first, then on all
		topic_clf = {}
		feats = ['words2vec','clusteredLexicons','topic1hot']
		
		y_attribute = 'stance'
		X,y = self.fe.getFeaturesMatrix('train',feats,y_attribute)
		X_test,y_true = self.fe.getFeaturesMatrix('test',feats,y_attribute)

		#X matrix for new classifier which uses this as train matrix
		#has columns of each topic classifier's confidence function
		# X_fx = []
		# X_ftestx = []
		preds = []
		for topic in list(self.fe.topicenc.classes_):
			topic_clf[topic] = self.trainTopicSVM(topic)
			preds.append(topic_clf[topic].predict(X_test))
			# X_fx.append(topic_clf[topic].decision_function(X))
			# X_ftestx.append(topic_clf[topic].decision_function(X_test))
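		# Each row of allpreds (stacked below) is one topic-classifier's prediction for every test
		# sample; multiplying element-wise by the transposed topic one-hot matrix zeroes out the
		# predictions coming from non-matching topics, and summing over rows keeps, per sample, only
		# the prediction of its own topic's classifier. The allpreds==5 remap appears to normalise a
		# sentinel label value before the masking (assumption; the encoding is not visible here).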

		allpreds = np.vstack(tuple(preds))
		topic1hot, temp = self.fe.getFeaturesMatrix('test',['topic1hot'],'stance')
		# print allpreds.shape, topic1hot.T.shape
		allpreds[allpreds==5] = 1
		final_pred = np.multiply(topic1hot.T,allpreds)

		prediction = np.sum(final_pred, axis=0).astype(int)
		# X_fx = np.concatenate(tuple(X_fx), axis=1)
		# X_ftestx = np.concatenate(tuple(X_ftestx), axis=1)
		# clf = LinearSVC().fit(X_fx, y)
		# y_pred = clf.predict(X_ftestx)
		print accuracy_score(y_true, prediction)
		pprint(self.eval.computeFscores(self.data.testTweets, self.fe.labelenc.inverse_transform(prediction)))


	#GOOD: ~66% accuracy
	#1.2% accuracy increase from switching topic to a one-hot encoding
	def buildSVMWord2Vec(self):
		feats = ['words2vec','topic1hot']
		y_attribute = 'stance'
		X,y = self.fe.getFeaturesMatrix('train',feats,y_attribute)
		Xt,yt = self.fe.getFeaturesMatrix('test',feats,y_attribute)
		clf = LinearSVC(C=0.01,penalty='l1',dual=False)
		clf = clf.fit(X,y)
		y_pred = clf.predict(Xt)
		print clf.score(Xt, yt)
		pprint(self.eval.computeFscores(self.data.testTweets, self.fe.labelenc.inverse_transform(y_pred)))

	def buildSVMTrial(self):
		feats = ['topic1hot','words2vec']
		y_attribute = 'stance'
		X,y = self.fe.getFeaturesMatrix('train',feats,y_attribute)
		Xt,yt = self.fe.getFeaturesMatrix('test',feats,y_attribute)		
		clf = LinearSVC(C=0.001)
		clf = clf.fit(X,y)
		y_pred = clf.predict(Xt)
		print clf.score(Xt, yt)
		pprint(self.eval.computeFscores(self.data.testTweets, self.fe.labelenc.inverse_transform(y_pred)))
	
	def buildTrial(self):
		# feats = ['pos','words2vec','clusteredLexicons','topic1hot']
		# 'givenSentiment','givenOpinion'
		feats = ['words2vec','pos','clusteredLexicons','top1grams','top2grams']
		y_attribute = 'stance'
		X,y = self.fe.getFeaturesMatrix('train',feats,y_attribute)
		Xt,yt = self.fe.getFeaturesMatrix('test',feats,y_attribute)		
		# clf = DecisionTreeClassifier()
		# clf = LogisticRegression()
		clf = LinearSVC(C=1, class_weight='balanced', penalty='l1',dual=False)
		clf = clf.fit(X,y)
		y_pred = clf.predict(Xt)
		# print y_pred
		print len(np.where(y_pred==0)[0]),len(np.where(y_pred==1)[0]),len(np.where(y_pred>1)[0])
		print len(y_pred)
		print 'training accuracy',clf.score(X, y)
		print clf.score(Xt, yt)
		pprint(self.eval.computeFscores(self.data.testTweets, self.fe.labelenc.inverse_transform(y_pred)))

	def buildGithubSGDModel(self):
		# feats = ['words2vec','topic1hot','pos']
		y_attribute = 'stance'
		dataset = self.fe.getDataset('train')
		dataset2 = self.fe.getDataset('test')
		y_train = self.fe.getY('train',dataset, y_attribute)
		y_test = self.fe.getY('test', dataset2, y_attribute)

		tfidf = TfidfVectorizer(ngram_range=(1, 2), max_df=1.0, min_df=1, binary=True, norm='l2', use_idf=True, smooth_idf=False, sublinear_tf=True, encoding='latin1')
		
		X_train = tfidf.fit_transform(self.data.trainTweetsText)
		X_test = tfidf.transform(self.data.testTweetsText)
		tuned_parameters = {'alpha': [10 ** a for a in range(-12, 0)]}
		clf = GridSearchCV(SGDClassifier(loss='hinge', penalty='elasticnet', l1_ratio=0.75, n_iter=10, shuffle=True, verbose=False, n_jobs=4, average=False),
		                   tuned_parameters, cv=10, scoring='f1_weighted')

		clf.fit(X_train, y_train)
		print clf.best_params_
		print("Grid scores on development set:")
		for params, mean_score, scores in clf.grid_scores_:
			print "%0.3f (+/-%0.03f) for %r" % (mean_score, scores.std() * 2, params)
		print classification_report(y_test, clf.predict(X_test))
		print clf.score(X_test, y_test)


	def getGridSearchParams(self):
		param_grid = [
			{'C': [0.001, 0.01, 0.1, 1], 'dual': [False, True], 'class_weight': ['balanced', None]}
		]
		return param_grid

	def getGridSearchParamsForXGBoost(self):
		param_grid = [
			{'n_estimators': [10, 20, 30, 40, 50], 'max_depth': [1, 2, 3, 4, 5]}
		]
		return param_grid
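	# Intended to be plugged into GridSearchCV, mirroring the commented-out call in word2VecXGBoost:
	#   m2_xgb = GridSearchCV(xgb.XGBClassifier(nthread=-1), self.getGridSearchParamsForXGBoost())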

	def buildSVMWord2VecWithClusters(self):
		#feats = ['topic1hot']
		#feats = ['words2vec', 'top1grams', 'top2grams']
		#feats = ['words2vec', 'top1grams']
		#feats = ['words2vec', 'top2grams']
		feats = ['words2vec', 'clusteredLexicons', 'topic1hot', 'pos']
		#feats = ['words2vec','topic1hot', 'pos','clusteredLexicons', 'top2grams']
		#feats = ['clusteredLexicons']
		#feats = ['pos']
		y_attribute = 'stance'
		X,y = self.fe.getFeaturesMatrix('train',feats,y_attribute)
		print (X.shape)
		Xt,yt = self.fe.getFeaturesMatrix('test',feats,y_attribute)
		clf = LinearSVC(C=1,penalty='l1',dual=False)
		clf = clf.fit(X,y)
		y_pred = clf.predict(Xt)
		# f = open('pred','w')
		# for i in y_pred:
		# 	#print type(i)
		# 	f.write('{0}'.format(i))
		# f.close()
		accuracy = clf.score(Xt, yt)
		# print clf.score(Xt, yt)
		fscores = self.eval.computeFscores(self.data.testTweets, self.fe.labelenc.inverse_transform(y_pred))
		# print type(fscores)
		# print fscores
		# pprint(self.eval.computeFscores(self.data.testTweets, self.fe.labelenc.inverse_transform(y_pred)))
		# print (accuracy, fscores['Macro'])
		return (accuracy, fscores['Macro'])
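	# Unlike the other build* methods this one returns (accuracy, macro-F) instead of printing,
	# presumably so a caller can sweep feature combinations, e.g.:
	#   acc, macro_f = self.buildSVMWord2VecWithClusters()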

	def buildSVMWord2VecWithClustersGridSearch(self):
		feats = ['words2vec','topic1hot','pos', 'clusteredLexicons']
		y_attribute = 'stance'
		X,y = self.fe.getFeaturesMatrix('train',feats,y_attribute)
		Xt,yt = self.fe.getFeaturesMatrix('test',feats,y_attribute)
		
		svmclf = LinearSVC(C=0.01,penalty='l1',dual=False)
		clf = GridSearchCV(svmclf, self.getGridSearchParams())
		clf = clf.fit(X,y)
		print clf.best_params_

		y_pred = clf.predict(Xt)
		
		print clf.score(Xt, yt)
		pprint(self.eval.computeFscores(self.data.testTweets, self.fe.labelenc.inverse_transform(y_pred)))

	def trainStanceNone(self, feats):
		# feats = ['words2vec','topic1hot','pos']
		X,y = self.fe.getFeaturesStanceNone('train',feats)
		Xt,yt = self.fe.getFeaturesStanceNone('test',feats)
		svmclf = LinearSVC()
		stance_none_clf = GridSearchCV(svmclf, self.getGridSearchParams()).fit(X, y)
		# print stance_none_clf.score(Xt, yt)
		pred = stance_none_clf.predict(Xt)
		print classification_report(yt, pred)
		return stance_none_clf

	def trainFavorAgainst(self,feats):
		# feats = ['words2vec','topic1hot','pos']
		X,y = self.fe.getFeaturesFavorAgainst('train',feats)
		Xt,yt = self.fe.getFeaturesFavorAgainst('test',feats)
		svmclf = LinearSVC()
		fav_agnst_clf = GridSearchCV(svmclf, self.getGridSearchParams()).fit(X, y)
		pred = fav_agnst_clf.predict(Xt)
		print classification_report(yt, pred)

		# print fav_agnst_clf.score(Xt, yt)
		return fav_agnst_clf

	def buildModel2(self):
		#one SVM for Stance/None and another for Favor/Against
		feats = ['words2vec','topic1hot','pos']
		print feats
		stance_none_clf = self.trainStanceNone(feats)
		fav_agnst_clf = self.trainFavorAgainst(feats)
		X_test,y_true = self.fe.getFeaturesMatrix('test',feats,'stance')
		st_pred = stance_none_clf.predict(X_test)
		favaga_pred = fav_agnst_clf.predict(X_test)
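		# Merge the two stages: wherever the Stance/None classifier predicts "has stance"
		# (encoded as 3, per the commented-out assert below), substitute the Favor/Against
		# prediction; NONE predictions are kept as-is.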
		for index,row in enumerate(st_pred):
			if row==3:
				st_pred[index] = favaga_pred[index]
		print classification_report(y_true, st_pred)
		print accuracy_score(y_true, st_pred)
		# assert(stance_none_clf.classes_[1]==3) #stance(3)
		# # >0 means this class - stance will be predicted
		# # <0 means none is predicted
		# confi = stance_none_clf.decision_function(X_test)
		# # treat as confident about none if confi<-0.25:
		# y_pred = fav_agnst_clf.predict(X_test)
		# print accuracy_score(y_true, y_pred)
		# pprint(self.eval.computeFscores(self.data.testTweets, self.fe.labelenc.inverse_transform(y_pred)))
		# threshold = -0.25
		# confi_high = np.where(confi<threshold)[0]
		# for loc in confi_high:
		# 	y_pred[loc] = self.fe.labelenc.transform('NONE')
		# print 'Boosted', accuracy_score(y_true, y_pred)
		# print len(np.where(y_pred==0)[0]),len(np.where(y_pred==1)[0]), len(np.where(y_pred==2)[0]),
		# pprint(self.eval.computeFscores(self.data.testTweets, self.fe.labelenc.inverse_transform(y_pred)))
		
	def get_proba_one(self, model, X):
		predicted = model.predict_proba(X)
		return predicted[:, 1]
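	# predict_proba columns follow model.classes_, so [:, 1] is the probability of the second
	# class; that is the positive-class probability only in a binary setting, while the stance
	# target here is three-way (NONE/FAVOR/AGAINST).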

	def runXGBoostModel(self,model, model_name, X, target, X_test, crossOn):
		print "Trying to fit model"
		print X.shape, target.shape
		model.fit(X, target)
		print "Successfully fit model"
		# the class probabilities are computed but not used further; hard test-set predictions are returned
		predicted = self.get_proba_one(model, X)
		predicted_test = model.predict(X_test)
		print predicted_test
		return predicted_test


	def word2VecXGBoost(self):
		feats = ['words2vec','pos','clusteredLexicons', 'top1grams','top2grams', 'topic1hot' ]
		#feats = ['words2vec']
		#feats = ['clusteredLexicons']
		#feats = ['pos']
		y_attribute = 'stance'
		X,y = self.fe.getFeaturesMatrix('train',feats,y_attribute)
		print (X.shape)
		Xt,yt = self.fe.getFeaturesMatrix('test',feats,y_attribute)
		#clf = LinearSVC(C=0.01,penalty='l1',dual=False)
		#clf = clf.fit(X,y)
		#y_pred = clf.predict(Xt)
		# f = open('pred','w')
		# for i in y_pred:
		# 	#print type(i)
		# 	f.write('{0}'.format(i))
		# f.close()
		#print clf.score(Xt, yt)
		#pprint(self.eval.computeFscores(self.data.testTweets, self.fe.labelenc.inverse_transform(y_pred)))
		m2_xgb = xgb.XGBClassifier(n_estimators=10, nthread=-1, max_depth=2, seed=500)
		#m2_xgb = GridSearchCV(m2_xgb, self.getGridSearchParamsForXGBoost())
		print "Run Model"
		y_pred = self.runXGBoostModel(m2_xgb, "m2_xgb_OS_ENN", X, y, Xt, True)
		# print type(yt)
		# print type(y_pred)
		# print len(yt)
		# print len(y_pred)
		# print yt.shape
		# print y_pred.shape
		# print yt
		# print y_pred
		# print(m2_xgb)
		print accuracy_score(yt, y_pred)

	def buildModel3(self):
		feats = [['words2vec'],['pos'],['clusteredLexicons']]
		y_attribute = 'stance'
		y_pred = []
		y_t = []
		for f in feats:
			X,y = self.fe.getFeaturesMatrix('train',f,y_attribute)
			Xt,yt = self.fe.getFeaturesMatrix('test',f,y_attribute)
			clf = SVC(C=1, probability=True)
			clf = clf.fit(X,y)
			train_transform = clf.predict_log_proba(X)
			test_transform = clf.predict_log_proba(Xt)
			# print 'Train transform ',train_transform.shape
			# print 'Test transform ',test_transform.shape
			y_pred.append(train_transform)
			y_t.append(test_transform)
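		# Each feature group gets its own probability-calibrated SVC; the per-group log-probability
		# matrices are summed element-wise (equivalent to multiplying the probabilities, a naive-
		# Bayes-style combination) and the result is fed to a logistic regression meta-classifier.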
		#y_pred_h = np.hstack(tuple(y_pred))
		#y_t_h = np.hstack(tuple(y_t))
		y_pred_h = np.sum(y_pred, axis=0)
		y_t_h = np.sum(y_t, axis=0)
		# print type(y_pred_h)
		# print y_pred_h[0]
		# print y_pred_h.shape
		regr = linear_model.LogisticRegression()
		regr.fit(y_pred_h, y)
		final_pred = regr.predict(y_t_h)
		print accuracy_score(yt, final_pred)
		pprint(self.eval.computeFscores(self.data.testTweets, self.fe.labelenc.inverse_transform(final_pred)))