def load_datasets(balance=False, viral_threshold=0):
    """ Return the training and testing datasets containing the tweet
    features followed by the retweet count """
    # Import data
    _, features, virality = FeatureExtractor.load(force=True)
    print "Building datasets..."

    # Concatenate the arrays into one along the second axis
    data = np.c_[features, np.array(virality)[:, 0]]
    RegressionModel.__dataset_range(data)

    # Duplicate viral tweets to balance the dataset
    if balance:
        data = RegressionModel.__balance_virality(
            dataset=data, threshold=viral_threshold)

    # Shuffle data
    np.random.shuffle(data)

    # Split the dataset into disjoint training and testing sets
    # (an earlier version let the test set overlap the training set)
    size = int(len(data) * RegressionModel.TRAINING_SIZE)
    training_set = data[:size]
    testing_set = data[size:]
    return training_set, testing_set
def testExtraction(self):
    featureExt = FeatureExtractor()
    agent = Player("aql agent")
    enemy = Player("greedy agent")
    gameState = GameState(agent, enemy)

    # Enemy accumulates one Water and one Fire card
    enemy.accumulatedCards["Water"] += 1
    enemy.accumulatedCards["Fire"] += 1
    features = featureExt.getFeatures(gameState, "action", agent.name)
    self.assertEqual(features["enemy-distance-to-closest-win"], 1)
    self.assertEqual(features["agent-distance-to-closest-win"], 4)

    # Agent gains a card; enemy swaps its Fire card for a second Water
    agent.cards.append((1, "Water"))
    enemy.accumulatedCards["Fire"] -= 1
    enemy.accumulatedCards["Water"] += 1
    features = featureExt.getFeatures(gameState, "action", agent.name)
    self.assertEqual(features["agent-distance-to-closest-win"], 3)
    self.assertEqual(features["enemy-distance-to-closest-win"], 1)
def load_datasets(balance=False, viral_threshold=0):
    """ Return the datasets containing the tweet features followed by the
    retweet count """
    # Import data
    _, features, virality = FeatureExtractor.load(force=True,
                                                  keepTweetWithoutHashtags=False)
    print "Building datasets..."

    # Concatenate the arrays into one along the second axis
    data = np.c_[features, np.array(virality)[:, 0]]
    return pd.DataFrame(data, columns=(FeatureExtractor.FEATURE_LABEL +
                                       FeatureExtractor.VIRALITY_LABEL))
def create_generators():
    # Create the training generator
    preloader = MatrixPreLoader(dataset_directory=training_filepath,
                                patients_to_use="ALL",
                                activity_types=activities_to_load,
                                print_loading_progress=False)
    matrix_data_generator = MatrixDataGenerator(preloader,
                                                matrix_dimensions=(224, 224),
                                                rgb=True, twoD=False,
                                                add_gaussian_noise=0,
                                                zero_sensors=0, batch_size=32,
                                                grab_data_from=(0, 1),
                                                overflow="BEFORE",
                                                print_loading_progress=False)
    training_generator = FeatureExtractor(matrix_data_generator,
                                          patient_fall_filepath,
                                          weigths_filepath, test=True)

    # Create the testing generator
    preloader = MatrixPreLoader(dataset_directory=testing_filepath,
                                patients_to_use="ALL",
                                activity_types=activities_to_load,
                                print_loading_progress=False)
    matrix_data_generator = MatrixDataGenerator(preloader,
                                                matrix_dimensions=(224, 224),
                                                rgb=True, twoD=False,
                                                add_gaussian_noise=0,
                                                zero_sensors=3, batch_size=50,
                                                grab_data_from=(0, 1),
                                                overflow="BEFORE",
                                                print_loading_progress=False)
    testing_generator = FeatureExtractor(matrix_data_generator,
                                         patient_fall_filepath,
                                         weigths_filepath, test=False)
    return training_generator, testing_generator
def extractFeatures(self, train_data, test_data):
    # Construct the feature extractor and fit its vectorizer on training data
    fe = FeatureExtractor()
    fe.buildVectorizer(train_data, self.config['featureKwargs'])

    # Make the feature path if it doesn't exist
    if not os.path.exists(self.feature_path):
        os.mkdir(self.feature_path)

    # Check if train vectors already exist
    if os.path.exists(os.path.join(self.feature_path, 'train_vectors.npz')):
        # If they do, load them
        train_vectors = load_npz(os.path.join(self.feature_path, 'train_vectors.npz'))
    else:
        # Make the train vectors, one block per configured feature
        train_vectors = [fe.process(feature, train_data)
                         for feature in self.config['features']]
        if len(train_vectors) > 1:
            train_vectors = numpy.concatenate(train_vectors, axis=1)
        else:
            train_vectors = train_vectors[0]
        # Save the train vectors
        save_npz(os.path.join(self.feature_path, 'train_vectors.npz'), train_vectors)

    # Check if test vectors already exist
    if os.path.exists(os.path.join(self.feature_path, 'test_vectors.npz')):
        # If they do, load them
        test_vectors = load_npz(os.path.join(self.feature_path, 'test_vectors.npz'))
    else:
        # Make the test vectors
        test_vectors = [fe.process(feature, test_data)
                        for feature in self.config['features']]
        if len(test_vectors) > 1:
            test_vectors = numpy.concatenate(test_vectors, axis=1)
        else:
            test_vectors = test_vectors[0]
        # Save the test vectors
        save_npz(os.path.join(self.feature_path, 'test_vectors.npz'), test_vectors)

    return train_vectors, test_vectors
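# --- Illustrative sketch (not part of the original class) ---
# The method above follows a load-or-compute caching pattern around
# scipy's save_npz/load_npz; isolated, it looks like this:
import os
import numpy as np
from scipy.sparse import csr_matrix, load_npz, save_npz

def cached_vectors(path, compute):
    # Load the cached sparse matrix if present, else compute and cache it.
    if os.path.exists(path):
        return load_npz(path)
    vectors = compute()
    save_npz(path, vectors)
    return vectors

# First call computes and saves; later calls load the cached file.
vecs = cached_vectors('train_vectors.npz', lambda: csr_matrix(np.eye(3)))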
def func4(self):
    vp = ViralityPrediction(normalize=True, balance=True,
                            tweet_threshold=50000, score=False,
                            dump_model=False)
    hashtagIndex = HashtagIndex()
    virality = {}
    hashtags_features = {}
    hashtags_virality = {}
    hashtags = [k for (k, v) in hashtagIndex.items(sort=True, descending=True,
                                                   min_values=100)]

    print "Extracting features..."
    for hashtag in hashtags:
        _, featureList, vir = FeatureExtractor.loadFromDB(
            tweets_id=hashtagIndex.find(hashtag))
        hashtags_features[hashtag] = featureList
        hashtags_virality[hashtag] = vir
        virality[hashtag] = sum(np.array(vir)[:, 0])

    # Sort predictions by value and print the top-K results
    predictions = vp.predict(hashtags_features)
    sorted_predictions = sorted(predictions.items(),
                                key=operator.itemgetter(1), reverse=True)
    print "\nTop " + str(ViralityPrediction.K) + " virality predictions:"
    print sorted_predictions
    for i in range(0, min(ViralityPrediction.K, len(sorted_predictions))):
        print sorted_predictions[i]
        listbox.insert(END, sorted_predictions[i])
    listbox.pack()
def extractFeatures(self, train_data, test_data):
    # Extract features and return them as concatenated dense arrays
    fe = FeatureExtractor(self.config['features'], self.config['featurePath'],
                          self.config['featureKwargs'])
    fe.buildVectorizer(train_data)

    # Check for already-done work (cached train vectors)
    if path.exists(self.config['featurePath'] + "train_data.pickle"):
        print("here's the error?")
        with open(self.config['featurePath'] + "train_data.pickle", "rb") as file:
            train_vectors = pickle.load(file)
    else:
        train_vectors = fe.process(train_data)
        with open(self.config['featurePath'] + "train_data.pickle", "wb+") as file:
            pickle.dump(train_vectors, file)

    if len(train_vectors) > 1:
        print("took option A")
        train_vectors = numpy.concatenate(train_vectors, axis=1)
    else:
        print("took option B")
        train_vectors = train_vectors[0]
    print(train_vectors.shape)
    print(train_vectors[1, :])

    # Check for already-done work (cached test vectors)
    if path.exists(self.config['featurePath'] + "test_data.pickle"):
        with open(self.config['featurePath'] + "test_data.pickle", "rb") as file:
            test_vectors = pickle.load(file)
    else:
        test_vectors = fe.process(test_data)
        with open(self.config['featurePath'] + "test_data.pickle", "wb+") as file:
            pickle.dump(test_vectors, file)

    if len(test_vectors) > 1:
        test_vectors = numpy.concatenate(test_vectors, axis=1)
    else:
        test_vectors = test_vectors[0]

    return train_vectors.toarray(), test_vectors.toarray()
# The pre-computed features can also be downloaded from http://iamai.nl/downloads/features.npy
if not isfile(featurePath):
    print("indexing images...")
    Steles = [join(stelePath, f) for f in listdir(stelePath)
              if isdir(join(stelePath, f))]
    for stele in Steles:
        imagePaths = [join(stele, f) for f in listdir(stele)
                      if isfile(join(stele, f))]
        for path in imagePaths:
            image_paths.append(path)
            # The label is encoded in the file name between the last "_" and the extension
            labels.append(path[(path.rfind("_") + 1):path.rfind(".")])

    featureExtractor = FeatureExtractor()
    features = []
    print("computing features...")
    for idx, (batch_images, _) in enumerate(batchGenerator(image_paths, labels,
                                                           batch_size)):
        print("{}/{}".format((idx + 1) * batch_size, len(labels)))
        features_ = featureExtractor.get_features(batch_images)
        features.append(features_)
    features = np.vstack(features)
    labels = np.asarray(labels)

    print("saving features...")
    np.save(featurePath, features)
    np.save(labelsPath, labels)
else:
    print("loading precomputed features and labels from {} and {}".format(
        featurePath, labelsPath))
from NeuralNetwork import NeuralNetwork
from featureExtractor import FeatureExtractor
import numpy as np
from DataLoader import DataLoader
import configure

# best results so far, as (lr, ne, nhn, nev) -> accuracy:
# (0.05, 60, 30, 30)   -> 0.5428571428571428
# (0.1, 50, 60, 30)    -> 0.5714285714285714
# (0.1, 300, 100, 30)  -> 0.6

fe = FeatureExtractor("generatedData/eigenfaces.csv",
                      "generatedData/average_face.csv")
best = 0.0
for lr in range(4, 7, 1):
    for ne in range(380, 421, 5):
        for nhn in range(40, 91, 10):
            for nev in range(50, 91, 10):
                configure.setUpConfig(lr / 100, ne, nhn, nev)

                # prepare data for training:
                dl = DataLoader(configure.config_global.modeTrain)
                dl.load_all_images()
                datasetTrain = fe.generate_dataset(dl.images)

                # train the NN:
                nn = NeuralNetwork(configure.config_global.noOfEigenValues,
                                   configure.config_global.noOfHidNeur)
                nn.trainNetwork(datasetTrain)
class ApproximateQLearningAgent(Player):
    def __init__(self, name, epsilon=0.05, gamma=0.8, alpha=0.2, numTraining=0):
        self.name = name
        self.cards = []  # (cardValue, cardElement)
        self.accumulatedCards = {"Fire": 0, "Water": 0, "Ice": 0}
        self.playedCard = None
        self.args = {}
        self.args['epsilon'] = epsilon
        self.args['gamma'] = gamma
        self.args['alpha'] = alpha
        self.args['numTraining'] = numTraining
        self.weights = Counter()
        # self.weights["enemy-distance-to-closest-win"] = 1.3999995454998298e-06
        # self.weights["agent-distance-to-closest-win"] = 1.299999463999758e-06
        self.weights["enemy-distance-to-closest-win"] = -4.120535635213156
        self.weights["agent-distance-to-closest-win"] = 9.586679017815417
        self.weights["agent-went-closer-to-win"] = -0.9656494587969497
        self.weights["agent-can-block-enemy-advancement"] = 15.147299275663869
        self.featExtractor = FeatureExtractor()
        self.lastState = None
        self.lastAction = None
        self.lastScore = 0

    def resetForNewGame(self):
        self.cards = []
        self.accumulatedCards = {"Fire": 0, "Water": 0, "Ice": 0}
        self.playedCard = None
        self.lastState = None
        self.lastAction = None
        self.lastScore = 0

    def pickCard(self, card):
        # Plays the given card; it must be one of self.cards
        if card not in self.cards:
            print(card)
            print(self.cards)
            raise ValueError('Picked card not in current cards!')
        self.playedCard = card
        self.cards.remove(card)

    def getLegalActions(self, gameState):
        if self.name == gameState.p1.name:
            return gameState.p1.cards
        else:
            return gameState.p2.cards

    def getQValue(self, gameState, action):
        """
        Returns Q(gameState, action) = w * featureVector
        where * is the dot product operator
        """
        result = 0
        features = self.featExtractor.getFeatures(gameState, action, self.name)
        for feature in features:
            result += features[feature] * self.weights[feature]
        return result

    def flipCoin(self, prob):
        return random.random() < prob

    def computeActionFromQValues(self, gameState):
        # Greedy policy: return the legal action with the highest Q-value
        # (consistent with the max used in update below)
        actions = self.getLegalActions(gameState)
        if not actions:
            return None
        max_action = None
        max_q_val = float("-inf")
        for a in actions:
            q_val = self.getQValue(gameState, a)
            if q_val > max_q_val:
                max_q_val = q_val
                max_action = a
        if max_action is None:
            return random.choice(actions)
        return max_action

    def doAction(self, gameState):
        legalActions = self.getLegalActions(gameState)
        # Epsilon-greedy: explore with probability epsilon, otherwise exploit
        if self.flipCoin(self.args["epsilon"]):
            action = random.choice(legalActions)
        else:
            action = self.computeActionFromQValues(gameState)
        self.lastState = copy.deepcopy(gameState)
        self.lastAction = copy.deepcopy(action)
        return action

    def update(self, gameState, score):
        """
        Updates the weights based on the observed transition
        """
        if self.lastState is not None:
            state, action = self.lastState, self.lastAction
            nextState, deltaReward = gameState, score - self.lastScore
            actions = self.getLegalActions(nextState)
            max_qval_action = (float("-inf"), None)
            if not actions:
                max_qval_action = (0, None)
            else:
                # Go through all the actions to find the max Q-value
                for a in actions:
                    q_val = self.getQValue(nextState, a)
                    max_qval_action = max(max_qval_action, (q_val, a),
                                          key=lambda x: x[0])
            difference = (deltaReward
                          + self.args['gamma'] * max_qval_action[0]
                          - self.getQValue(state, action))
            features = self.featExtractor.getFeatures(state, action, self.name)
            for feature in features:
                self.weights[feature] += self.args['alpha'] * difference * features[feature]
        self.lastScore = score

    def printEpisodeInfo(self):
        print(bcolors.OKBLUE + "AQL score:", str(self.lastScore))
        print(bcolors.OKBLUE + "AQL accumulated cards:", str(self.accumulatedCards))
        print(bcolors.ENDC)
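# --- Illustrative sketch (not part of the original agent) ---
# The weight update above is standard approximate Q-learning:
#   Q(s, a) = w . f(s, a)
#   w_f <- w_f + alpha * (r + gamma * max_a' Q(s', a') - Q(s, a)) * f(s, a)
# A minimal, self-contained version with plain dicts for weights and features:
def q_value(weights, features):
    return sum(weights.get(f, 0.0) * v for f, v in features.items())

def update_weights(weights, features, reward, max_next_q, q_sa,
                   gamma=0.8, alpha=0.2):
    difference = reward + gamma * max_next_q - q_sa
    for f, v in features.items():
        weights[f] = weights.get(f, 0.0) + alpha * difference * v
    return weights

# One update starting from zero weights:
w = update_weights({}, {"agent-distance-to-closest-win": 1.0},
                   reward=1.0, max_next_q=0.0, q_sa=0.0)
assert abs(w["agent-distance-to-closest-win"] - 0.2) < 1e-12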
def main():
    """ Preprocesses, extracts, learns, tests """
    # Process flags
    do_retrain, do_rebuildValidation, do_test = False, False, False
    for arg in sys.argv[1:]:
        if "--retrain" in arg and "yes" in arg:
            do_retrain = True
        if "--rebuildValidation" in arg and "yes" in arg:
            do_rebuildValidation = True
        if "--test" in arg and "yes" in arg:
            do_test = True

    # preprocessing
    do = DataOrganizer()

    # __________________________________ TRAINING ________________________ #
    # use BoG to convert to frequency vectors
    fe = FeatureExtractor(FeatureExtractor.ModelType.BagOfClusters)
    clf = None
    clf_file = ""

    # get the latest trained model
    filenames = os.listdir("models/")
    if len(filenames) > 0:
        clf_file = "models/" + filenames[-1]
    else:
        clf_file = None

    # get sets of tweets as training data
    # trainData0, trainData1, validation0, validation1 \
    #     = do.organizeTrainWithValidation("data/trainValidate/", do_rebuildValidation)
    trainData0, trainData1 = do.organizeTrain("data/train/")

    if do_retrain or not clf_file:
        # split training set into validation and training set
        X0, X1 = fe.extractTrainFeatureVectors((trainData0, trainData1))
        clf = learn(X0, X1)
        millis = int(round(time.time() * 1000))
        clf_file = "trainedModel" + str(millis)
        print "Saving model to file..."
        joblib.dump(clf, "models/" + clf_file, compress=1)
    else:
        print "Using trained model and BoG..."
        fe.bog = BagOfWords()
        fe.bog.getLatestBoG()
        clf = joblib.load(clf_file)

    # we're either validating or testing based on the passed flag
    # ____________________________________VALIDATION__________________________#
    if not do_test:
        # feed in the validation sets as one set
        validationData = do.organizeTest("data/validation/")
        validationFeatures, validationLabels = fe.extractTestFeatureVectors(
            validationData)
        test("Validation", clf, validationFeatures, validationLabels)
    else:
        # ____________________________________TESTING _______________________ #
        # extract test features and test
        print "Using testing"
        testData, testLabels = do.organizeTest("data/test/")
        testFeatures = fe.extractTestFeatureVectors(testData)
        test("Testing, Global Protests With Background Subtraction", clf,
             testFeatures, testLabels)
def registrate(drone_img_ori, pcl_img_ori, mask_image, args):
    common_args = {
        'pcl_mask': args.pcl_mask,
        'drone_mask': args.drone_mask,
        'save_masked_pcl': args.save_masked_pcl,
        'save_masked_drone': args.save_masked_drone,
        'save_keypoints': args.save_keypoints,
        'save_csv': args.save_csv,
        'save_matching': args.save_matching
    }
    result = {}

    # Preprocess images
    img_preprocessor = Preprocessor(drone_img_ori, pcl_img_ori, mask_image)
    img_preprocessor.preprocessing()
    imgs = img_preprocessor.get_processed_imgs()
    processed_drone_img = imgs['processed_drone_img']
    processed_pcl_img = imgs['processed_pcl_img']
    processed_drone_mask = imgs['processed_drone_mask']
    processed_pcl_mask = imgs['processed_pcl_mask']
    masked_drone_img = imgs['masked_drone_img']
    masked_pcl_img = imgs['masked_pcl_img']

    if common_args['save_masked_pcl']:
        result.update({'masked_pcl': masked_pcl_img})
    if common_args['save_masked_drone']:
        result.update({'masked_drone': masked_drone_img})

    # Extract SIFT features, optionally restricted to the masks
    drone_feature_extractor = FeatureExtractor(processed_drone_img, "SIFT", args)
    pcl_feature_extractor = FeatureExtractor(processed_pcl_img, "SIFT", args)
    if common_args['pcl_mask']:
        print("pcl_mask: True")
        pcl_feature_extractor.compute(mask=processed_pcl_mask)
    else:
        print("No pcl_mask")
        pcl_feature_extractor.compute(mask=None)
    if common_args['drone_mask']:
        print("drone_mask: True")
        drone_feature_extractor.compute(mask=processed_drone_mask)
    else:
        print('No drone_mask')
        drone_feature_extractor.compute(mask=None)
    drone_features, drone_descs = drone_feature_extractor.get_features_and_descriptors()
    pcl_features, pcl_descs = pcl_feature_extractor.get_features_and_descriptors()

    if common_args['save_keypoints']:
        keypoints_lidar = cv2.drawKeypoints(
            pcl_img_ori, pcl_features, outImage=np.array([]), color=(0, 0, 255),
            flags=cv2.DRAW_MATCHES_FLAGS_DRAW_RICH_KEYPOINTS)
        keypoints_drone = cv2.drawKeypoints(
            drone_img_ori, drone_features, outImage=np.array([]), color=(0, 0, 255),
            flags=cv2.DRAW_MATCHES_FLAGS_DRAW_RICH_KEYPOINTS)
        result.update({'keypoints_lidar_image': keypoints_lidar})
        result.update({'keypoints_drone_image': keypoints_drone})

    # Find matching descriptors
    matcher = Matcher(drone_features, drone_descs, pcl_features, pcl_descs, args)
    matcher.extract_match()
    raw_matchs = matcher.get_matchs()
    good_matchs = matcher.get_good_matchs()

    # Find the homography between the two views
    homography, status = find_homography(drone_features, pcl_features,
                                         good_matchs, args)

    if common_args['save_csv']:
        result.update({'drone_total_keypoints': len(drone_features)})
        result.update({'pcl_total_keypoints': len(pcl_features)})
        result.update({'num_inliers': (status.ravel().astype(int) == 1).sum()})
        result.update({'num_raw_matches': len(raw_matchs)})
        result.update({'num_good_matches': len(good_matchs)})
        result.update({'homography': homography})

    if common_args['save_matching']:
        matching1 = matcher.draw_matches(processed_drone_img, processed_pcl_img,
                                         status, homography)
        matching2 = matcher.draw_matches(processed_drone_img, processed_pcl_img,
                                         None, homography)
        matching3 = matcher.draw_matches(processed_drone_img, processed_pcl_img)
        result.update({'matching1': matching1})
        result.update({'matching2': matching2})
        result.update({'matching3': matching3})

    # Warp the drone image into the point-cloud frame and blend the two
    registered_image = cv2.warpPerspective(
        drone_img_ori, homography,
        (processed_pcl_img.shape[1], processed_pcl_img.shape[0]))
    ret_image = cv2.add(registered_image,
                        cv2.cvtColor(processed_pcl_img, cv2.COLOR_GRAY2BGR))
    result.update({'image': ret_image})
    return result
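# --- Illustrative sketch (not part of the original pipeline) ---
# find_homography is defined elsewhere; presumably it wraps OpenCV's
# RANSAC homography estimation over the matched keypoints, roughly:
import cv2
import numpy as np

def find_homography_sketch(src_kps, dst_kps, matches, ransac_thresh=5.0):
    # Gather matched keypoint coordinates into Nx1x2 float32 arrays.
    src_pts = np.float32([src_kps[m.queryIdx].pt for m in matches]).reshape(-1, 1, 2)
    dst_pts = np.float32([dst_kps[m.trainIdx].pt for m in matches]).reshape(-1, 1, 2)
    # RANSAC rejects outlier matches; `status` marks inliers with 1,
    # which matches how `status` is consumed in the save_csv branch above.
    return cv2.findHomography(src_pts, dst_pts, cv2.RANSAC, ransac_thresh)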
    return np.mean((predicted - expected) ** 2)  # mean squared error


if __name__ == "__main__":
    vp = ViralityPrediction(normalize=True, balance=True, tweet_threshold=50000,
                            score=False, dump_model=False)
    hashtagIndex = HashtagIndex()
    virality = {}
    hashtags_features = {}
    hashtags_virality = {}
    hashtags = [k for (k, v) in hashtagIndex.items(sort=True, descending=True,
                                                   min_values=100)]

    print "Extracting features..."
    for hashtag in hashtags:
        _, featureList, vir = FeatureExtractor.loadFromDB(
            tweets_id=hashtagIndex.find(hashtag))
        hashtags_features[hashtag] = featureList
        hashtags_virality[hashtag] = vir
        virality[hashtag] = sum(np.array(vir)[:, 0])

    # Sort predictions by value and print the top-K results
    predictions = vp.predict(hashtags_features)
    sorted_predictions = sorted(predictions.items(),
                                key=operator.itemgetter(1), reverse=True)
    print "\nTop " + str(ViralityPrediction.K) + " virality predictions:"
    for i in range(0, min(ViralityPrediction.K, len(sorted_predictions))):
        print sorted_predictions[i]

    # Sort expected virality by value and print the top-K results
    # sorted_virality = sorted(virality.items(), key=operator.itemgetter(1), reverse=True)
    # print "\nTop " + str(ViralityPrediction.K) + " virality expectations:"
    # for i in range(0, min(ViralityPrediction.K, len(sorted_virality))):
        avg = float(sum(relavant_vals)) / len(points)
        if avg > 0:
            res[dimension] = avg
    return res


examples = []
laterexamples = []
vectorX = []
with open("Period 5 Rand.csv", 'rb') as file_reader:
    reader = csv.reader(file_reader, delimiter=",")
    counter = 0
    for line in reader:
        if counter == 5000:
            break  # cap the input to keep the runtime manageable
        counter += 1
        vectorX.append(line[1])
        laterObj = (line[0], FeatureExtractor(line[1]).featureVector())
        examples.append(laterObj[1])
        laterexamples.append(laterObj)

extractor = sklearn.feature_extraction.text.CountVectorizer(
    input='content', ngram_range=(2, 3), max_df=.7, stop_words='english')
X = extractor.fit_transform(vectorX)

distArr = []
inerArr = []
for k in range(1, 20):
    clusterer = sklearn.cluster.KMeans(n_clusters=k)
    res = clusterer.fit(X)
    print "Inertia with %d clusters is %d" % (k, clusterer.inertia_)
    inerArr.append(clusterer.inertia_)

# build the dict-style clusters so I can use my same distortion function
# consistently across them
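# --- Illustrative sketch (not in the original script) ---
# Plotting the inertia values collected above against k makes the k-means
# "elbow" visible; assumes matplotlib is available.
import matplotlib.pyplot as plt

plt.plot(range(1, len(inerArr) + 1), inerArr, marker="o")
plt.xlabel("number of clusters k")
plt.ylabel("inertia")
plt.title("K-means elbow curve")
plt.show()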
def createDataset(self, pathRaw, pathLabels, pathVectors, pathCorrespondence):
    print("Loading spacy")
    nlp = spacy.load('en_core_web_md')
    print("loaded")

    labelIdx = 0
    # minWords = 9999
    # maxWords = 0
    # avgWords = 0
    listFiles = os.listdir(pathRaw)
    nTexts = len(listFiles)
    fdOutLabels = open(pathLabels, "w")
    fdOutCorrespondence = open(pathCorrespondence, "w")

    for i, fname in enumerate(listFiles):
        # The label is the second "_"-separated field of the file name
        label = fname.split("_")[1]
        if label not in self.labelDict:
            self.labelDict[label] = labelIdx
            labelIdx += 1
        numeric_label = self.labelDict[label]
        fdOutLabels.write(str(numeric_label) + "\n")

        fd = open(pathRaw + fname, "r")
        raw = fd.read()
        iF = FeatureExtractor(raw, nlp)

        # FIRST SENTENCE ONLY for now
        instanceVectors = []
        for wordDict in iF.features[0]:
            instanceVectors.append(wordDict["vector"])
            # only include up to maxLen vectors
            if len(instanceVectors) == self.maxLen:
                break
        # Pad short sentences with zero vectors up to maxLen
        while len(instanceVectors) < self.maxLen:
            instanceVectors.append(list(np.zeros(268)))
        '''
        nWords = len(instanceVectors)
        if nWords > maxWords:
            maxWords = nWords
        if nWords < minWords:
            minWords = nWords
        avgWords += nWords
        '''
        self.dataset.append(instanceVectors)
        fd.close()
        print(i, "of", nTexts)

    fdOutLabels.close()
    self.dataset = np.array(self.dataset)
    # save feature vectors per text
    np.save(pathVectors, self.dataset)
    fdOutCorrespondence.write(str(self.labelDict))
    fdOutCorrespondence.close()
def __init__(self, corpdb=fwc.DEF_CORPDB, corptable=fwc.DEF_CORPTABLE,
             correl_field=fwc.DEF_CORREL_FIELD, mysql_host="localhost",
             message_field=fwc.DEF_MESSAGE_FIELD,
             messageid_field=fwc.DEF_MESSAGEID_FIELD,
             encoding=fwc.DEF_ENCODING, use_unicode=fwc.DEF_UNICODE_SWITCH,
             lexicondb=fwc.DEF_LEXICON_DB, featureTable=fwc.DEF_FEAT_TABLE,
             featNames=fwc.DEF_FEAT_NAMES, date_field=fwc.DEF_DATE_FIELD,
             outcome_table=fwc.DEF_OUTCOME_TABLE,
             outcome_value_fields=[fwc.DEF_OUTCOME_FIELD],
             outcome_controls=fwc.DEF_OUTCOME_CONTROLS,
             outcome_interaction=fwc.DEF_OUTCOME_CONTROLS,
             group_freq_thresh=None, featureMappingTable='',
             featureMappingLex='', output_name='', wordTable=None,
             model=fwc.DEF_MODEL, feature_selection='',
             feature_selection_string='', init=None):
    if feature_selection_string or feature_selection:
        RegressionPredictor.featureSelectionString = \
            feature_selection if feature_selection else feature_selection_string

    if init:
        # Instantiate only the helpers named in `init`
        self.fw = FeatureWorker(corpdb, corptable, correl_field, mysql_host,
                                message_field, messageid_field, encoding,
                                use_unicode, lexicondb, date_field,
                                wordTable) if 'fw' in init else None
        self.fg = FeatureGetter(corpdb, corptable, correl_field, mysql_host,
                                message_field, messageid_field, encoding,
                                use_unicode, lexicondb, featureTable,
                                featNames, wordTable) if 'fg' in init else None
        self.fe = FeatureExtractor(corpdb, corptable, correl_field, mysql_host,
                                   message_field, messageid_field, encoding,
                                   use_unicode, lexicondb,
                                   wordTable=wordTable) if 'fe' in init else None
        self.fr = FeatureRefiner(corpdb, corptable, correl_field, mysql_host,
                                 message_field, messageid_field, encoding,
                                 use_unicode, lexicondb, featureTable,
                                 featNames, wordTable) if 'fr' in init else None
        self.og = OutcomeGetter(corpdb, corptable, correl_field, mysql_host,
                                message_field, messageid_field, encoding,
                                use_unicode, lexicondb, outcome_table,
                                outcome_value_fields, outcome_controls,
                                outcome_interaction, group_freq_thresh,
                                featureMappingTable, featureMappingLex,
                                wordTable) if 'og' in init else None
        self.oa = OutcomeAnalyzer(corpdb, corptable, correl_field, mysql_host,
                                  message_field, messageid_field, encoding,
                                  use_unicode, lexicondb, outcome_table,
                                  outcome_value_fields, outcome_controls,
                                  outcome_interaction, group_freq_thresh,
                                  featureMappingTable, featureMappingLex,
                                  output_name, wordTable) if 'oa' in init else None
        self.rp = RegressionPredictor(self.og, self.fg,
                                      model) if 'rp' in init else None
        self.cp = ClassifyPredictor(self.og, self.fg,
                                    model) if 'cp' in init else None
    else:
        # Instantiate everything
        self.fw = FeatureWorker(corpdb, corptable, correl_field, mysql_host,
                                message_field, messageid_field, encoding,
                                use_unicode, lexicondb, date_field, wordTable)
        self.fg = FeatureGetter(corpdb, corptable, correl_field, mysql_host,
                                message_field, messageid_field, encoding,
                                use_unicode, lexicondb, featureTable,
                                featNames, wordTable)
        self.fe = FeatureExtractor(corpdb, corptable, correl_field, mysql_host,
                                   message_field, messageid_field, encoding,
                                   use_unicode, lexicondb, wordTable=wordTable)
        self.fr = FeatureRefiner(corpdb, corptable, correl_field, mysql_host,
                                 message_field, messageid_field, encoding,
                                 use_unicode, lexicondb, featureTable,
                                 featNames, wordTable)
        self.og = OutcomeGetter(corpdb, corptable, correl_field, mysql_host,
                                message_field, messageid_field, encoding,
                                use_unicode, lexicondb, outcome_table,
                                outcome_value_fields, outcome_controls,
                                outcome_interaction, group_freq_thresh,
                                featureMappingTable, featureMappingLex,
                                wordTable)
        self.oa = OutcomeAnalyzer(corpdb, corptable, correl_field, mysql_host,
                                  message_field, messageid_field, encoding,
                                  use_unicode, lexicondb, outcome_table,
                                  outcome_value_fields, outcome_controls,
                                  outcome_interaction, group_freq_thresh,
                                  featureMappingTable, featureMappingLex,
                                  output_name, wordTable)
        self.rp = RegressionPredictor(self.og, self.fg, model)
        self.cp = ClassifyPredictor(self.og, self.fg, model)

    self.allFW = {
        "FeatureWorker": self.fw,
        "FeatureGetter": self.fg,
        "FeatureExtractor": self.fe,
        "FeatureRefiner": self.fr,
        "OutcomeGetter": self.og,
        "OutcomeAnalyzer": self.oa,
        "RegressionPredictor": self.rp,
        "ClassifyPredictor": self.cp,
    }
class Trip:
    def __init__(self, vehicle):
        print("Trip Starts")
        self.vehicle = vehicle
        self.timer = 0
        self.data = []
        self.featureExtractor = FeatureExtractor()
        self.tripNo = 1

    def end_trip(self, status, vehicles):
        ## Calculate the score
        score = self.getScore()
        print("Score", score)
        if status == "successful":
            ## Write the trip data to a file
            np.save("trips/trip" + str(int(time.time())), self.data)
        if status == "collided":
            ## Write the trip data to a file, flagged as collided
            np.save("trips/trip" + str(int(time.time())) + "_collided", self.data)
        self.reset_trip(vehicles)
        return score

    def reset_trip(self, vehicles):
        print("Previous Trip was for ", self.timer)
        self.timer = 0
        self.tripNo += 1
        for vehicle in vehicles:
            vehicle.reset()
        self.data = []

    def getblob(self, x, y, w, h):
        buffer = (GLubyte * (3 * w * h))(0)
        glReadPixels(x, y, w, h, GL_RGB, GL_UNSIGNED_BYTE, buffer)
        # Use PIL to convert the raw RGB buffer and flip it the right way up
        image = Image.frombytes(mode="RGB", size=(w, h), data=buffer)
        image = image.transpose(Image.FLIP_TOP_BOTTOM)
        img = np.array(image)
        return img

    def getFeatures(self, vehicles):
        blob = self.getblob(0, 0, 700, 850)
        return self.featureExtractor.sendFeaturesFrom(self.vehicle, blob, vehicles)

    def getScore(self):
        # Score = weighted velocity variance plus a collision penalty
        alpha1 = 0.1
        alpha2 = 0.1
        velocitiesX = []
        velocitiesY = []
        number_of_collisions = 0
        for dat in self.data:
            velocitiesX.append(dat[0])
            velocitiesY.append(dat[1])
            if dat[-2] != 0:
                number_of_collisions += 1
        return (alpha1 * (np.var(velocitiesX) + np.var(velocitiesY)) / 2
                + alpha2 * number_of_collisions)

    def check_trip_status_update(self, road, vehicles, control):
        ## If the vehicle has finished its movement or gone off screen,
        ## end the trip and reset it
        is_in_collision = 0
        new_trip = False
        if self.out_of_road(road):
            print("Out of the Roads")
            score = self.end_trip("unsuccessful", vehicles)
            new_trip = True
        elif self.to_be_collided(vehicles, road):
            ## TODO: change the color of the car or something similar
            print("Collision")
            score = self.end_trip("collided", vehicles)
            new_trip = True
            is_in_collision = 1
        elif self.completed_trip():
            print("Completed Trip")
            score = self.end_trip("successful", vehicles)
            new_trip = True

        features = self.getFeatures(vehicles)
        if not new_trip:
            score = self.getScore()
        features.append(score)
        features.append(is_in_collision)
        features.append(control)
        self.data.append(features)
        self.update_trip()
        return new_trip, score

    def completed_trip(self):
        return (self.vehicle.pos[0] > 10 and self.vehicle.pos[1] > -10
                and self.vehicle.pos[1] < 10)

    def to_be_collided(self, vehicles, road):
        # Axis-aligned bounding-box overlap against every other vehicle,
        # with a one-unit safety margin on the y axis
        ret = False
        for vehicle in vehicles:
            if vehicle.idno == self.vehicle.idno:
                continue
            v1_bottomx = self.vehicle.pos[0] - self.vehicle.size[0] / 2
            v1_bottomy = self.vehicle.pos[1] - self.vehicle.size[1] / 2
            v1_topx = self.vehicle.pos[0] + self.vehicle.size[0] / 2
            v1_topy = self.vehicle.pos[1] + self.vehicle.size[1] / 2
            v2_bottomx = vehicle.pos[0] - vehicle.size[0] / 2
            v2_bottomy = vehicle.pos[1] - vehicle.size[1] / 2
            v2_topx = vehicle.pos[0] + vehicle.size[0] / 2
            v2_topy = vehicle.pos[1] + vehicle.size[1] / 2
            if not ((v1_topx < v2_bottomx or v2_topx < v1_bottomx)
                    or (v1_topy + 1 < v2_bottomy or v2_topy + 1 < v1_bottomy)):
                ret = True
        return ret

    def out_of_road(self, road):
        # The vehicle is off the road only if it lies outside every segment
        ret = True
        for roadSegment in road.getBasicRoad():
            left_bottom = [roadSegment.x, roadSegment.y]
            right_top = [roadSegment.x + roadSegment.width,
                         roadSegment.y + roadSegment.height]
            vehicle_posX = self.vehicle.pos[0]
            vehicle_posY = self.vehicle.pos[1]
            outside = ((vehicle_posX < left_bottom[0] or vehicle_posX > right_top[0])
                       or (vehicle_posY < left_bottom[1] or vehicle_posY > right_top[1]))
            ret = ret and outside
        return ret

    def update_trip(self):
        self.timer += 1
## Adding the target vehicle
vehicles.append(
    Vehicle(4, (1, 2, 1), (-5, -40, 0), (1, 1, 0), 0, [0, 0.2, 0], False,
            "manual" if path_mode == 0 else "physics"))

actions = Actions()
road = Road()
time_units = 0
trip = Trip(vehicles[-1])

# Both branches currently use the manual commander; a physics-based
# commander is stubbed out below.
if path_constraints[path_mode] == "FREE":
    commander = nn_manual_commander()
else:
    commander = nn_manual_commander()
    # commander = physics_commander()

featureExtractor = FeatureExtractor()


def init_vehicles():
    ## Set of vehicles moving on the road.
    global level
    global trafficInjection
    global vehicles
    global time_units
    global trip
    global path_mode
    global commander

    if level == 0:
        trafficInjection = UniformTrafficInjection()
    else:
        trafficInjection = RandomTrafficInjection()
    imagePaths = [join(inputPath, f) for f in listdir(inputPath)
                  if isfile(join(inputPath, f))]
else:
    imagePaths = [inputPath]

print("loading images...")
Images = loadBatch(imagePaths)

print("loading SVM model...")
clf = joblib.load(svmPath)

print("Extracting features, this may take a while for large collections of images...")
extractor = FeatureExtractor()
features = extractor.get_features(Images)

# Unwrap a GridSearchCV if that is what was saved
classes = clf.best_estimator_.classes_ if hasattr(clf, "best_estimator_") else clf.classes_

print("Predicting the Hieroglyph type...")
prob = np.array(clf.predict_proba(features))
top5_i = np.argsort(-prob)[:, 0:5]
top5_s = np.array([prob[row, top5_i[row]] for row, top5_i_row in enumerate(top5_i)])
top5_n = classes[top5_i]

print("{:<25} ::: {}".format("image name", "top 5 best matching hieroglyphs"))
for idx, path in enumerate(imagePaths):
    print("{:<25} --> {}".format(os.path.basename(path), top5_n[idx]))
else:
    featureSettings['polarity'] = set(i for i in args.polarityFeatures.split(','))
# elif args.featuresFile:
#     features = set(args.featuresFile.read().splitlines())
# else:
#     features = {}
# print(features)

if args.markersFile:
    featureSettings['markersFile'] = args.markersFile
    # hiddenFileName += '_markers'

print(featureSettings)
fe = FeatureExtractor(**featureSettings)

os.mkdir(args.session)
os.chdir(args.session)

if not args.dataDir:
    dataDir = '/local/nlp/chidey/social_meaning/aclImdb/'
else:
    dataDir = args.dataDir

main(dataDir, fe, '{}_{}'.format(args.session, 0), args.numtrain)

# Template fields: 2 = train/validate/test file, 3 = session, 4 = iteration
learnCommandTemplate = ('/local/nlp/chidey/social_meaning/yessenalina/'
                        'sle_movieReviews/bin/svm_sle_learn -v 3 -c {0} -l {1} '
                        '{2}_{3}_{4} hidden_vars_{3}_{4}_{2} model_{3}_{4}').format(
    args.c, args.l, '{0}', args.session, '{1}')
def model_training(self):
    # Check if the feature file is present; if so, there is no need to
    # recompute the features.
    # The pre-computed features can also be downloaded from
    # http://iamai.nl/downloads/features.npy
    if not isfile(self.featurePath):
        print("indexing images...")
        Steles = [join(self.stelePath, f) for f in listdir(self.stelePath)
                  if isdir(join(self.stelePath, f))]
        for stele in Steles:
            imagePaths = [join(stele, f) for f in listdir(stele)
                          if isfile(join(stele, f))]
            for path in imagePaths:
                self.image_paths.append(path)
                self.labels.append(path[(path.rfind("_") + 1):path.rfind(".")])

        featureExtractor = FeatureExtractor()
        features = []
        print("computing features...")
        for idx, (batch_images, _) in enumerate(
                batchGenerator(self.image_paths, self.labels, self.batch_size)):
            print("{}/{}".format((idx + 1) * self.batch_size, len(self.labels)))
            features_ = featureExtractor.get_features(batch_images)
            features.append(features_)
        features = np.vstack(features)
        labels = np.asarray(self.labels)

        print("saving features...")
        np.save(self.featurePath, features)
        np.save(self.labelsPath, labels)
    else:
        print("loading precomputed features and labels from {} and {}".format(
            self.featurePath, self.labelsPath))
        features = np.load(self.featurePath)
        labels = np.load(self.labelsPath)

    # On to the SVM training phase: remove the UNKNOWN class from the database
    tobeDeleted = np.nonzero(labels == "UNKNOWN")
    features = np.delete(features, tobeDeleted, 0)
    labels = np.delete(labels, tobeDeleted, 0)
    numImages = len(labels)
    trainSet, testSet, trainLabels, testLabels = train_test_split(
        features, labels, test_size=0.20, random_state=42)

    # Training the classifier. Feel free to use a linear SVM (or another
    # classifier for that matter) for faster training; however, that will not
    # give the confidence scores that can be used to rank hieroglyphs.
    print("training SVM...")
    if 0:
        # Optional: either train one classifier fast, or search through the
        # parameter space with multiple classifiers to squeeze out that extra 2%
        clf = linear_model.LogisticRegression(C=10000)
    else:
        svr = linear_model.LogisticRegression()
        parameters = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]}
        clf = GridSearchCV(svr, parameters, n_jobs=8)

    clf.fit(trainSet, trainLabels)
    print(clf)
    print("finished training! saving...")
    joblib.dump(clf, self.svmPath, compress=1)

    prediction = clf.predict(testSet)
    accuracy = np.sum(testLabels == prediction) / float(len(prediction))
    # for idx, pred in enumerate(prediction):
    #     print("%-5s --> %s" % (testLabels[idx], pred))
    print("accuracy = {}%".format(accuracy * 100))
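# --- Illustrative sketch (not part of the original class) ---
# Reloading the dumped classifier for inference; the path and the feature
# dimensionality below are placeholders, and the GridSearchCV unwrapping
# mirrors the prediction script earlier in this section.
import joblib
import numpy as np

clf = joblib.load("svm.pkl")  # hypothetical path; use the real svmPath
est = clf.best_estimator_ if hasattr(clf, "best_estimator_") else clf
prob = est.predict_proba(np.zeros((1, 2048)))  # 2048 is an assumed feature dim
top5 = est.classes_[np.argsort(-prob)[:, :5]]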
    def predict(self, features):
        ''' Predict importances '''
        features = self.norm.transform(features[:, 0:self.n_comp])
        results = self.svr.predict(features)
        results = self.std_scaler_i.inverse_transform(results)
        return results


if __name__ == '__main__':
    import corpus, featureExtractor
    from featureExtractor import FeatureExtractor

    print 'Loading corpus ...'
    corpus = corpus.TwitterCorpus()
    tweets = corpus.all_tweets()
    importances = np.array([featureExtractor.tweet_importance(t) for t in tweets])

    # Try to load cached feature vectors; recompute and cache them on a miss
    try:
        v = joblib.load('data/cache/vectors.joblib')
    except:
        print 'FeatureExtractor fit transform ...'
        feat = FeatureExtractor()
        v = feat.train(tweets, importances)
        joblib.dump(v, 'data/cache/vectors.joblib')

    print 'HotTweets train ...'
    ht = HotTweets()
    ht.train(v[0:1000], importances[0:1000])
class StanceDetector:
    def __init__(self, n):
        self.data = DataManager('../data/train.csv', '../data/test.csv', n)
        self.fe = FeatureExtractor(self.data)
        self.eval = Evaluate()

    def buildBaseline(self, model):
        print 'Training baseline', model
        feats = ['words']
        y_attribute = 'stance'
        X, y = self.fe.getFeaturesMatrix('train', feats, y_attribute)
        X_test, y_true = self.fe.getFeaturesMatrix('test', feats, y_attribute)
        for mode in ['simple', 'tfidf']:
            if model == 'bayes':
                cl = MultinomialNB()
            elif model == 'svm':
                cl = LinearSVC()
            if mode == 'tfidf':
                cl = Pipeline([('tfidf', TfidfTransformer()),
                               ('clf', cl)])
            clf = cl.fit(X, y)
            y_pred = clf.predict(X_test)
            print mode, accuracy_score(y_true, y_pred)
            pprint(self.eval.computeFscores(self.data.testTweets,
                                            self.fe.labelenc.inverse_transform(y_pred)))

    def buildSimple(self, model):
        feats = ['topicVecs', 'words2vec']
        print feats
        y_attribute = 'stance'
        X, y = self.fe.getFeaturesMatrix('train', feats, y_attribute)
        X_test, y_true = self.fe.getFeaturesMatrix('test', feats, y_attribute)
        for mode in ['simple']:  # , 'tfidf']:
            if model == 'bayes':
                cl = MultinomialNB()
            elif model == 'svm':
                cl = LinearSVC()
                cl = GridSearchCV(cl, self.getGridSearchParams())
            if mode == 'tfidf':
                cl = Pipeline([('tfidf', TfidfTransformer()),
                               ('clf', cl)])
            clf = cl.fit(X, y)
            # print cl.best_params_
            y_pred = clf.predict(X_test)
            print mode, accuracy_score(y_true, y_pred)
            pprint(self.eval.computeFscores(self.data.testTweets,
                                            self.fe.labelenc.inverse_transform(y_pred)))

    # "train" in the name means these are helper functions
    def trainSVC(self, feats, y_attribute, proba=False):
        X, y = self.fe.getFeaturesMatrix('train', feats, y_attribute)
        X_test, y_true = self.fe.getFeaturesMatrix('test', feats, y_attribute)
        clf = SVC(probability=proba)
        clf = clf.fit(X, y)
        if proba:
            y_proba = clf.predict_proba(X_test)
            return clf, y_proba
        else:
            y_pr = clf.predict(X_test)
            return clf, y_pr

    def trainLinearSVC(self, feats, y_attribute, dec=False):
        X, y = self.fe.getFeaturesMatrix('train', feats, y_attribute)
        X_test, y_true = self.fe.getFeaturesMatrix('test', feats, y_attribute)
        clf = LinearSVC()
        clf = clf.fit(X, y)
        if dec:
            y_pr = clf.decision_function(X_test)
            return clf, y_pr
        else:
            y_pr = clf.predict(X_test)
            return clf, y_pr

    # TODO: revisit; check label-transform encodings of NONE, FAVOR, AGAINST
    # def buildTopicStanceSeparate(self):
    #     feats = ['words']
    #     y_attribute = 'stance'
    #     X_test, y_true = self.fe.getFeaturesMatrix('test', feats, y_attribute)
    #     # builds two separate classifiers, one for topic and one for stance
    #     topic_clf, y_topic_proba = self.trainLinearSVC(
    #         feats=['words', 'lexiconsbyword'], y_attribute='topic', dec=True)
    #     boost_factors = np.ones_like(y_true)
    #     # multiply by NONE (0) = 0, FAVOR (1) = 1, AGAINST (2) = 2
    #     # topic_preds holds the index of the max-probability class per sample
    #     topic_preds = np.argmax(y_topic_proba, axis=1)
    #     for ind, s in enumerate(y_topic_proba):
    #         prob = y_topic_proba[ind][topic_preds[ind]]
    #         if prob < 0.4:
    #             boost_factors[ind] = 0  # corresponds to NONE
    #     stance_clf, stance_pred = self.trainLinearSVC(
    #         feats=['words', 'lexiconsbyword', 'topic'], y_attribute='stance')
    #     stance_pred = np.multiply(stance_pred, boost_factors)
    #     stance_pred_labels = self.fe.labelenc.inverse_transform(stance_pred)
    #     score = accuracy_score(y_true, stance_pred)
    #     print score
    #     pprint(self.eval.computeFscores(self.data.testTweets, stance_pred_labels))

    def buildTopicOnlyMultiple(self):
        # one SVM for each topic
        feats = ['words2vec']
        y_attribute = 'stance'
        clf_topic = {}
        for topic in list(self.fe.topicenc.classes_):
            X, y = self.fe.getFeaturesMatrix('train', feats, y_attribute, topic)
            Xt, yt = self.fe.getFeaturesMatrix('test', feats, y_attribute, topic)
            clf = LinearSVC()
            clf = clf.fit(X, y)
            clf_topic[topic] = clf
            print topic, clf.score(Xt, yt)
        # Not useful: still less accurate than a single SVM, though not by much
        # X_whole, y_whole = self.fe.getFeaturesMatrix('train', feats, y_attribute)
        # Xt, yt = self.fe.getFeaturesMatrix('test', feats, y_attribute)
        # newX = []
        # newXt = []
        # for topic in clf_topic:
        #     newX.append(clf_topic[topic].transform(X_whole))
        #     newXt.append(clf_topic[topic].transform(Xt))
        # newX = np.concatenate(tuple(newX), axis=1)
        # newXt = np.concatenate(tuple(newXt), axis=1)
        # newclf = LinearSVC()
        # newclf = newclf.fit(newX, y_whole)
        # print newclf.score(newXt, yt)

    def trainTopicSVM(self, topic):
        feats = ['words2vec', 'clusteredLexicons', 'topic1hot']
        y_attribute = 'stance'
        X, y = self.fe.getFeaturesTopicNontopic('train', feats, y_attribute,
                                                topic=topic)
        X_test, y_true = self.fe.getFeaturesTopicNontopic('test', feats,
                                                          y_attribute, topic=topic)
        clf = LinearSVC()
        clf = GridSearchCV(clf, self.getGridSearchParams())
        clf = clf.fit(X, y)
        print clf.best_params_
        print topic  # , clf.score(X_test, y_true)
        return clf

    def buildTopicWise(self):
        # Separate SVC per topic; tests on that class only first, then on all
        topic_clf = {}
        feats = ['words2vec', 'clusteredLexicons', 'topic1hot']
        y_attribute = 'stance'
        X, y = self.fe.getFeaturesMatrix('train', feats, y_attribute)
        X_test, y_true = self.fe.getFeaturesMatrix('test', feats, y_attribute)
        # X matrix for a new classifier which uses this as its train matrix;
        # it has columns of each topic classifier's confidence function
        # X_fx = []
        # X_ftestx = []
        preds = []
        for topic in list(self.fe.topicenc.classes_):
            topic_clf[topic] = self.trainTopicSVM(topic)
            preds.append(topic_clf[topic].predict(X_test))
            # X_fx.append(topic_clf[topic].decision_function(X))
            # X_ftestx.append(topic_clf[topic].decision_function(X_test))
        allpreds = np.vstack(tuple(preds))
        topic1hot, temp = self.fe.getFeaturesMatrix('test', ['topic1hot'], 'stance')
        allpreds[allpreds == 5] = 1
        final_pred = np.multiply(topic1hot.T, allpreds)
        prediction = np.sum(final_pred, axis=0).astype(int)
        # X_fx = np.concatenate(tuple(X_fx), axis=1)
        # X_ftestx = np.concatenate(tuple(X_ftestx), axis=1)
        # clf = LinearSVC().fit(X_fx, y)
        # y_pred = clf.predict(X_ftestx)
        print accuracy_score(y_true, prediction)
        pprint(self.eval.computeFscores(self.data.testTweets,
                                        self.fe.labelenc.inverse_transform(prediction)))

    # GOOD: 66% accuracy; 1.2% increase after changing topic to one-hot
    def buildSVMWord2Vec(self):
        feats = ['words2vec', 'topic1hot']
        y_attribute = 'stance'
        X, y = self.fe.getFeaturesMatrix('train', feats, y_attribute)
        Xt, yt = self.fe.getFeaturesMatrix('test', feats, y_attribute)
        clf = LinearSVC(C=0.01, penalty='l1', dual=False)
        clf = clf.fit(X, y)
        y_pred = clf.predict(Xt)
        print clf.score(Xt, yt)
        pprint(self.eval.computeFscores(self.data.testTweets,
                                        self.fe.labelenc.inverse_transform(y_pred)))
    def buildSVMTrial(self):
        feats = ['topic1hot', 'words2vec']
        y_attribute = 'stance'
        X, y = self.fe.getFeaturesMatrix('train', feats, y_attribute)
        Xt, yt = self.fe.getFeaturesMatrix('test', feats, y_attribute)
        clf = LinearSVC(C=0.001)
        clf = clf.fit(X, y)
        y_pred = clf.predict(Xt)
        print clf.score(Xt, yt)
        pprint(self.eval.computeFscores(self.data.testTweets,
                                        self.fe.labelenc.inverse_transform(y_pred)))

    def buildTrial(self):
        # feats = ['pos', 'words2vec', 'clusteredLexicons', 'topic1hot']
        # 'givenSentiment', 'givenOpinion'
        feats = ['words2vec', 'pos', 'clusteredLexicons', 'top1grams', 'top2grams']
        y_attribute = 'stance'
        X, y = self.fe.getFeaturesMatrix('train', feats, y_attribute)
        Xt, yt = self.fe.getFeaturesMatrix('test', feats, y_attribute)
        # clf = DecisionTreeClassifier()
        # clf = LogisticRegression()
        clf = LinearSVC(C=1, class_weight='balanced', penalty='l1', dual=False)
        clf = clf.fit(X, y)
        y_pred = clf.predict(Xt)
        print len(np.where(y_pred == 0)[0]), len(np.where(y_pred == 1)[0]), \
            len(np.where(y_pred > 1)[0])
        print len(y_pred)
        print 'training accuracy', clf.score(X, y)
        print clf.score(Xt, yt)
        pprint(self.eval.computeFscores(self.data.testTweets,
                                        self.fe.labelenc.inverse_transform(y_pred)))

    def buildGithubSGDModel(self):
        # feats = ['words2vec', 'topic1hot', 'pos']
        y_attribute = 'stance'
        dataset = self.fe.getDataset('train')
        dataset2 = self.fe.getDataset('test')
        y_train = self.fe.getY('train', dataset, y_attribute)
        y_test = self.fe.getY('train', dataset2, y_attribute)  # note: 'train' flag looks suspicious here
        tfidf = TfidfVectorizer(ngram_range=(1, 2), max_df=1.0, min_df=1,
                                binary=True, norm='l2', use_idf=True,
                                smooth_idf=False, sublinear_tf=True,
                                encoding='latin1')
        X_train = tfidf.fit_transform(self.data.trainTweetsText)
        X_test = tfidf.transform(self.data.testTweetsText)
        tuned_parameters = {'alpha': [10 ** a for a in range(-12, 0)]}
        clf = GridSearchCV(SGDClassifier(loss='hinge', penalty='elasticnet',
                                         l1_ratio=0.75, n_iter=10, shuffle=True,
                                         verbose=False, n_jobs=4, average=False),
                           tuned_parameters, cv=10, scoring='f1_weighted')
        clf.fit(X_train, y_train)
        print clf.best_params_
        print("Grid scores on development set:")
        for params, mean_score, scores in clf.grid_scores_:
            print "%0.3f (+/-%0.03f) for %r" % (mean_score, scores.std() * 2, params)
        print classification_report(y_test, clf.predict(X_test))
        print clf.score(X_test, y_test)

    def getGridSearchParams(self):
        param_grid = [{'C': [0.001, 0.01, 0.1, 1],
                       'dual': [False, True],
                       'class_weight': ['balanced', None]}]
        return param_grid

    def getGridSearchParamsForXGBoost(self):
        param_grid = [{'n_estimators': [10, 20, 30, 40, 50],
                       'max_depth': [1, 2, 3, 4, 5]}]
        return param_grid

    def buildSVMWord2VecWithClusters(self):
        # feats = ['topic1hot']
        # feats = ['words2vec', 'top1grams', 'top2grams']
        # feats = ['words2vec', 'topic1hot', 'pos', 'clusteredLexicons', 'top2grams']
        feats = ['words2vec', 'clusteredLexicons', 'topic1hot', 'pos']
        y_attribute = 'stance'
        X, y = self.fe.getFeaturesMatrix('train', feats, y_attribute)
        print (X.shape)
        Xt, yt = self.fe.getFeaturesMatrix('test', feats, y_attribute)
        clf = LinearSVC(C=1, penalty='l1', dual=False)
        clf = clf.fit(X, y)
        y_pred = clf.predict(Xt)
        accuracy = clf.score(Xt, yt)
        fscores = self.eval.computeFscores(self.data.testTweets,
                                           self.fe.labelenc.inverse_transform(y_pred))
        return (accuracy, fscores['Macro'])

    def buildSVMWord2VecWithClustersGridSearch(self):
        feats = ['words2vec', 'topic1hot', 'pos', 'clusteredLexicons']
        y_attribute = 'stance'
        X, y = self.fe.getFeaturesMatrix('train', feats, y_attribute)
        Xt, yt = self.fe.getFeaturesMatrix('test', feats, y_attribute)
        svmclf = LinearSVC(C=0.01, penalty='l1', dual=False)
        clf = GridSearchCV(svmclf, self.getGridSearchParams())
        clf = clf.fit(X, y)
        print clf.best_params_
        y_pred = clf.predict(Xt)
        print clf.score(Xt, yt)
        pprint(self.eval.computeFscores(self.data.testTweets,
                                        self.fe.labelenc.inverse_transform(y_pred)))

    def trainStanceNone(self, feats):
        # feats = ['words2vec', 'topic1hot', 'pos']
        X, y = self.fe.getFeaturesStanceNone('train', feats)
        Xt, yt = self.fe.getFeaturesStanceNone('test', feats)
        svmclf = LinearSVC()
        stance_none_clf = GridSearchCV(svmclf, self.getGridSearchParams()).fit(X, y)
        pred = stance_none_clf.predict(Xt)
        print classification_report(yt, pred)
        return stance_none_clf

    def trainFavorAgainst(self, feats):
        # feats = ['words2vec', 'topic1hot', 'pos']
        X, y = self.fe.getFeaturesFavorAgainst('train', feats)
        Xt, yt = self.fe.getFeaturesFavorAgainst('test', feats)
        svmclf = LinearSVC()
        fav_agnst_clf = GridSearchCV(svmclf, self.getGridSearchParams()).fit(X, y)
        pred = fav_agnst_clf.predict(Xt)
        print classification_report(yt, pred)
        return fav_agnst_clf

    def buildModel2(self):
        # One SVM for Stance/None and another for Favor/Against
        feats = ['words2vec', 'topic1hot', 'pos']
        print feats
        stance_none_clf = self.trainStanceNone(feats)
        fav_agnst_clf = self.trainFavorAgainst(feats)
        X_test, y_true = self.fe.getFeaturesMatrix('test', feats, 'stance')
        st_pred = stance_none_clf.predict(X_test)
        favaga_pred = fav_agnst_clf.predict(X_test)
        for index, row in enumerate(st_pred):
            if row == 3:
                st_pred[index] = favaga_pred[index]
        print classification_report(y_true, st_pred)
        print accuracy_score(y_true, st_pred)
        # assert(stance_none_clf.classes_[1] == 3)  # stance(3)
        # # decision_function > 0 predicts stance, < 0 predicts none;
        # # treat confi < -0.25 as a confident NONE and boost those predictions
        # confi = stance_none_clf.decision_function(X_test)
        # y_pred = fav_agnst_clf.predict(X_test)
        # threshold = -0.25
        # for loc in np.where(confi < threshold)[0]:
        #     y_pred[loc] = self.fe.labelenc.transform('NONE')
        # print 'Boosted', accuracy_score(y_true, y_pred)
        # pprint(self.eval.computeFscores(self.data.testTweets,
        #                                 self.fe.labelenc.inverse_transform(y_pred)))

    def get_proba_one(self, model, X):
        predicted = model.predict_proba(X)
        return predicted[:, 1]

    def runXGBoostModel(self, model, model_name, X, target, X_test, crossOn):
        print "Trying to fit model"
        print X.shape, target.shape
        model.fit(X, target)
        print "Successfully fit model"
        predicted = self.get_proba_one(model, X)
        predicted_test = self.get_proba_one(model, X_test)
        predicted_test = model.predict(X_test)
        print predicted_test
        return predicted_test

    def word2VecXGBoost(self):
        feats = ['words2vec', 'pos', 'clusteredLexicons', 'top1grams',
                 'top2grams', 'topic1hot']
        # feats = ['words2vec']
        y_attribute = 'stance'
        X, y = self.fe.getFeaturesMatrix('train', feats, y_attribute)
        print (X.shape)
        Xt, yt = self.fe.getFeaturesMatrix('test', feats, y_attribute)
        # clf = LinearSVC(C=0.01, penalty='l1', dual=False)  # SVM baseline kept for reference
        m2_xgb = xgb.XGBClassifier(n_estimators=10, nthread=-1, max_depth=2,
                                   seed=500)
        # m2_xgb = GridSearchCV(m2_xgb, self.getGridSearchParamsForXGBoost())
        print "Run Model"
        y_pred = self.runXGBoostModel(m2_xgb, "m2_xgb_OS_ENN", X, y, Xt, True)
        print accuracy_score(yt, y_pred)

    def buildModel3(self):
        # Train one probabilistic SVC per feature group, sum their
        # log-probabilities, and stack a logistic regression on top
        feats = [['words2vec'], ['pos'], ['clusteredLexicons']]
        y_attribute = 'stance'
        y_pred = []
        y_t = []
        for f in feats:
            X, y = self.fe.getFeaturesMatrix('train', f, y_attribute)
            Xt, yt = self.fe.getFeaturesMatrix('test', f, y_attribute)
            clf = SVC(C=1, probability=True)
            clf = clf.fit(X, y)
            train_transform = clf.predict_log_proba(X)
            test_transform = clf.predict_log_proba(Xt)
            y_pred.append(train_transform)
            y_t.append(test_transform)
        # Sum the per-feature log-probabilities (instead of hstacking them)
        y_pred_h = sum(y_pred)
        y_t_h = sum(y_t)
        regr = linear_model.LogisticRegression()
        regr.fit(y_pred_h, y)
        final_pred = regr.predict(y_t_h)
        print accuracy_score(final_pred, yt)
        pprint(self.eval.computeFscores(self.data.testTweets,
                                        self.fe.labelenc.inverse_transform(final_pred)))