def generate_positives(frame, bbox, max_shift=0.08, max_scale=0.08, nparray=True):
    bbox = (int(bbox[0]), int(bbox[1]), int(bbox[2]), int(bbox[3]))
    positives = []
    # roi = frame[bbox[1]:bbox[1]+bbox[3], bbox[0]:bbox[0]+bbox[2]]
    positives.append(FeatureExtractor.extract_hog_hsv(frame, bbox))
    # shift in 4 directions
    shift_samples = bboxShift(frame, bbox, max_shift=max_shift)
    for newbox in shift_samples:
        positives.append(FeatureExtractor.extract_hog_hsv(frame, newbox))
    # flip horizontally
    roi = frame[bbox[1]:bbox[1] + bbox[3], bbox[0]:bbox[0] + bbox[2]]
    positives.append(FeatureExtractor.extract_hog_hsv(cv2.flip(roi, 1)))
    positives.append(
        FeatureExtractor.extract_hog_hsv(cv2.GaussianBlur(roi, (5, 5), 15)))
    # 3 scaled samples
    scale_samples = bboxScale(frame, bbox, max_scale=max_scale)
    for newbox in scale_samples:
        positives.append(FeatureExtractor.extract_hog_hsv(frame, newbox))
    if nparray:
        return np.array(positives)
    else:
        return positives
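# A minimal usage sketch for generate_positives; the video path and the bbox
# values below are illustrative assumptions, not taken from the source.
import cv2

cap = cv2.VideoCapture("video.mp4")
ok, frame = cap.read()
if ok:
    # bbox is (x, y, w, h); the result stacks one HOG+HSV vector per
    # augmented sample (original, shifts, flip, blur, scales)
    pos = generate_positives(frame, (40, 60, 120, 200))
    print(pos.shape)
cap.release()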
def extractFeatures():
    samples = DataSample.objects.filter(datafeature=None)
    fs = FileSystemStorage()
    for i in samples:
        ftype = i.dataType
        print ftype
        if ftype == 'audio':
            [mfcc_path, diff_path, stft_path] = fe.extractAudioFeatures(fs.path(i.dataFile.name))
            with open(mfcc_path) as ff:
                f1 = DataFeature(dataSample=i, featureType='audio-mfcc', featureFile=File(ff))
                f1.save()
            with open(diff_path) as ff:
                f1 = DataFeature(dataSample=i, featureType='audio-mfcc-diff', featureFile=File(ff))
                f1.save()
            with open(stft_path) as ff:
                f2 = DataFeature(dataSample=i, featureType='audio-stft', featureFile=File(ff))
                f2.save()
        if ftype == 'image':
            file_paths = fe.extractImageFeatures([fs.path(i.dataFile.name)])
def __init__(self, featureList, parameters):
    self.featureList = featureList
    self.parameters = parameters
    self.svc = None
    self.scaler = None
    self.featureExtractor = FeatureExtractor(self.parameters, self.featureList)
    self.carData = None
    self.notCarData = None
    self.carPath = [
        "training_images\\vehicles\\vehicles\\GTI_Far",
        "training_images\\vehicles\\vehicles\\GTI_Left",
        "training_images\\vehicles\\vehicles\\GTI_MiddleClose",
        "training_images\\vehicles\\vehicles\\GTI_Right",
        "training_images\\vehicles\\vehicles\\KITTI_extracted"
    ]
    self.notCarPath = [
        "training_images\\non-vehicles\\non-vehicles\\Extras",
        "training_images\\non-vehicles\\non-vehicles\\GTI",
    ]
    self.carFeatures = None
    self.notCarFeatures = None
def run_extra_feature_extractor(self):
    config = self.CONFIG
    body_text_dir = config['body_text_dir']
    extra_feature_dir = config['extra_feature_dir']
    print 'begin run extra feature extractor from %s' % body_text_dir
    print 'Save extra features to %s' % extra_feature_dir
    FeatureExtractor.run_extra(body_text_dir, extra_feature_dir, config)
    print 'finish save extra features to %s' % extra_feature_dir
    return None
def run_feature_extractor(self):
    config = self.CONFIG
    body_text_dir = config['body_text_dir']
    feature_dir = config['feature_dir']
    print 'begin run feature extractor from %s' % body_text_dir
    print 'Save features to %s' % feature_dir
    FeatureExtractor.run(body_text_dir, feature_dir)
    print 'finish save features to %s' % feature_dir
    return None
def doClassification(self):
    if len(self.testData) > 0:
        print(__name__ + '\tTesting Data Shape : ' + str(self.testData.shape))
        self.testData = FeatureExtractor.preprocessData(self.testData)
        self.testFeatures = FeatureExtractor.getFeatures(self.testData)
        print(__name__ + '\tTest Features extracted')
        self.classify()
def extractFeaturesML(db_name, DIR):
    key_users = ['UserId']
    key_posts = ['PostId']
    # extracts all features
    extractor.extractForML(db_name, DIR)
    dfML = mf.mergeAll(DIR, key_users, key_posts, FeaturePredict)
    dfML.to_csv(DIR + 'dataML.csv', index=False)
    dfnorm = dp.normalize(dfML)
    return dfnorm
def __init__(self, featureList, parameters, svc, scaler):
    self.featureList = featureList
    self.parameters = parameters
    self.svc = svc
    self.scaler = scaler
    self.featureExtractor = FeatureExtractor(self.parameters, self.featureList)
    self.heatmapHistory = collections.deque(
        maxlen=parameters.SearchSettings.averages)
def extractLayerFeat(idx_ls, scale_size=224):
    extractor = FeatureExtractor(cache_folder=model_cache_folder,
                                 which_net='vgg16',
                                 which_layer=VC['layer'],
                                 which_snapshot=0)
    '''
    assert(os.path.isfile(Dict['Dictionary']))
    with open(Dict['Dictionary'], 'rb') as fh:
        _, centers, _ = pickle.load(fh)
    '''
    for ii in idx_ls:
        file_list = Dataset['file_list'].format(ii)
        with open(file_list, 'r') as fh:
            content = fh.readlines()
        img_list = [x.strip() for x in content]
        img_num = len(img_list)
        print('total number of images for idx {1}: {0}'.format(img_num, ii))
        img_list = img_list[0:3000]
        img_num = len(img_list)
        print('used number of images for idx {1}: {0}'.format(img_num, ii))
        feat_set = [None for nn in range(img_num)]
        for nn in range(img_num):
            file_img = os.path.join(Dataset['img_dir'], img_list[nn])
            assert (os.path.isfile(file_img))
            img = cv2.imread(file_img)
            patch = myresize(img, scale_size, 'short')
            layer_feature = extractor.extract_feature_image(patch)[0]
            assert (featDim == layer_feature.shape[2])
            feat_set[nn] = layer_feature
            '''
            iheight, iwidth = layer_feature.shape[0:2]
            layer_feature = layer_feature.reshape(-1, featDim)
            feat_norm = np.sqrt(np.sum(layer_feature**2, 1)).reshape(-1, 1)
            layer_feature = layer_feature / feat_norm
            dist = cdist(layer_feature, centers, 'cosine').reshape(iheight, iwidth, -1)
            assert(dist.shape[2] == centers.shape[0])
            r_set[nn] = dist
            '''
            if nn % 100 == 0:
                print(nn, end=' ')
                sys.stdout.flush()
        print('\n')
        file_cache_feat = os.path.join(
            Feat['cache_dir'], '{0}_feat_{1}.pickle'.format(VC['layer'], ii))
        with open(file_cache_feat, 'wb') as fh:
            pickle.dump(feat_set, fh)
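# Hedged sketch for reading back one cached feature file written above; the
# layer name 'pool4' and index 0 are placeholders standing in for VC['layer']
# and an entry of idx_ls, not values taken from the source.
import os
import pickle

with open(os.path.join(Feat['cache_dir'], 'pool4_feat_0.pickle'), 'rb') as fh:
    feat_set = pickle.load(fh)  # list of H x W x featDim arrays, one per image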
def append_vectors_to_t(t):
    # Get file names
    articlesDirectory = "../Articles/"
    fileNames = FileUtil.getFileNames(articlesDirectory)
    # Create feature vectors
    numOfFeatureVectors = len(fileNames)
    for file in fileNames:
        # fileArg is the path of each file; fVector is its feature vector,
        # which is normalized and then appended to t
        fileArg = articlesDirectory + file
        fVector = FeatureExtractor.createFeatureVector(fileArg)
        fVectorNormalized = FeatureExtractor.normalize(fVector)
        t.append(fVectorNormalized)  # append feature vectors to t
def compute_features(config, mp=False):
    import FeatureExtractor
    import movieqa_importer

    feature_extractor = FeatureExtractor.FeatureExtractor(config['video_features'])
    mqa = movieqa_importer.MovieQA.DataLoader()
    vl_qa, _ = mqa.get_video_list('full', 'all_clips')
    desc = config['video_features']
    document_type = 'video_clips'
    check_save_directory(filename=utils.DOC_DESC_TEMPLATE % (document_type, desc, ''))
    videos = []
    outputs = []
    for i, imdb_key in enumerate(vl_qa.keys()):
        current_videos = map(lambda x: os.path.join(
            '/cs/vml4/smuralid/datasets/movieqa/MovieQA_benchmark',
            'story', document_type, imdb_key, x), vl_qa[imdb_key])
        videos.extend(current_videos)
        output = map(lambda x: os.path.join(
            (utils.DOC_DESC_TEMPLATE % (document_type, desc, imdb_key))[:-4],
            x.replace('.mp4', '')), vl_qa[imdb_key])
        outputs.extend(output)
    if mp:
        inputs = [[x, y] for x, y in zip(videos, outputs)]
        from random import shuffle
        shuffle(inputs)
        # pool = Pool(mp)
        # pool.map(feature_extractor.extract_features, inputs)
        for x in inputs:
            feature_extractor.extract_features(x)
    else:
        for video, output in zip(videos, outputs):
            feature_extractor.extract_features([video, output])
def init_static_dialog_agent(args):
    print "reading in Ontology"
    ont = Ontology.Ontology(sys.argv[1])
    print "predicates: " + str(ont.preds)
    print "types: " + str(ont.types)
    print "entries: " + str(ont.entries)

    print "reading in Lexicon"
    lex = Lexicon.Lexicon(ont, sys.argv[2])
    print "surface forms: " + str(lex.surface_forms)
    print "categories: " + str(lex.categories)
    print "semantic forms: " + str(lex.semantic_forms)
    print "entries: " + str(lex.entries)

    print "instantiating Feature Extractor"
    f_extractor = FeatureExtractor.FeatureExtractor(ont, lex)

    print "instantiating Linear Learner"
    learner = LinearLearner.LinearLearner(ont, lex, f_extractor)

    print "instantiating KBGrounder"
    grounder = KBGrounder.KBGrounder(ont)

    load_parser_from_file = False
    if len(args) > 4:
        if args[4].lower() == 'true':
            load_parser_from_file = True
    if load_parser_from_file:
        parser = load_model('static_parser')
        grounder.parser = parser
        grounder.ontology = parser.ontology
    else:
        print "instantiating Parser"
        parser = Parser.Parser(ont, lex, learner, grounder, beam_width=10, safety=True)

    print "instantiating Generator"
    generator = Generator.Generator(ont, lex, learner, parser, beam_width=sys.maxint, safety=True)

    print "instantiating DialogAgent"
    static_policy = StaticDialogPolicy.StaticDialogPolicy()
    A = StaticDialogAgent(parser, generator, grounder, static_policy, None, None)

    if not load_parser_from_file:
        print "reading in training data"
        D = A.read_in_utterance_action_pairs(args[3])
        if len(args) > 4 and args[4] == "both":
            print "training parser and generator jointly from actions"
            converged = A.jointly_train_parser_and_generator_from_utterance_action_pairs(
                D, epochs=10, parse_beam=30, generator_beam=10)
        else:
            print "training parser from actions"
            converged = A.train_parser_from_utterance_action_pairs(
                D, epochs=10, parse_beam=30)
        print "theta: " + str(parser.learner.theta)
        save_model(parser, 'static_parser')

    return A
def rate(self, board):
    # neural network will go here
    if self.net is None:
        return random.random()
    else:
        return self.net.predict([np.array(fe.extract_features(board))])[0, 0]
def positive_score(frame, bbox, classifier='svm'):
    features = FeatureExtractor.extract_hog_hsv(frame, bbox)
    if classifier == 'adaboost':
        return bdt.score(np.array([features]), [0])
    # return clf.score(np.array([features]), [0])
    score = clf.predict_proba([features])
    # print(score)
    return score[0][0]
def upload_agent():
    if request.method == 'POST':
        # filez = request.files['file']
        # print(filez.filename, file=sys.stderr)
        # extension = os.path.splitext(filez.filename)[1]
        # f_name = str(uuid.uuid4()) + extension
        # filez.save(os.path.join('./', f_name))
        data_dict = dict(request.form)
        jdata = data_dict['data']  # the form field carrying the audio samples
        myjson = jdata[0]
        mydata = json.loads(myjson)['data'][0]
        # save the raw samples to a wav file
        outputfilename = "./temp_agent.wav"
        with open(outputfilename, 'wb') as output:
            # build the byte payload directly; mapping chr() over the samples
            # would fail when writing to a binary file in Python 3
            output.write(bytearray(x % 256 for x in mydata))
        # the emotion detector model and scaler to be used
        global model
        global Rescaler
        file_features = FeatureExtractor.extract_features(outputfilename)
        file_features = Rescaler.transform(file_features)
        prediction = model.predict(file_features)
        # NOTE: the "Hapiness" misspelling is kept for message compatibility
        body = {"mode": "sync",
                "messageType": "70281a5b78eba98c2e2c",
                "messages": [{"Anger": round(prediction[0][0], 2),
                              "Disgust": round(prediction[0][1], 2),
                              "Fear": round(prediction[0][2], 2),
                              "Hapiness": round(prediction[0][3], 2),
                              "Neutral": round(prediction[0][4], 2),
                              "Sadness": round(prediction[0][5], 2),
                              "Surprise": round(prediction[0][6], 2)}]}
        print('AGENT EMOTIONS')
        # try:
        #     r = http.urlopen('POST', url, body=str(body), headers=headers)
        #     print(r.status)
        #     print(r.data)
        # except urllib3.exceptions.SSLError as e:
        #     print(e)
        # thread = SubmitterTDR(1, body)
        # thread.start()
        # return an array of class probabilities
        return json.dumps({'predictions': prediction.tolist()[0]})
def extractFeatures():
    emails = fe.read_from_disk(INBOX_DIRECTORY, 2)
    inboxEmails = []
    # separate sent and inbox emails
    for email in emails:
        email = fe.parseEmail(email)
        inboxEmails.append(email)
    inboxEmailFeatures = []
    output = []
    # find numerical features
    for email in inboxEmails:
        features = {}
        features["sender_frequency"] = fe.senderFrequency(
            email["From"], inboxEmails)
        features["is_automated_mail"] = fe.words_present(
            email["emailText"], fe.negative_words)
        if email["Subject"]:
            features["is_interrogative_text"] = fe.words_present(
                email["emailText"] + email["Subject"], fe.interrogative_words)
        else:
            features["is_interrogative_text"] = fe.words_present(
                email["emailText"], fe.interrogative_words)
        features["Cc"] = email["Cc"]
        # features["reply"] = replied(email["Message-ID"], sentEmails)
        featureslist = fe.dictList(features)
        inboxEmailFeatures.append(featureslist)
        # print("Mail Text: ", email['emailText'])
    # note: the second return value is the text of the last parsed email
    return inboxEmailFeatures, email['emailText']
def test(doc, name):
    f = open(name, "w")
    frequencies0 = FeatureExtractor.frequency(doc[:2], True, True)  # frequency counts smoothed by 1
    frequencies1 = FeatureExtractor.augmented_frequency(frequencies0)  # augmented frequencies accounting for document size
    frequencies = FeatureExtractor.idf(frequencies1)  # idfs
    total = frequencies["**prob**"]
    totals = sum(total)
    priors = [0.33, 0.33, 0.329]  # based on number of documents
    a = ["C2", "IKEA_EN", "IKEA_IT"]
    with open(doc[2], "r") as mefile:
        for line in mefile:
            lines = line.split('\t')
            ID = lines[0]
            words = lines[4].replace("<s>", "").replace("</s>", "").split(" ")
            pC2 = 0
            pIKEA_IT = 0
            pIKEA_EN = 0
            for word in words:
                if word in frequencies:
                    pC2 += math.log(frequencies[word][0])
                    pIKEA_EN += math.log(frequencies[word][1])
                    pIKEA_IT += math.log(frequencies[word][2])
                else:
                    pC2 += math.log(0.5)
                    pIKEA_EN += math.log(0.5)
                    pIKEA_IT += math.log(0.5)
            b = [pC2 + math.log(priors[0]),
                 pIKEA_EN + math.log(priors[1]),
                 pIKEA_IT + math.log(priors[2])]
            proposal = a[b.index(max(b))]
            f.write(ID + "\t" + proposal + "\n")
    f.close()
class PriceCheck(object):
    def __init__(self):
        self.wi = WebInterface()
        self.fe = FeatureExtractor("")
        self.price = ''
        self.purl = ''
        self.pid = 0
        self.modeset = 'accurate'

    def set_new_mode(self, new_modeset):
        self.modeset = new_modeset

    def reinit(self, purl, pid):
        self.purl = purl
        self.pid = pid

    def deinit(self):
        self.fe.deinit()

    def call_wi_to_update_price(self):
        if self.price:
            self.wi.price_update(self.pid, self.price)

    def get_price(self):
        self.fe.reinit(self.purl)
        self.fe.set_mode(self.modeset)  # 'accurate'
        self.fe.run()
        self.price = self.fe.price
        self.sale_price = self.fe.sale_price
        self.regular_price = self.fe.regular_price
        self.was_price = self.fe.was_price

    def single_run(self):
        products = self.wi.get_prod_list_for_price_update()
        if not products:
            print "No more products to check price for."
            return
        for prod in products:
            if 'Product' in prod:
                purl = prod['Product']['purl']
                pid = prod['Product']['id']
                print "Getting new price for : " + purl
                print "Pid : " + pid
                self.reinit(purl, pid)
                self.get_price()
                print "New price :" + str(self.price)
                self.call_wi_to_update_price()
                print "---"

    def run(self):
        while True:
            self.single_run()
            print "...zzz...5 mins...zzz..."
            time.sleep(300)
def getAverageVector(author):
    x = []
    articlesDirectory = "../Articles/"
    fileNames = FileUtil.getFileNames(articlesDirectory)
    for file in fileNames:
        if author in file:
            fileArg = articlesDirectory + file
            fVector = FeatureExtractor.createFeatureVector(fileArg)
            fVectorNormalized = FeatureExtractor.normalize(fVector)
            x.append(fVector)
    avgVector = []
    for i in range(len(x[0])):
        sumElement = 0  # reset the running sum for each feature dimension
        for j in range(len(x)):
            sumElement += x[j][i]
        # divide by the number of vectors, not the number of dimensions
        avgVector.append(sumElement / len(x))
    return avgVector
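# The column-wise averaging above can be written as a one-line numpy mean; a
# sketch assuming x is a non-empty list of equal-length feature vectors.
import numpy as np

def average_vector(x):
    # mean over samples for each feature dimension
    return np.mean(np.asarray(x, dtype=float), axis=0).tolist()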
def predict(frame, bbox, classifier="svm"):
    bbox = (int(bbox[0]), int(bbox[1]), int(bbox[2]), int(bbox[3]))
    features = FeatureExtractor.extract_hog_hsv(frame, bbox)
    if classifier == 'adaboost':
        pred = bdt.predict(np.array([features]))
    else:
        pred = clf.predict(np.array([features]))
    # print(pred)
    if pred[0] == 0:
        return True
    return False
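# Hedged sketch of predict/positive_score in a per-frame loop; the capture
# source and the initial bbox are assumptions, not taken from the source.
import cv2

cap = cv2.VideoCapture(0)
bbox = (100, 80, 64, 128)  # x, y, w, h
ok, frame = cap.read()
while ok:
    if predict(frame, bbox):  # True when the classifier labels the patch as the target
        score = positive_score(frame, bbox)
        print(score)
    ok, frame = cap.read()
cap.release()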
def run_BOW_baseline(args):
    print(args)
    train, dev, test = load_datasets(args)
    classnames = list(set(map(lambda tweet: tweet.label, train)))
    train_y = map(lambda tweet: classnames.index(tweet.label), train)
    dev_y = map(lambda tweet: classnames.index(tweet.label), dev)
    test_y = map(lambda tweet: classnames.index(tweet.label), test)

    # fx = FeatureExtractor(["BOW"], stopwords=args.stopwords)
    fx = FeatureExtractor(["hand-coded"], stopwords=args.stopwords)
    fx.build_vocab(train)
    train_x = np.asarray(map(lambda tweet: fx.process(tweet), train))
    check = train_x[0]
    print("sample fv shape: ", check.shape)
    dev_x = np.asarray(map(lambda tweet: fx.process(tweet), dev))
    test_x = np.asarray(map(lambda tweet: fx.process(tweet), test))

    nclasses = len(classnames)
    ntrain = train_x.shape[0]
    nbatches = 100
    batch_size = ntrain / nbatches
    train_data = (train_x, train_y)
    dev_data = (dev_x, dev_y)
    test_data = (test_x, test_y)
    neural_net.logistic_regression_optimization_sgd(train_data, dev_data, test_data,
                                                    nclasses, batch_size=batch_size)

    print("train set performance:")
    train_ypred = neural_net.predict(train_x, train_y)
    print(evaluate.ConfusionMatrix(train_y, train_ypred, classnames))
    print("validation set performance:")
    dev_ypred = neural_net.predict(dev_x, dev_y)
    print(evaluate.ConfusionMatrix(dev_y, dev_ypred, classnames))
    print("test set performance:")
    test_ypred = neural_net.predict(test_x, test_y)
    print(evaluate.ConfusionMatrix(test_y, test_ypred, classnames))
def init_pomdp_dialog_agent(args):
    print "Reading in Ontology"
    ont = Ontology.Ontology(args[1])
    print "predicates: " + str(ont.preds)
    print "types: " + str(ont.types)
    print "entries: " + str(ont.entries)

    print "Reading in Lexicon"
    lex = Lexicon.Lexicon(ont, args[2])
    print "surface forms: " + str(lex.surface_forms)
    print "categories: " + str(lex.categories)
    print "semantic forms: " + str(lex.semantic_forms)
    print "entries: " + str(lex.entries)

    print "Instantiating Feature Extractor"
    f_extractor = FeatureExtractor.FeatureExtractor(ont, lex)

    print "Instantiating Linear Learner"
    learner = LinearLearner.LinearLearner(ont, lex, f_extractor)

    print "Instantiating KBGrounder"
    grounder = KBGrounder.KBGrounder(ont)

    load_models_from_file = False
    if len(args) > 4:
        if args[4].lower() == 'true':
            load_models_from_file = True
    if load_models_from_file:
        parser = load_model('pomdp_parser')
        grounder.parser = parser
        grounder.ontology = parser.ontology
    else:
        print "Instantiating Parser"
        parser = Parser.Parser(ont, lex, learner, grounder, beam_width=10)

    print "Instantiating DialogAgent"
    if load_models_from_file:
        agent = PomdpDialogAgent(parser, grounder, None, None, parse_depth=10,
                                 load_policy_from_file=True)
    else:
        agent = PomdpDialogAgent(parser, grounder, None, None, parse_depth=10,
                                 load_policy_from_file=False)

    if not load_models_from_file:
        print "reading in data and training parser from actions"
        D = agent.read_in_utterance_action_pairs(args[3])
        converged = agent.train_parser_from_utterance_action_pairs(D, epochs=10, parse_beam=30)
        print "theta: " + str(parser.learner.theta)
        save_model(parser, 'pomdp_parser')
        # print 'Parser ontology : ', parser.ontology.preds

    return agent
def main():
    print("\nFetching Emails...\n")
    # gui.interfaceFetchEmails()
    username = "******"
    password = "******"
    # ef.login(username, password)
    print("Extracting Features...\n")
    clss = fe.extractFeatures()
    pr.login(username, password)
    status, msg = pr.predict(clss)
    print(msg)
    if status:
        ts.pred_ans(msg)
def run(is_changing_n=False, number_to_ngram=3, number_of_features=220,
        fs_method=FEATURE_SELECTION_MOST_COMMON, c_method=CLASSIFIER_ONE_CLASS_SVM):
    # set param for final calcs
    all_ans = []
    # Feature extraction
    out_0_file = Path(out_0_path)
    if is_changing_n:
        fe.export_to_csv_all_users(number_to_ngram)
    if not out_0_file.exists():
        fe.export_to_csv_all_users(number_to_ngram)
    for user_number in range(0, 5):
        # Feature selection
        FeatureSelection.select_features(number_of_features, fs_method, user_number)
        # Classifier
        ans = Classifier.classify(number_of_features, c_method)
        all_ans.append(ans)
    print("""
    ** FINAL SCORE : {} **
    """.format(sum(all_ans) / len(all_ans)))
def run():
    ''' Establish directory paths for training and validation data '''
    ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
    sunny_train_dirpath = ROOT_DIR + '/data/c_sunny/'
    overcast_train_dirpath = ROOT_DIR + '/data/c_overcast/'
    sunny_test_dirpath = ROOT_DIR + '/data/3000_images_test/c_sunny/'
    overcast_test_dirpath = ROOT_DIR + '/data/3000_images_test/c_overcast/'

    ''' Perform feature extraction on training data '''
    sunny_train_c = [f for f in os.listdir(sunny_train_dirpath)]
    overcast_train_c = [f for f in os.listdir(overcast_train_dirpath)]
    train_feats = extractor.get_hog_hist_features(sunny_train_c, overcast_train_c,
                                                  sunny_train_dirpath, overcast_train_dirpath)

    ''' Perform feature extraction on validation data '''
    sunny_test_c = [f for f in os.listdir(sunny_test_dirpath)]
    overcast_test_c = [f for f in os.listdir(overcast_test_dirpath)]
    test_feats = extractor.get_hog_hist_features(sunny_test_c, overcast_test_c,
                                                 sunny_test_dirpath, overcast_test_dirpath)

    ''' Naive Bayes Classifier '''
    train_feats = shuffle(train_feats)
    test_feats = shuffle(test_feats)
    X_train = train_feats.iloc[:, :len(train_feats.columns) - 1]
    y_train = train_feats['label']
    X_test = test_feats.iloc[:, :len(test_feats.columns) - 1]
    y_test = test_feats['label']
    clf_nb = GaussianNB()
    model = clf_nb.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc_score = accuracy_score(y_test, y_pred, normalize=True)
    mat = confusion_matrix(y_test, y_pred)
    print(mat)
    print("Number of mislabeled points out of a total {0} points : {1}".format(
        len(X_test), (y_test != y_pred).sum()))
    print("Naive Bayes Classifier test accuracy = ", acc_score)
def generate_sets():
    people = os.listdir('Datasets')
    del people[people.index('Test')]
    X = []
    Y = []
    for person in people:
        samples = os.listdir('Datasets//' + person)
        X += [FeatureExtractor.extract_features('Datasets//{}//{}'.format(person, sample))[0]
              for sample in samples]
        Y += [int('Alex' in person) for _ in range(len(samples))]
        time.sleep(1)
    X, Y = shuffle(X, Y, random_state=0)
    X = np.array(X)
    return X, Y
def train(trainingSet, subredditLabels, args):
    numIterations = 20
    eta = 0.05
    # dictionary of dictionaries (weights)
    weightDict = {}
    for label in subredditLabels:
        weightDict[label] = {}

    def gradLoss(phiX, w, y):
        # hinge-loss subgradient; note phiX is overwritten in place
        score = util.dotProduct(w, phiX)
        margin = score * y
        if margin < 1:
            for name, feature in phiX.iteritems():
                phiX[name] = -1 * y * feature
            return phiX
        else:
            return 0

    for label in subredditLabels:
        trainingSet.seek(0)
        weightVector = weightDict[label]
        for i in range(numIterations):
            for example in trainingSet:
                example = json.loads(example)
                title = example['title']
                subreddit = example['subreddit']
                features = FeatureExtractor.extractFeatures(title, args)
                y = -1
                if label == subreddit:
                    y = 1
                grad = gradLoss(features, weightVector, y)
                if grad != 0:
                    util.increment(weightVector, -1 * eta, grad)
                    weightDict[label] = weightVector
                else:
                    weightDict[label] = weightVector
    return weightDict
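# Hedged end-to-end sketch: train the one-vs-all hinge-loss classifiers on a
# JSON-lines file of posts, then score a held-out file. The file names, label
# list, and the args object (an argparse namespace) are illustrative
# assumptions, not values from the source.
labels = ["askscience", "worldnews", "funny"]
with open("train.jsonl") as train_f:
    weights = train(train_f, labels, args)
with open("test.jsonl") as test_f:
    predict(weights, test_f, args)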
def analyse(self, image, orig):
    image_features = FeatureExtractor.getFeatures(image)
    image_features = np.array([image_features])
    image_features = self.scaler.transform(image_features)
    confidence = np.amax(self.classifier.predict_proba(image_features))
    if confidence > 0.7:
        self.last_gesture = str([self.training_names[i]
                                 for i in self.classifier.predict(image_features)][0])
        orig = self.writeLastGesture(orig)
    else:
        self.last_gesture = "Unsure"
        orig = self.writeLastGesture(orig)
    return image, orig, self.last_gesture
def init_dialog_agent(args):
    print "Reading in Ontology"
    ont = Ontology.Ontology(args[1])
    print "predicates: " + str(ont.preds)
    print "types: " + str(ont.types)
    print "entries: " + str(ont.entries)

    print "Reading in Lexicon"
    lex = Lexicon.Lexicon(ont, args[2])
    print "surface forms: " + str(lex.surface_forms)
    print "categories: " + str(lex.categories)
    print "semantic forms: " + str(lex.semantic_forms)
    print "entries: " + str(lex.entries)

    print "Instantiating Feature Extractor"
    f_extractor = FeatureExtractor.FeatureExtractor(ont, lex)

    print "Instantiating Linear Learner"
    learner = LinearLearner.LinearLearner(ont, lex, f_extractor)

    print "Instantiating KBGrounder"
    grounder = KBGrounder.KBGrounder(ont)

    print "Instantiating Parser"
    parser = Parser.Parser(ont, lex, learner, grounder, beam_width=10)
    parser = load_model('parser')
    grounder.parser = parser
    grounder.ontology = parser.ontology

    print "Instantiating DialogAgent"
    agent = PomdpDialogAgent(parser, grounder, None, None)

    # print "reading in data and training parser from actions"
    # D = agent.read_in_utterance_action_pairs(args[3])
    # converged = agent.train_parser_from_utterance_action_pairs(D, epochs=10, parse_beam=30)
    # print "theta: " + str(parser.learner.theta)
    # save_model(parser, 'parser')
    # print 'Parser ontology : ', parser.ontology.preds

    return agent
def predict(weights, testSet, args):
    correct = 0
    incorrect = 0
    total = 0
    for data in testSet:
        data = json.loads(data)
        title = data['title']
        subreddit = data['subreddit']
        features = FeatureExtractor.extractFeatures(title, args)
        maxScore = float('-inf')
        prediction = ''
        for key in weights.keys():
            weightVector = weights[key]
            score = util.dotProduct(weightVector, features)
            if score > maxScore:
                prediction = key
                maxScore = score
        if prediction == subreddit:
            correct += 1
        else:
            if args.verbose:
                try:
                    print title
                    print "predicted: " + prediction.encode('utf-8')
                    print features
                    printRelevantWeights(weights, features)
                    print "-----------------"
                except UnicodeEncodeError:
                    print "error"
            incorrect += 1
        total += 1
    print 'accuracy ' + str(float(correct) / total)
    print 'wrong ' + str(float(incorrect) / total)
def run_word2vec_baseline(args):
    print(args)
    print('subtask id: %s' % args.subtask_id)
    train, dev, test = load_datasets(args)
    classnames = list(set(map(lambda tweet: tweet.label, train)))
    train_y = map(lambda tweet: classnames.index(tweet.label), train)
    dev_y = map(lambda tweet: classnames.index(tweet.label), dev)
    test_y = map(lambda tweet: classnames.index(tweet.label), test)

    fx = FeatureExtractor(["word2vec"], word2vec_model=args.word2vec_model)
    fx.build_vocab(train)
    train_x = np.asarray(map(lambda tweet: fx.process(tweet), train))
    check = train_x[0]
    print("sample fv shape: ", check.shape)
    dev_x = np.asarray(map(lambda tweet: fx.process(tweet), dev))
    test_x = np.asarray(map(lambda tweet: fx.process(tweet), test))

    nclasses = len(classnames)
    ntrain = train_x.shape[0]
    nbatches = 100
    batch_size = ntrain / nbatches
    train_data = (train_x, train_y)
    dev_data = (dev_x, dev_y)
    test_data = (test_x, test_y)
    neural_net.logistic_regression_optimization_sgd(train_data, dev_data, test_data,
                                                    nclasses, batch_size=batch_size)

    print("train set performance:")
    train_ypred = neural_net.predict(train_x, train_y)
    print(evaluate.ConfusionMatrix(train_y, train_ypred, classnames))
    print("validation set performance:")
    dev_ypred = neural_net.predict(dev_x, dev_y)
    print(evaluate.ConfusionMatrix(dev_y, dev_ypred, classnames))
    print("test set performance:")
    test_ypred = neural_net.predict(test_x, test_y)
    print(evaluate.ConfusionMatrix(test_y, test_ypred, classnames))
def __init__(self, Device=1, Input=True, Channels=2, THRESHOLD=500,
             CHUNK_SIZE=1024, FORMAT=pyaudio.paInt16, RATE=8000,
             RECORD_SECONDS=3.25, WAVE_OUTPUT_FILENAME_EXTENSION=0,
             EXPORT_FOLDER="Recordings", WAVE_OUTPUT_FILENAME="output",
             BASELINE='baseline_mean_sd.pickle',
             URL="http://localhost:50000/annotate"):
    self.myqueue = deque([])
    self.Input = Input
    self.Device = Device
    self.Channels = Channels
    self.URL = URL
    if not Input:
        self.Output = True
        self.Channels = 0
    self.THRESHOLD = THRESHOLD
    self.EXPORT_FOLDER = EXPORT_FOLDER
    self.CHUNK_SIZE = CHUNK_SIZE
    self.FORMAT = FORMAT
    self.RATE = RATE
    self.RECORD_SECONDS = RECORD_SECONDS
    self.WAVE_OUTPUT_FILENAME_EXTENSION = WAVE_OUTPUT_FILENAME_EXTENSION
    self.WAVE_OUTPUT_FILENAME = WAVE_OUTPUT_FILENAME
    self.q = Queue()
    self.lock = threading.Lock()
    self.stopRecordingEvent = None
    self.fe = FeatureExtractor.FeatureExtractor(BASELINE)
def search(request):
    if request.POST:
        query = request.POST['q']
        # add the query term to the search-history list
        res['history'].append(query)
        # run query recommendation
        result = FeatureExtractor.do_query(query)
        # unpack the recommended terms and their similarity scores
        similarity, recommend = zip(*result)
        # take the precision at the TOP_K-th recommendation; if there are
        # fewer than TOP_K recommendations, take the last one
        if len(similarity) >= FeatureExtractor.TOP_K:
            res['precise'] = similarity[FeatureExtractor.TOP_K - 1]
        else:
            res['precise'] = similarity[-1]
        res['sentiment'] = sentiment_analysis.getscore_recommend(recommend)
        res['recommend'] = recommend
    elif request.GET:
        pass
    return render(request, "search_post.html", res)
def main():
    print "Loading previously stored HMM..."
    hmm = get_stored_hmm()
    # hmm = getHMM()
    print hmm.A
    print
    print hmm.B
    print hmm.pi
    print "HMM Created successfully"

    print "Loading previously stored RF model"
    RF = joblib.load('RFModel.pk')
    print "RF loaded successfully"

    # Feature Extractor
    FE = FeatureExtractor()
    FE.set_class(1)

    # Open socket and start listening for data
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    s.bind((HOST, PORT))
    s.listen(1)
    subactivities = get_subactivities()
    activities = get_activities()
    conn, addr = s.accept()
    while True:
        data = conn.recv(20000)
        if data:
            data = data.split('\n')
            # Remove extra newline
            del data[-1]
            csv_data = []
            for d in data:
                if len(d) >= 53:
                    d2 = d[14:].strip('"')
                    d3 = np.fromstring(d2, dtype=int, sep=',')
                    csv_data.append(d3.tolist())
            if len(csv_data) > 0:
                csv_data = np.array(csv_data)
                # row, col = csv_data.shape
                features = []
                for x in range(4):
                    f = FE.get_featurelist_from_nparray(csv_data, 2 * x, 2 * x + 1)
                    for y in f:
                        p = RF.predict(y[:25])[0]
                        if len(history[x]) >= HISTORY_LEN:
                            history[x].pop(0)
                            history_labels[x].pop(0)
                        history[x].append(p)
                        history_labels[x].append(subactivities[int(p)])
                for sub in subactivities:
                    print sub, get_subactivity_class(sub)
                for z in range(4):
                    hmm.mapB(history[z])
                    alpha = hmm.calculate_alpha(history[z])
                    # alpha_normalized = alpha
                    alpha_normalized = alpha.astype('float') / alpha.sum(axis=1)[:, np.newaxis]
                    c = most_common(history[z])
                    if len(history) > 12:
                        print z + 1, history_labels[z][12:], subactivities[int(c)]
                    else:
                        print z + 1, history_labels[z][5:], subactivities[int(c)]
                    print alpha_normalized[-1]
                    print
                print "\n--------------\n"
from FeatureExtractor import *

'''
run this script to perform feature extraction and store the vectors as files
'''
if __name__ == '__main__':
    ex = FeatureExtractor()
    ex.processAllDocs()
    print "Done processing documents"
    ex.build()
    ex.TFIDF()
    print "Done generating feature vectors"
    ex.saveAllVectors()
    ex.saveWordList()
    ex.saveFilenames()
    print "Features, wordList and filenames have been saved."
featureSetsToUse["statistics"] = True  # counting statistics
featureSetsToUse["frequency"] = True   # frequency statistics
featureSetsToUse["hapax"] = True       # hapax count

wf = open("deleteme.txt", 'w')
wf.writelines("Sentence ||| proper noun percentage, word variance, prp, wordCount\n")
wf.writelines("========================================================================\n\n")
for data in positives:
    wf.writelines(data.chunk + "\n")
    # wf.writelines(str(data.features))
    # wf.writelines("\n")
    feats = FeatureExtractor.langFeatures(data, featureSetsToUse)
    for i in feats.keys():
        wf.writelines(i + " ")
        wf.writelines(str(feats[i]))
        wf.writelines(" ")
    wf.writelines("\n")
wf.writelines("========================================================================\n")
for data in negatives:
    wf.writelines(data.chunk + "\n")
    # wf.writelines(str(data.features))
    # wf.writelines("\n")
    feats = FeatureExtractor.langFeatures(data, featureSetsToUse)
def main():
    # print "Loading previously stored HMM..."
    # hmm = get_stored_hmm()
    # # hmm = getHMM()
    # print hmm.A
    # print
    # print hmm.B
    # print hmm.pi
    # print "HMM Created successfully"
    tlist = []
    print "Loading previously stored RF model"
    RF = joblib.load('RFModel.pk')
    print "RF loaded successfully"

    # Feature Extractor
    FE = FeatureExtractor()
    FE.set_class(1)

    # Open socket and start listening for data
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    s.bind((HOST, PORT))
    s.listen(1)
    subactivities = get_subactivities()
    activities = get_activities()
    conn, addr = s.accept()
    while True:
        data = conn.recv(20000)
        if data:
            start = time.clock()
            data = data.split('\n')
            # Remove extra newline
            del data[-1]
            csv_data = []
            for d in data:
                if len(d) >= 53:
                    d2 = d[14:].strip('"')
                    d3 = np.fromstring(d2, dtype=int, sep=',')
                    csv_data.append(d3.tolist())
            if len(csv_data) > 0:
                csv_data = np.array(csv_data)
                features = []
                for x in range(4):
                    f = FE.get_featurelist_from_nparray(csv_data, 2 * x + 1, 2 * x + 2)
                    for y in f:
                        row, col = y.shape
                        p = RF.predict(y[:col - 2])[0]
                        if len(history[x]) >= HISTORY_LEN:
                            history[x].pop(0)
                            history_labels[x].pop(0)
                        history[x].append(p)
                        history_labels[x].append(subactivities[int(p)])
                for sub in subactivities:
                    print sub, get_subactivity_class(sub)
            elapsed = (time.clock() - start)
            tlist.append(elapsed)
            print tlist
def main():
    print "\nFetching Emails...\n"
    # gui.interfaceFetchEmails()
    print "Extracting Features...\n"
    fe.extractFeatures()
def main(args):
    print(args)
    train = load_from_tsv(args.train_file)
    dev = load_from_tsv(args.dev_file)
    test = load_from_tsv(args.test_file)
    print("ntrain: %d, ndev: %d, ntest: %d" % (len(train), len(dev), len(test)))
    classnames = list(set(map(lambda tweet: tweet.label, train)))
    window_size = 20

    print("computing features (train)")
    fx = FeatureExtractor(["word2vec"], word2vec_model=args.word2vec_model)
    train_y = []
    train_x = []
    for tw in train:
        label = classnames.index(tw.label)
        m1, m2 = fx.process_word2vec_noagg(tw, window_size)
        train_x.append(m1)
        train_y.append(label)
        if m2 is not None:
            train_x.append(m2)
            train_y.append(label)
    train_y = np.asarray(train_y)
    train_x = np.asarray(train_x)
    print("train_x: ", train_x.shape)

    print("computing features (dev)")
    dev_y = []
    dev_x = []
    for tw in dev:
        label = classnames.index(tw.label)
        m1, m2 = fx.process_word2vec_noagg(tw, window_size)
        dev_x.append(m1)
        dev_y.append(label)
        if m2 is not None:
            dev_x.append(m2)
            dev_y.append(label)
    dev_y = np.asarray(dev_y)
    dev_x = np.asarray(dev_x)

    print("computing features (test)")
    test_y = []
    test_x = []
    for tw in test:
        label = classnames.index(tw.label)
        m1, m2 = fx.process_word2vec_noagg(tw, window_size)
        test_x.append(m1)
        test_y.append(label)
        if m2 is not None:
            test_x.append(m2)
            test_y.append(label)
    test_y = np.asarray(test_y)
    test_x = np.asarray(test_x)
    print("done")

    nclasses = len(classnames)
    train_data = (train_x, train_y)
    dev_data = (dev_x, dev_y)
    test_data = (test_x, test_y)
    neural_net.train_cnn(train_data, dev_data, test_data, nclasses, window_size=window_size)
def main():
    ef.login()
    fe.extractFeatures()
__author__ = 'hafiz'
import numpy as np
from FeatureExtractor import *
from drawfigure import *
from Filter.datafilter import *

feobj = FeatureExtractor()

def get_actual_velocity(frames):
    v = (50 * frames * 3) / (3.14 * 58)
    return v

def check_fft(y):
    import matplotlib.pyplot as plt
    t = np.arange(len(y))
    sp = np.fft.fft(y)
    freq = np.fft.fftfreq(t.shape[-1])
    plt.plot(freq, sp.real, freq, sp.imag)
    plt.show()

def check_fft1(y):
    from pylab import plot, show, title, xlabel, ylabel, subplot
    from scipy import fft, arange
    import numpy as np
    import pylab as pl
    rate = 200.0
    t = np.arange(0, len(y), 1 / rate)
    x = np.sin(2 * np.pi * 4 * t) + np.sin(2 * np.pi * 7 * t) + np.random.randn(len(t)) * 0.2
def validate(doc, name):
    f = open(name, "w")
    frequencies0 = FeatureExtractor.frequency(doc[:2])  # frequency counts smoothed by 1
    frequencies1 = FeatureExtractor.augmented_frequency(frequencies0)  # augmented frequencies accounting for document size
    frequencies = FeatureExtractor.idf(frequencies1)  # idfs
    total = frequencies["**prob**"]
    totals = sum(total)
    priors = [0.33, 0.33, 0.328]  # based on number of documents
    a = ["C2", "IKEA_EN", "IKEA_IT"]
    correct = 0
    number = 0
    tpC2 = 0
    tpIKEA_EN = 0
    tpIKEA_IT = 0
    fpC2 = 0
    fpIKEA_EN = 0
    fpIKEA_IT = 0
    fnC2 = 0
    fnIKEA_EN = 0
    fnIKEA_IT = 0
    with open(doc[2], "r") as mefile:
        for line in mefile:
            lines = line.split('\t')
            ID = lines[1]
            words = lines[4].replace("<s>", "").replace("</s>", "").split(" ")
            pC2 = 0
            pIKEA_IT = 0
            pIKEA_EN = 0
            for word in words:
                if word in frequencies:
                    pC2 += math.log(frequencies[word][0])
                    pIKEA_EN += math.log(frequencies[word][1])
                    pIKEA_IT += math.log(frequencies[word][2])
                else:
                    pC2 += math.log(0.5)
                    pIKEA_EN += math.log(0.5)
                    pIKEA_IT += math.log(0.5)
            b = [pC2 + math.log(priors[0]),
                 pIKEA_EN + math.log(priors[1]),
                 pIKEA_IT + math.log(priors[2])]
            # other possibilities
            # d = [pC2, pIKEA_EN, pIKEA_IT]  # without priors
            # c = [-pC2*priors[0], -pIKEA_EN*priors[1], -pIKEA_IT*priors[2]]  # multiplying by priors
            proposal = a[b.index(max(b))]
            f.write(proposal + "\t" + ID + "\n")
            # calculate precision, recall, f1:
            # count true positives, false positives, false negatives
            print proposal
            if ID == proposal:
                if ID == "C2":
                    tpC2 += 1
                elif ID == "IKEA_EN":
                    tpIKEA_EN += 1
                elif ID == "IKEA_IT":
                    tpIKEA_IT += 1
                correct += 1
            else:
                if ID == "C2":
                    fnC2 += 1
                elif ID == "IKEA_EN":
                    fnIKEA_EN += 1
                elif ID == "IKEA_IT":
                    fnIKEA_IT += 1
                if proposal == "C2":
                    fpC2 += 1
                elif proposal == "IKEA_EN":
                    fpIKEA_EN += 1
                elif proposal == "IKEA_IT":
                    fpIKEA_IT += 1
            number += 1
    print fnC2
    # float() guards against Python 2 integer division truncating the metrics
    precisionC2 = float(tpC2) / (tpC2 + fpC2)
    precisionIKEA_IT = float(tpIKEA_IT) / (tpIKEA_IT + fpIKEA_IT)
    precisionIKEA_EN = float(tpIKEA_EN) / (tpIKEA_EN + fpIKEA_EN)
    precisions = [precisionC2, precisionIKEA_EN, precisionIKEA_IT]
    recallC2 = float(tpC2) / (tpC2 + fnC2)
    recallIKEA_IT = float(tpIKEA_IT) / (tpIKEA_IT + fnIKEA_IT)
    recallIKEA_EN = float(tpIKEA_EN) / (tpIKEA_EN + fnIKEA_EN)
    recalls = [recallC2, recallIKEA_EN, recallIKEA_IT]
    avgpre = sum(precisions) / 3
    avgrec = sum(recalls) / 3
    f.write("\n\ncorrect: " + str(correct) + " out of " + str(number))
    f.write("\nprecision: " + str(avgpre))
    f.write("\nrecall: " + str(avgrec))
    f.write("\nF1: " + str(2 * ((avgpre * avgrec) / (avgpre + avgrec))))
    f.close()
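# The macro-averaged metrics computed above, condensed into one helper; a
# sketch with per-class counts ordered as [C2, IKEA_EN, IKEA_IT].
def macro_f1(tp, fp, fn):
    precisions = [float(t) / (t + p) for t, p in zip(tp, fp)]
    recalls = [float(t) / (t + n) for t, n in zip(tp, fn)]
    avgpre = sum(precisions) / len(precisions)
    avgrec = sum(recalls) / len(recalls)
    return 2 * avgpre * avgrec / (avgpre + avgrec)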
__author__ = 'hafiz'
import sys
import shutil
from FeatureExtractor import *
from RF import *
from subactivities import *

if __name__ == '__main__':
    fnam = "collection7-30"
    activity_name = get_activities()
    fobj = FeatureExtractor()
    framelenghts = [200, 500, 1000, 1500, 2000, 2500, 3000, 3500, 4000, 4500, 5000]
    fp = open("Data/" + fnam + "/" + activity_name[0] + ".csv", 'w')
    for fln in framelenghts:
        sys.argv = ["FeatureExtractor.py", "Data/" + fnam + "/train", "train"]
        stp = int(fln / 2)
        fobj.set_frameslength(fln, stp)
        fobj.main()
        sys.argv = ["python FeatureExtractor.py", "Data/" + fnam + "/eval", "test"]
        fobj.main()
        rfObj = RFClassifier()
        # rfObj.main()
        r = str(fln)
        rs = []
        rs = rfObj.main_fusion()
        for a in rs:
            r = r + "," + str(a)
def predictor(model, authorId, summary):
    featureExtractor = FeatureExtractor()
    featureExtractor.authorFinder(authorId)
    featureExtractor.similarityFinder(summary)
    return model.predict(featureExtractor.X)
def splitFile(md, splitBySentence):
    rf = open(md.filename, "r")
    # skip first 6 lines since they aren't important
    talk = "\n".join(rf.readlines()[6:])
    # remove the Audio: Laughing and the applause
    talk = talk.replace("(Applause)", "")
    talk = talk.replace("(Audio: Laughing)", "")
    talk = talk.replace("-- (Laughter) --", " (Laughter) ")
    talk = talk.replace("-- (Laughter) ", " (Laughter) ")
    # remove hyphens for clarity
    talk = talk.replace("-", " ")
    # grab the frequency distribution
    talkFrequency = nltk.FreqDist(word_tokenize(talk.lower()))
    hapaxes = talkFrequency.hapaxes()
    hapaxes = [w for w in hapaxes if w.isalpha()]
    chunks = []
    if splitBySentence:
        chunks = sent_tokenize(talk)  # the talk turned into sentences
    else:
        chunks = talk.split("\n")  # the talk is split by paragraph
    passedChunks = []  # the pure (laughs removed) chunks passed
    passedWords = [["TS", "TS", "TS"]]  # prev chunks broken into stemmed/CC wds
    passedPOS = []  # passed chunks broken into POS
    numChunks = len(chunks)  # the number of chunks in the talk
    chunksSinceLastLaugh = 0  # the number of chunks since last laugh
    laughCount = 0  # the laughs counted so far
    positives = []  # all of the positives
    negatives = []  # all of the negatives
    previousSentiment = {"Polarity": 0}  # the previous chunk's sentiment
    for i in range(numChunks):
        # create a FeatureCollection for each chunk
        features = FeatureCollection.FeatureCollection(md.name)
        # if there is laughter in the chunk and it's not at the beginning OR
        # it is at the start of the next (if there is one) chunk
        if ("(Laughter)" in chunks[i][3:]) or (i != numChunks - 1 and ("(Laughter)" in chunks[i + 1][:12])):
            features.positive = True
        else:
            features.positive = False
        # remove the laughter from the chunk
        curChunk = chunks[i].replace("(Laughter)", "")
        features.chunk = curChunk
        # end here if there is nothing in the chunk besides laughter
        if len(curChunk) > 1:
            # increase the distance since the last laugh
            chunksSinceLastLaugh += 1
            # float() guards against Python 2 integer division
            features.features["depth"] = float(i) / numChunks  # depth
            features.features["laughsUntilNow"] = laughCount  # laugh count before this
            features.features["chunksSinceLaugh"] = chunksSinceLastLaugh  # chunks since last laugh
            # put this chunk at the end of the passed chunks list
            passedChunks.append(curChunk)
            # get sentiment
            features.sentimentFeats = FeatureExtractor.getSentiment(curChunk, previousSentiment)
            previousSentiment = features.sentimentFeats
            features.sentimentFeats["swearing"] = False
            # analyze sentence structure
            # (maxD, maxST, avgDepth, avgSubTrees) = getSubtreeFeatures(curChunk)
            # features["max_depth"] = maxD
            # features["max_subtree"] = maxST
            # features["avg_depth"] = avgDepth
            # features["avg_subtree"] = avgSubTrees
            # get parts-of-speech features
            words = word_tokenize(curChunk)
            (_, pos) = FeatureExtractor.getPOS(words)
            passedPOS.append(pos)
            features.POS = pos
            # get length of chunk
            features.features["length"] = len(words)
            # replace quantities and years and some person names
            entity_chunk = entity_recognition(curChunk)
            features.features["statistic_count"] = entity_chunk.count("__Statistic__")
            # tokenize the words
            curChunk = curChunk.lower()
            words = word_tokenize(curChunk)
            talklen = len(words)
            # store the word vector
            if splitBySentence:
                features.wordVector = [[w for w in words if w not in stop and w.isalpha()]]
            else:
                for s in sent_tokenize(curChunk):
                    wordlist = [w for w in word_tokenize(s) if w not in stop and w.isalpha()]
                    features.wordVector.append(wordlist)
            # set the last 3 words
            features.prev3Words = passedWords[-1][-3:]
            # case collapse and stem words, get the variance, and hapax count
            variousWords = {}
            numHapax = 0
            for j in range(talklen):
                # move following line below if variance by stemmed words
                variousWords[words[j]] = True
                if words[j] in hapaxes:
                    numHapax += 1
                if words[j] in swears:
                    features.sentimentFeats["swearing"] = True
                    # words[j] = "SWEARWORD"  # seems to have lowered accuracy
                words[j] = stemmer.stem(words[j])
            features.features["hapax_count"] = numHapax
            # calculate word variance
            if talklen > 0:
                features.wordVariance = float(len(variousWords)) / talklen
            else:
                features.wordVariance = 0
            # get words as features
            passedWords.append(words + ["EOS"])
            features.words = [w for w in words if w not in stop]  # word_tokenize(curChunk)
            # if the chunk was positive then we need to:
            # reset the distance since last laugh and increment laugh count
            if features.positive:
                laughCount += 1
                chunksSinceLastLaugh = 0
                positives.append(features)
            else:
                negatives.append(features)
    rf.close()
    return [positives, negatives]
Example: python BatchFeatureExtractor.py -i semcor/semcor3.0 -o semcor_features -f l-3_l+1_t-1_t+2_tw_twt''')
parser.add_argument('-i', '--input', help='Input file name', required=True)
parser.add_argument('-o', '--output', help='Output file name', required=True)
parser.add_argument('-f', '--features', help='Feature names', required=True)
args = parser.parse_args()

## show values ##
srcdir = args.input
srcdirpath = os.path.abspath(srcdir)
dstdir = args.output
dstdirpath = os.path.abspath(dstdir)
featurenames = args.features
retout = ""
if not os.path.exists(dstdirpath):
    os.makedirs(dstdirpath)
    if os.path.exists(srcdirpath):
        retout = __process__(srcdirpath)
    else:
        print("The source directory does not exist!")
else:
    print("The destination directory already exists!")
print("Extracting the features ...")
FeatureExtractor.__featureextractor__(retout, dstdirpath, featurenames)
from FeatureExtractor import *
from QClassifier import *
from numpy import asarray

if __name__ == '__main__':
    logging.basicConfig(level=logging.DEBUG,
                        format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
                        datefmt='%a, %d %b %Y %H:%M:%S',
                        filename='qclassifier.log',
                        filemode='w')
    reload(sys)
    sys.setdefaultencoding('utf8')

    logging.info('start to extract features')
    extractor = FeatureExtractor()
    extractor.load(path='../data/pair.xml')
    features = extractor.extract_features()
    labels = extractor.get_labels()
    assert(len(labels) == len(features))

    logging.info('split data into training data & test data')
    train_percentage = 0.8
    mid = int(len(features) * (1 - train_percentage))
    test_x, train_x = features[:mid], features[mid:]
    test_y, train_y = labels[:mid], labels[mid:]
    vectorizer = FeatureHasher(input_type='string', non_negative=True)
    train_x = vectorizer.transform(train_x)
    test_x = vectorizer.transform(test_x)
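    # A hedged continuation of the script above: fit a linear SVM on the
    # hashed features and log test accuracy; these are standard scikit-learn
    # calls, and nothing about QClassifier's internals is assumed.
    from sklearn.svm import LinearSVC

    clf = LinearSVC()
    clf.fit(train_x, train_y)
    logging.info('test accuracy: %f' % clf.score(test_x, test_y))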