def generate_positives(frame,
                       bbox,
                       max_shift=0.08,
                       max_scale=0.08,
                       nparray=True):
    bbox = (int(bbox[0]), int(bbox[1]), int(bbox[2]), int(bbox[3]))
    positives = []
    # roi = frame[bbox[1]:bbox[1]+bbox[3],bbox[0]:bbox[0]+bbox[2]]
    positives.append(FeatureExtractor.extract_hog_hsv(frame, bbox))
    # shift in 4 directions
    shift_samples = bboxShift(frame, bbox, max_shift=max_shift)
    for newbox in shift_samples:
        positives.append(FeatureExtractor.extract_hog_hsv(frame, newbox))
    # flip horizontally
    roi = frame[bbox[1]:bbox[1] + bbox[3], bbox[0]:bbox[0] + bbox[2]]
    positives.append(FeatureExtractor.extract_hog_hsv(cv2.flip(roi, 1)))
    positives.append(
        FeatureExtractor.extract_hog_hsv(cv2.GaussianBlur(roi, (5, 5), 15)))
    # 3 scaled samples
    scale_samples = bboxScale(frame, bbox, max_scale=max_scale)
    for newbox in scale_samples:
        positives.append(FeatureExtractor.extract_hog_hsv(frame, newbox))
    if nparray:
        return np.array(positives)
    else:
        return positives
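
A minimal usage sketch for generate_positives (the image path and bbox values below are hypothetical; FeatureExtractor, bboxShift and bboxScale are assumed to be importable from the same module as the snippet above):

import cv2

frame = cv2.imread("frames/frame_0001.jpg")  # hypothetical annotated frame (BGR)
bbox = (120, 80, 64, 128)                    # (x, y, w, h) around the tracked object
positives = generate_positives(frame, bbox, max_shift=0.08, max_scale=0.08)
print(positives.shape)                       # one HOG+HSV feature vector per augmented crop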
Example #2
def extractFeatures():
    samples = DataSample.objects.filter(datafeature=None)
    fs = FileSystemStorage()
    for i in samples:
        ftype = i.dataType
        print ftype
        if ftype == 'audio':
            [mfcc_path, diff_path,
             stft_path] = fe.extractAudioFeatures(fs.path(i.dataFile.name))
            with open(mfcc_path) as ff:
                f1 = DataFeature(dataSample=i,
                                 featureType='audio-mfcc',
                                 featureFile=File(ff))
                f1.save()
            with open(diff_path) as ff:
                f1 = DataFeature(dataSample=i,
                                 featureType='audio-mfcc-diff',
                                 featureFile=File(ff))
                f1.save()
            with open(stft_path) as ff:
                f2 = DataFeature(dataSample=i,
                                 featureType='audio-stft',
                                 featureFile=File(ff))
                f2.save()
        if ftype == 'image':
            file_paths = fe.extractImageFeatures([fs.path(i.dataFile.name)])
Example #3
 def __init__(self):
     self.wi = WebInterface()
     self.fe = FeatureExtractor("")
     self.price = ''
     self.purl = ''
     self.pid = 0
     self.modeset = 'accurate'
Example #4
    def __init__(self, featureList, parameters):
        self.featureList = featureList
        self.parameters = parameters
        self.svc = None
        self.scaler = None

        self.featureExtractor = FeatureExtractor(self.parameters,
                                                 self.featureList)

        self.carData = None
        self.notCarData = None
        self.carPath = [
            "training_images\\vehicles\\vehicles\\GTI_Far",
            "training_images\\vehicles\\vehicles\\GTI_Left",
            "training_images\\vehicles\\vehicles\\GTI_MiddleClose",
            "training_images\\vehicles\\vehicles\\GTI_Right",
            "training_images\\vehicles\\vehicles\\KITTI_extracted"
        ]
        self.notCarPath = [
            "training_images\\non-vehicles\\non-vehicles\\Extras",
            "training_images\\non-vehicles\\non-vehicles\\GTI",
        ]

        self.carFeatures = None
        self.notCarFeatures = None
Example #5
 def run_extra_feature_extractor(self):
     config = self.CONFIG
     body_text_dir = config['body_text_dir']
     extra_feature_dir = config['extra_feature_dir']
     print 'begin run extra feature extractor from %s' % body_text_dir
     print 'Save extra features to %s' % extra_feature_dir
     FeatureExtractor.run_extra(body_text_dir, extra_feature_dir, config)
     print 'finish save extra features to %s' % extra_feature_dir
     return None
Example #6
 def run_feature_extractor(self):
     config = self.CONFIG
     body_text_dir = config['body_text_dir']
     feature_dir = config['feature_dir']
     print 'begin run feature extractor from %s' % body_text_dir
     print 'Save features to %s' % feature_dir
     FeatureExtractor.run(body_text_dir, feature_dir)
     print 'finish save features to %s' % feature_dir
     return None
Example #7
    def doClassification(self):

        if len(self.testData) > 0:
            print(__name__ + '\tTesting Data Shape : ' +
                  str(self.testData.shape))
            self.testData = FeatureExtractor.preprocessData(self.testData)
            self.testFeatures = FeatureExtractor.getFeatures(self.testData)
            print(__name__ + '\tTest Features extracted')
            self.classify()
Example #8
 def run_feature_extractor(self):
     config = self.CONFIG
     body_text_dir = config['body_text_dir']
     feature_dir = config['feature_dir']
     print 'begin run feature extractor from %s' % body_text_dir
     print 'Save features to %s' % feature_dir
     FeatureExtractor.run(body_text_dir, feature_dir)
     print 'finish save features to %s' % feature_dir
     return None
Example #9
def extractFeaturesML(db_name, DIR):
    key_users = ['UserId']
    key_posts = ['PostId']
    # extracts all features
    extractor.extractForML(db_name, DIR)
    dfML = mf.mergeAll(DIR, key_users, key_posts, FeaturePredict)
    dfML.to_csv(DIR + 'dataML.csv', index=False)
    dfnorm = dp.normalize(dfML)
    return dfnorm
Example #10
 def run_extra_feature_extractor(self):
     config = self.CONFIG
     body_text_dir = config['body_text_dir']
     extra_feature_dir = config['extra_feature_dir']
     print 'begin run extra feature extractor from %s' % body_text_dir
     print 'Save extra features to %s' % extra_feature_dir
     FeatureExtractor.run_extra(body_text_dir, extra_feature_dir, config)
     print 'finish save extra features to %s' % extra_feature_dir
     return None
Example #11
def extractFeaturesML(db_name, DIR):
    key_users = ['UserId']
    key_posts = ['PostId']
    # extracts all features
    extractor.extractForML(db_name, DIR)
    dfML = mf.mergeAll(DIR, key_users, key_posts, FeaturePredict)
    dfML.to_csv(DIR + 'dataML.csv', index=False)
    dfnorm = dp.normalize(dfML)
    return dfnorm
Example #12
    def __init__(self, featureList, parameters, svc, scaler):
        self.featureList = featureList
        self.parameters = parameters
        self.svc = svc
        self.scaler = scaler

        self.featureExtractor = FeatureExtractor(self.parameters,
                                                 self.featureList)
        self.heatmapHistory = collections.deque(
            maxlen=parameters.SearchSettings.averages)
Example #13
def extractLayerFeat(idx_ls, scale_size=224):
    extractor = FeatureExtractor(cache_folder=model_cache_folder,
                                 which_net='vgg16',
                                 which_layer=VC['layer'],
                                 which_snapshot=0)
    '''
    assert(os.path.isfile(Dict['Dictionary']))
    with open(Dict['Dictionary'], 'rb') as fh:
        _,centers,_ = pickle.load(fh)
    '''
    for ii in idx_ls:
        file_list = Dataset['file_list'].format(ii)
        with open(file_list, 'r') as fh:
            content = fh.readlines()

        img_list = [x.strip() for x in content]
        img_num = len(img_list)
        print('total number of images for idx {1}: {0}'.format(img_num, ii))

        img_list = img_list[0:3000]
        img_num = len(img_list)
        print('used number of images for idx {1}: {0}'.format(img_num, ii))

        feat_set = [None for nn in range(img_num)]
        for nn in range(img_num):
            file_img = os.path.join(Dataset['img_dir'], img_list[nn])
            assert (os.path.isfile(file_img))
            img = cv2.imread(file_img)
            patch = myresize(img, scale_size, 'short')

            layer_feature = extractor.extract_feature_image(patch)[0]
            assert (featDim == layer_feature.shape[2])
            feat_set[nn] = layer_feature
            '''
            iheight, iwidth = layer_feature.shape[0:2]
            layer_feature = layer_feature.reshape(-1, featDim)
            feat_norm = np.sqrt(np.sum(layer_feature**2, 1)).reshape(-1,1)
            layer_feature = layer_feature/feat_norm
            
            dist = cdist(layer_feature, centers, 'cosine').reshape(iheight,iwidth,-1)
            assert(dist.shape[2]==centers.shape[0]);
            r_set[nn] = dist
            '''
            if nn % 100 == 0:
                print(nn, end=' ')
                sys.stdout.flush()

        print('\n')

        file_cache_feat = os.path.join(
            Feat['cache_dir'], '{0}_feat_{1}.pickle'.format(VC['layer'], ii))
        with open(file_cache_feat, 'wb') as fh:
            pickle.dump(feat_set, fh)
Example #14
def append_vectors_to_t(t):
    #Get file names
    articlesDirectory = "../Articles/"
    fileNames = FileUtil.getFileNames(articlesDirectory)

    #Create feature Vectors
    numOfFeatureVectors = len(fileNames)
    for file in fileNames:
        # fileArg is set to directory of each files
        # fVector is feature vector of each file then it is written to file
        fileArg = articlesDirectory + file
        fVector = FeatureExtractor.createFeatureVector(fileArg)
        fVectorNormalized = FeatureExtractor.normalize(fVector)
        t.append(fVectorNormalized) #Append feature vectors to t.
Example #15
def compute_features(config, mp=False):
    import FeatureExtractor
    import movieqa_importer
    feature_extractor = FeatureExtractor.FeatureExtractor(config['video_features'])
    mqa = movieqa_importer.MovieQA.DataLoader()
    vl_qa, _ = mqa.get_video_list('full', 'all_clips')
    desc = config['video_features']
    document_type = 'video_clips'
    check_save_directory(filename=utils.DOC_DESC_TEMPLATE %(document_type, desc, ''))
    videos = []
    outputs = []
    for i, imdb_key in enumerate(vl_qa.keys()):
        current_videos = map(lambda x: os.path.join('/cs/vml4/smuralid/datasets/movieqa/MovieQA_benchmark','story', document_type, imdb_key, x), vl_qa[imdb_key])
        videos.extend(current_videos)
        output = map(lambda x: os.path.join((utils.DOC_DESC_TEMPLATE % (document_type, desc, imdb_key))[:-4], x.replace('.mp4', '')), vl_qa[imdb_key])
        outputs.extend(output)
    if mp:
        inputs = [[x, y] for x,y in zip(videos, outputs)]
        from random import shuffle
        shuffle(inputs)
        #pool = Pool(mp)
        #pool.map(feature_extractor.extract_features, inputs)
        for x in inputs: feature_extractor.extract_features(x)
    else:
        for video, output in zip(videos, outputs):
            feature_extractor.extract_features([video, output])
Example #16
def init_static_dialog_agent(args) :
    print "reading in Ontology"
    ont = Ontology.Ontology(sys.argv[1])
    print "predicates: " + str(ont.preds)
    print "types: " + str(ont.types)
    print "entries: " + str(ont.entries)

    print "reading in Lexicon"
    lex = Lexicon.Lexicon(ont, sys.argv[2])
    print "surface forms: " + str(lex.surface_forms)
    print "categories: " + str(lex.categories)
    print "semantic forms: " + str(lex.semantic_forms)
    print "entries: " + str(lex.entries)

    print "instantiating Feature Extractor"
    f_extractor = FeatureExtractor.FeatureExtractor(ont, lex)

    print "instantiating Linear Learner"
    learner = LinearLearner.LinearLearner(ont, lex, f_extractor)

    print "instantiating KBGrounder"
    grounder = KBGrounder.KBGrounder(ont)

    load_parser_from_file = False
    if len(args) > 4 :
        if args[4].lower() == 'true' :
            load_parser_from_file = True
            
    if load_parser_from_file :
        parser = load_model('static_parser')
        grounder.parser = parser
        grounder.ontology = parser.ontology
    else :
        print "instantiating Parser"
        parser = Parser.Parser(ont, lex, learner, grounder, beam_width=10, safety=True)

    print "instantiating Generator"
    generator = Generator.Generator(ont, lex, learner, parser, beam_width=sys.maxint, safety=True)

    print "instantiating DialogAgent"
    static_policy = StaticDialogPolicy.StaticDialogPolicy()
    A = StaticDialogAgent(parser, generator, grounder, static_policy, None, None)

    if not load_parser_from_file :
        print "reading in training data"
        D = A.read_in_utterance_action_pairs(args[3])

        if len(args) > 4 and args[4] == "both":
            print "training parser and generator jointly from actions"
            converged = A.jointly_train_parser_and_generator_from_utterance_action_pairs(
                D, epochs=10, parse_beam=30, generator_beam=10)
        else:
            print "training parser from actions"
            converged = A.train_parser_from_utterance_action_pairs(
                D, epochs=10, parse_beam=30)

        print "theta: "+str(parser.learner.theta)
        save_model(parser, 'static_parser')
    
    return A
Example #17
 def rate(self, board):
     #   neural network will go here
     if self.net is None:
         return random.random()
     else:
         return self.net.predict([np.array(fe.extract_features(board))])[0, 0]
Example #18
def positive_score(frame, bbox, classifier='svm'):
    features = FeatureExtractor.extract_hog_hsv(frame, bbox)
    if classifier == 'adaboost':
        return bdt.score(np.array([features]), [0])
    #return clf.score(np.array([features]), [0]
    score = clf.predict_proba([features])
    #print(score)
    return score[0][0]
Example #19
def upload_agent():
    if request.method == 'POST':
        
        #filez = request.files['file']
        #print(filez.filename, file=sys.stderr)
        #extension = os.path.splitext(filez.filename)[1]
        #f_name = str(uuid.uuid4()) + extension
        #filez.save(os.path.join('./', f_name))
        #print(request.data)
        #print(request.form)
        #print(request.args)
        
        data_dict = dict(request.form)
        #print(data_dict.keys())
        
        #terrible!
        jdata = data_dict['data'] #this is what we get in here a form with data
        myjson = jdata[0]
        #print(type(myjson))
        mydata = json.loads(myjson)['data'][0] 
        #print(mydata[0])
        
        #save the data...
        outputfilename = "./temp_agent.wav"
        with open(outputfilename, 'wb') as output:
            output.write(bytearray(map(lambda x: chr(x % 256), mydata)))
        
        #the emotion detector object to be used.
        global model
        global Rescaler
        
        file_features = FeatureExtractor.extract_features(outputfilename)
        file_features = Rescaler.transform(file_features)
        
        prediction= model.predict(file_features)
        #TYPO IN HAPPINESS !!!
        body={"mode":"sync",  "messageType":"70281a5b78eba98c2e2c", "messages":[{"Anger":round(prediction[0][0],2), "Disgust":round(prediction[0][1],2), "Fear":round(prediction[0][2],2), "Hapiness":round(prediction[0][3],2),"Neutral":round(prediction[0][4],2),  "Sadness":round(prediction[0][5],2), "Surprise":round(prediction[0][6],2) }]}
    
        #body='{"mode":"sync", "messageType":"70281a5b78eba98c2e2c", "messages":[{"Anger":0.0, "Disgust":0.0, "Fear":0.0, "Hapiness":0.1,"Neutral":0.1,  "Sadness":0.1, "Surprise":0.2 }]}'
        print('AGENT EMOTIONS')
        '''print(body)
        
        try:
            r = http.urlopen('POST', url, body=str(body), headers=headers)# for the engineer coming after me, screw you !
            print(r.status)
            print(r.data)
        
        except urllib3.exceptions.SSLError as e:
            print (e)
        
        print(prediction)
        '''
        
        #thread = SubmitterTDR(1,body)
        #thread.start()
        
        #an array of probabilities...
        return json.dumps({'predictions': prediction.tolist()[0]})
Example #20
def extractFeatures():
    emails = fe.read_from_disk(INBOX_DIRECTORY, 2)
    inboxEmails = []
    # separate sent and inbox emails
    for email in emails:
        email = fe.parseEmail(email)
        inboxEmails.append(email)
    inboxEmailFeatures = []
    output = []

    # find numerical features
    for email in inboxEmails:
        features = {}
        features["sender_frequency"] = fe.senderFrequency(
            email["From"], inboxEmails)
        features["is_automated_mail"] = fe.words_present(
            email["emailText"], fe.negative_words)
        if(email["Subject"]):
            features["is_interrogative_text"] = fe.words_present(
                email["emailText"] + email["Subject"], fe.interrogative_words)
        else:
            features["is_interrogative_text"] = fe.words_present(
                email["emailText"], fe.interrogative_words)
        features["Cc"] = email["Cc"]
        # features["reply"] = replied(email["Message-ID"],sentEmails)
        featureslist = fe.dictList(features)
        inboxEmailFeatures.append(featureslist)
        # print("Mail Text: ", email['emailText'])
    return inboxEmailFeatures, email['emailText']
Example #21
def test(doc, name):

	f= open(name,"w")

	frequencies0 = FeatureExtractor.frequency(doc[:2],True,True) #frequency count smoothed by 1
	frequencies1 = FeatureExtractor.augmented_frequency(frequencies0) # augmented frequencies taking into account document size
	frequencies = FeatureExtractor.idf(frequencies1) # idfs
	total = frequencies["**prob**"]
	totals = sum(total)

	priors =[0.33, 0.33, 0.329] # based on number of documents

	a = ["C2","IKEA_EN","IKEA_IT"]


	with open(doc[2],"r") as mefile:

		for line in mefile:
			lines = line.split('\t')
			ID = lines[0]
			words = lines[4].replace("<s>","").replace("</s>","").split(" ")
			pC2 = 0
			pIKEA_IT = 0
			pIKEA_EN = 0
			for word in words:
				
				if word in frequencies:
					pC2 += math.log((frequencies[word][0]))
					pIKEA_EN += math.log((frequencies[word][1]))
					pIKEA_IT += math.log((frequencies[word][2]))

				else:
					pC2 += math.log(0.5)
					pIKEA_EN += math.log(0.5)
					pIKEA_IT += math.log(0.5)
			
			b = [pC2+math.log(priors[0]),pIKEA_EN+math.log(priors[1]),pIKEA_IT+math.log(priors[2])]
			
			

			proposal = a[b.index(max(b))]
			f.write(ID+ "\t" + proposal + "\n")


	f.close()
Example #22
class PriceCheck(object):
    def __init__(self):
        self.wi = WebInterface()
        self.fe = FeatureExtractor("")
        self.price = ''
        self.purl = ''
        self.pid = 0
        self.modeset = 'accurate'

    def set_new_mode(self, new_modeset):
        self.modeset = new_modeset

    def reinit(self, purl, pid):
        self.purl = purl
        self.pid = pid

    def deinit(self):
        self.fe.deinit()

    def call_wi_to_update_price(self):
        if self.price:
            self.wi.price_update(self.pid, self.price)

    def get_price(self):
        self.fe.reinit(self.purl)
        self.fe.set_mode(self.modeset)  #'accurate')
        self.fe.run()
        self.price = self.fe.price
        self.sale_price = self.fe.sale_price
        self.regular_price = self.fe.regular_price
        self.was_price = self.fe.was_price

    def single_run(self):
        products = self.wi.get_prod_list_for_price_update()
        if not products:
            print "No more products to check price for."
            return
        for prod in products:
            if 'Product' in prod:
                purl = prod['Product']['purl']
                pid = prod['Product']['id']
                print "Getting new price for : " + purl
                print "Pid : " + pid
                self.reinit(purl, pid)
                self.get_price()
                print "New price :" + str(self.price)
                self.call_wi_to_update_price()
                print "---"

    def run(self):
        while True:
            self.single_run()
            print "...zzz...5 mins...zzz..."
            time.sleep(300)
Example #23
def getAverageVector(author):
    x = []

    articlesDirectory = "../Articles/"
    fileNames = FileUtil.getFileNames(articlesDirectory)

    for file in fileNames:
        if (author in file):
            fileArg = articlesDirectory + file
            fVector = FeatureExtractor.createFeatureVector(fileArg)
            fVectorNormalized = FeatureExtractor.normalize(fVector)
            x.append(fVector)

    avgVector = []
    for i in range(len(x[0])):
        # reset the running sum per feature dimension and average over the samples
        sumElement = 0
        for j in range(len(x)):
            sumElement += x[j][i]
        avgVector.append(sumElement / len(x))
    return avgVector
Example #24
def predict(frame, bbox, classifier="svm"):
    bbox = (int(bbox[0]), int(bbox[1]), int(bbox[2]), int(bbox[3]))
    features = FeatureExtractor.extract_hog_hsv(frame, bbox)
    if classifier == 'adaboost':
        pred = bdt.predict(np.array([features]))
    else:
        pred = clf.predict(np.array([features]))
    #print(pred)
    if pred[0] == 0:
        return True
    return False
Example #25
def run_BOW_baseline(args):
    print(args)
    train, dev, test = load_datasets(args)
    classnames = list(set(map(lambda tweet: tweet.label, train)))
    train_y = map(lambda tweet: classnames.index(tweet.label), train)
    dev_y = map(lambda tweet: classnames.index(tweet.label), dev)
    test_y = map(lambda tweet: classnames.index(tweet.label), test)
    # fx = FeatureExtractor(["BOW"], stopwords=args.stopwords)
    fx = FeatureExtractor(["hand-coded"], stopwords=args.stopwords)
    fx.build_vocab(train)
    train_x = np.asarray(map(lambda tweet: fx.process(tweet), train))
    check = train_x[0]
    print("sample fv shape: ", check.shape)
    dev_x = np.asarray(map(lambda tweet: fx.process(tweet), dev))
    test_x = np.asarray(map(lambda tweet: fx.process(tweet), test))
    nclasses = len(classnames)
    ntrain = train_x.shape[0]
    nbatches = 100
    batch_size = ntrain/nbatches
    train_data = (train_x, train_y)
    dev_data = (dev_x, dev_y)
    test_data = (test_x, test_y)
    neural_net.logistic_regression_optimization_sgd(train_data, dev_data, test_data, nclasses, batch_size=batch_size)
    print("train set performance:")
    train_ypred = neural_net.predict(train_x, train_y)
    print(evaluate.ConfusionMatrix(train_y, train_ypred, classnames))
    print("validation set performance:")
    dev_ypred = neural_net.predict(dev_x, dev_y)
    print(evaluate.ConfusionMatrix(dev_y, dev_ypred, classnames))
    print("test set performance:")
    test_ypred = neural_net.predict(test_x, test_y)
    print(evaluate.ConfusionMatrix(test_y, test_ypred, classnames))
Example #26
def init_pomdp_dialog_agent(args) :
    print "Reading in Ontology"
    ont = Ontology.Ontology(args[1])
    print "predicates: " + str(ont.preds)
    print "types: " + str(ont.types)
    print "entries: " + str(ont.entries)

    print "Reading in Lexicon"
    lex = Lexicon.Lexicon(ont, args[2])
    print "surface forms: " + str(lex.surface_forms)
    print "categories: " + str(lex.categories)
    print "semantic forms: " + str(lex.semantic_forms)
    print "entries: " + str(lex.entries)

    print "Instantiating Feature Extractor"
    f_extractor = FeatureExtractor.FeatureExtractor(ont, lex)

    print "Instantiating Linear Learner"
    learner = LinearLearner.LinearLearner(ont, lex, f_extractor)

    print "Instantiating KBGrounder"
    grounder = KBGrounder.KBGrounder(ont)

    load_models_from_file = False
    if len(args) > 4 :
        if args[4].lower() == 'true' :
            load_models_from_file = True

    if load_models_from_file :
        parser = load_model('pomdp_parser')
        grounder.parser = parser
        grounder.ontology = parser.ontology
    else :
        print "Instantiating Parser"
        parser = Parser.Parser(ont, lex, learner, grounder, beam_width=10)

    print "Instantiating DialogAgent"
    if load_models_from_file :
        agent = PomdpDialogAgent(parser, grounder, None, None, parse_depth=10, load_policy_from_file=True)
    else :
        agent = PomdpDialogAgent(parser, grounder, None, None, parse_depth=10, load_policy_from_file=False)

    if not load_models_from_file :
        print "reading in data and training parser from actions"
        D = agent.read_in_utterance_action_pairs(args[3])
        converged = agent.train_parser_from_utterance_action_pairs(D, epochs=10, parse_beam=30)
        print "theta: "+str(parser.learner.theta)
        save_model(parser, 'pomdp_parser')
        #print 'Parser ontology : ', parser.ontology.preds

    return agent
Example #27
def main():
    print("\nFetching Emails...\n")
    # gui.interfaceFetchEmails()
    username = "******"
    password = "******"
    # ef.login(username, password)

    print("Extracting Features...\n")
    clss = fe.extractFeatures()

    pr.login(username, password)
    status, msg = pr.predict(clss)
    print(msg)
    if (status):
        ts.pred_ans(msg)
Example #28
def run(is_changing_n=False, number_to_ngram=3, number_of_features=220, fs_method=FEATURE_SELECTION_MOST_COMMON,
        c_method=CLASSIFIER_ONE_CLASS_SVM):

    # set param for final calcs
    all_ans = []

    # Feature extraction
    out_0_file = Path(out_0_path)
    if is_changing_n:
        fe.export_to_csv_all_users(number_to_ngram)
    if not out_0_file.exists():
        fe.export_to_csv_all_users(number_to_ngram)

    for user_number in range(0, 5):
        # Feature selection
        FeatureSelection.select_features(number_of_features, fs_method, user_number)

        # Classifier
        ans = Classifier.classify(number_of_features, c_method)
        all_ans.append(ans)

    print("""
    ** FINAL SCORE : {} **
    """.format(sum(all_ans)/len(all_ans)))
Example #29
def run():
    ''' Establish directory paths for training and validation data '''
    ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
    sunny_train_dirpath = ROOT_DIR + '/data/c_sunny/'
    overcast_train_dirpath = ROOT_DIR + '/data/c_overcast/'
    sunny_test_dirpath = ROOT_DIR + '/data/3000_images_test/c_sunny/'
    overcast_test_dirpath = ROOT_DIR + '/data/3000_images_test/c_overcast/'
    ''' Perform feature extraction on training data '''
    sunny_train_c = [f for f in os.listdir(sunny_train_dirpath)]
    overcast_train_c = [f for f in os.listdir(overcast_train_dirpath)]
    train_feats = extractor.get_hog_hist_features(sunny_train_c,
                                                  overcast_train_c,
                                                  sunny_train_dirpath,
                                                  overcast_train_dirpath)
    ''' Perform feature extraction on validation data '''
    sunny_test_c = [f for f in os.listdir(sunny_test_dirpath)]
    overcast_test_c = [f for f in os.listdir(overcast_test_dirpath)]
    test_feats = extractor.get_hog_hist_features(sunny_test_c, overcast_test_c,
                                                 sunny_test_dirpath,
                                                 overcast_test_dirpath)
    ''' Naive Bayes Classifier '''
    train_feats = shuffle(train_feats)
    test_feats = shuffle(test_feats)
    X_train = train_feats.iloc[:, :len(train_feats.columns) - 1]
    y_train = train_feats['label']
    X_test = test_feats.iloc[:, :len(test_feats.columns) - 1]
    y_test = test_feats['label']
    clf_nb = GaussianNB()
    model = clf_nb.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc_score = accuracy_score(y_test, y_pred, normalize=True)
    mat = confusion_matrix(y_test, y_pred)
    print(mat)
    print("Number of mislabeled points out of a total {0} points : {1}".format(
        len(X_test), (y_test != y_pred).sum()))
    print("Naive Bayes Classifier test accuracy = ", acc_score)
Example #30
def generate_sets():
    people = os.listdir('Datasets')
    del people[people.index('Test')]
    X = []
    Y = []
    for person in people:
        samples = os.listdir('Datasets//' + person)
        X += [
            FeatureExtractor.extract_features('Datasets//{}//{}'.format(
                person, sample))[0] for sample in samples
        ]
        Y += [int('Alex' in person) for _ in range(len(samples))]

    time.sleep(1)
    X, Y = shuffle(X, Y, random_state=0)
    X = np.array(X)
    return X, Y
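
A minimal sketch of feeding the output of generate_sets into a classifier; the scikit-learn SVM here is my own choice for illustration, not something the snippet above prescribes:

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

X, Y = generate_sets()  # feature vectors plus 1/0 labels (1 = 'Alex' samples)
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
clf = SVC(kernel="rbf", gamma="scale")
clf.fit(X_train, y_train)
print("held-out accuracy:", clf.score(X_test, y_test))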
Example #31
def train(trainingSet, subredditLabels, args):
    numIterations = 20
    eta = 0.05

    #dictionary of dictionaries (weights)
    weightDict = {}
    for label in subredditLabels:
        weightDict[label] = {}


    def gradLoss(phiX, w, y):
        score = util.dotProduct(w, phiX)
        margin = score * y
        if margin < 1:
            for name, feature in phiX.iteritems():
                phiX[name] = -1 * y * feature
            return phiX
        else:
            return 0


    for label in subredditLabels:
        trainingSet.seek(0)
        weightVector = weightDict[label]
        for i in range(numIterations):
            for example in trainingSet:
                example = json.loads(example)
                title = example['title']
                subreddit = example['subreddit']

                features = FeatureExtractor.extractFeatures(title, args)

                y = -1
                if label == subreddit:
                    y = 1

                grad = gradLoss(features, weightVector, y)

                if grad != 0:
                    util.increment(weightVector, -1 * eta, grad)
                    weightDict[label] = weightVector
                else:
                    weightDict[label] = weightVector

    return weightDict
Example #32
    def analyse(self, image, orig):
        image_features = FeatureExtractor.getFeatures(image)
        image_features = np.array([image_features])

        image_features = self.scaler.transform(image_features)
        confidence = np.amax(self.classifier.predict_proba(image_features))

        if confidence > 0.7:
            self.last_gesture = str([
                self.training_names[i]
                for i in self.classifier.predict(image_features)
            ][0])
            orig = self.writeLastGesture(orig)

        else:
            self.last_gesture = "Unsure"
            orig = self.writeLastGesture(orig)

        return image, orig, self.last_gesture
Example #33
def init_dialog_agent(args):
    print "Reading in Ontology"
    ont = Ontology.Ontology(args[1])
    print "predicates: " + str(ont.preds)
    print "types: " + str(ont.types)
    print "entries: " + str(ont.entries)

    print "Reading in Lexicon"
    lex = Lexicon.Lexicon(ont, args[2])
    print "surface forms: " + str(lex.surface_forms)
    print "categories: " + str(lex.categories)
    print "semantic forms: " + str(lex.semantic_forms)
    print "entries: " + str(lex.entries)

    print "Instantiating Feature Extractor"
    f_extractor = FeatureExtractor.FeatureExtractor(ont, lex)

    print "Instantiating Linear Learner"
    learner = LinearLearner.LinearLearner(ont, lex, f_extractor)

    print "Instantiating KBGrounder"
    grounder = KBGrounder.KBGrounder(ont)

    print "Instantiating Parser"
    parser = Parser.Parser(ont, lex, learner, grounder, beam_width=10)
    parser = load_model('parser')
    grounder.parser = parser
    grounder.ontology = parser.ontology

    print "Instantiating DialogAgent"
    agent = PomdpDialogAgent(parser, grounder, None, None)

    #print "reading in data and training parser from actions"
    #D = agent.read_in_utterance_action_pairs(args[3])
    #converged = agent.train_parser_from_utterance_action_pairs(D, epochs=10, parse_beam=30)
    #print "theta: "+str(parser.learner.theta)
    #save_model(parser, 'parser')
    #print 'Parser ontology : ', parser.ontology.preds

    return agent
Example #34
def predict(weights, testSet, args):
    correct = 0
    incorrect = 0
    total = 0
    for data in testSet:
        data = json.loads(data)
        title = data['title']
        subreddit = data['subreddit']
        features = FeatureExtractor.extractFeatures(title, args)
        maxScore = float('-inf')
        prediction = ''

        for key in weights.keys():
            weightVector = weights[key]

            score = util.dotProduct(weightVector, features)
            if score > maxScore:
                prediction = key
                maxScore = score

        if prediction == subreddit:
            correct += 1
        else:
            if args.verbose:
                try:
                    print title
                    print "predicted: " + prediction.encode('utf-8')
                    print features
                    printRelevantWeights(weights, features)
                    print "-----------------"


                except UnicodeEncodeError:
                    print "error"
            incorrect += 1
        total += 1


    print 'accuracy ' + str(float(correct) / total)
    print 'wrong ' + str(float(incorrect) / total)
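
The train and predict snippets in this listing appear to pair up; a minimal sketch of driving them together (the file names, subreddit labels and the args object are hypothetical, while util and FeatureExtractor are the modules the snippets already rely on):

class Args(object):
    verbose = False

labels = ["python", "askscience", "learnprogramming"]  # hypothetical subreddit labels

# one JSON object per line, each with 'title' and 'subreddit' fields
with open("reddit_train.jsonl") as trainingSet:
    weights = train(trainingSet, labels, Args())

with open("reddit_test.jsonl") as testSet:
    predict(weights, testSet, Args())  # prints accuracy and error rate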
Example #35
def run_word2vec_baseline(args):
    print(args)
    print('subtask id: %s' % args.subtask_id)
    train, dev, test = load_datasets(args)

    classnames = list(set(map(lambda tweet: tweet.label, train)))
    train_y = map(lambda tweet: classnames.index(tweet.label), train)
    dev_y = map(lambda tweet: classnames.index(tweet.label), dev)
    test_y = map(lambda tweet: classnames.index(tweet.label), test)

    fx = FeatureExtractor(["word2vec"], word2vec_model=args.word2vec_model)
    fx.build_vocab(train)
    train_x = np.asarray(map(lambda tweet: fx.process(tweet), train))
    check = train_x[0]
    print("sample fv shape: ", check.shape)
    dev_x = np.asarray(map(lambda tweet: fx.process(tweet), dev))
    test_x = np.asarray(map(lambda tweet: fx.process(tweet), test))

    nclasses = len(classnames)
    ntrain = train_x.shape[0]
    nbatches = 100
    batch_size = ntrain / nbatches
    train_data = (train_x, train_y)
    dev_data = (dev_x, dev_y)
    test_data = (test_x, test_y)

    neural_net.logistic_regression_optimization_sgd(train_data,
                                                    dev_data,
                                                    test_data,
                                                    nclasses,
                                                    batch_size=batch_size)

    print("train set performance:")
    train_ypred = neural_net.predict(train_x, train_y)
    print(evaluate.ConfusionMatrix(train_y, train_ypred, classnames))
    print("validation set performance:")
    dev_ypred = neural_net.predict(dev_x, dev_y)
    print(evaluate.ConfusionMatrix(dev_y, dev_ypred, classnames))
    print("test set performance:")
    test_ypred = neural_net.predict(test_x, test_y)
    print(evaluate.ConfusionMatrix(test_y, test_ypred, classnames))
Example #36
    def __init__(self, Device=1, Input=True, Channels=2, THRESHOLD=500,
                 CHUNK_SIZE=1024, FORMAT=pyaudio.paInt16, RATE=8000,
                 RECORD_SECONDS=3.25, WAVE_OUTPUT_FILENAME_EXTENSION=0,
                 EXPORT_FOLDER="Recordings", WAVE_OUTPUT_FILENAME="output",
                 BASELINE='baseline_mean_sd.pickle',
                 URL="http://localhost:50000/annotate"):
        self.myqueue= deque([])
        self.Input = Input
        self.Device = Device
        self.Channels=Channels
        self.URL = URL

        if (Input==False):
            self.Output = True
            self.Channels = 0

        self.THRESHOLD = THRESHOLD
        self.EXPORT_FOLDER = EXPORT_FOLDER
        self.CHUNK_SIZE = CHUNK_SIZE
        self.FORMAT = FORMAT
        self.RATE = RATE
        self.RECORD_SECONDS = RECORD_SECONDS
        self.WAVE_OUTPUT_FILENAME_EXTENSION = WAVE_OUTPUT_FILENAME_EXTENSION
        self.WAVE_OUTPUT_FILENAME = WAVE_OUTPUT_FILENAME
        self.q = Queue()
        self.lock = threading.Lock()
        self.stopRecordingEvent = None
        self.fe = FeatureExtractor.FeatureExtractor(BASELINE)
Example #37
def search(request):
    if request.POST:
        # print(request.POST['q'])
        query = request.POST['q']
        # print(query)

        # add the query term to the search history list
        res['history'].append(query)

        # run query recommendation
        result = FeatureExtractor.do_query(query)

        # get the recommended terms and their similarity scores
        similarity, recommend = zip(*result)
        # use the similarity of the TOP_K-th recommendation as the precision; fall back to the last one if fewer than TOP_K
        if len(similarity) >= FeatureExtractor.TOP_K:
            res['precise'] = similarity[FeatureExtractor.TOP_K - 1]
        else:
            res['precise'] = similarity[-1]
        res['sentiment'] = sentiment_analysis.getscore_recommend(recommend)
        res['recommend'] = recommend
    elif request.GET:
        pass
    return render(request, "search_post.html", res)
Example #38
def main():

    print "Loading previously stored HMM..."
    hmm = get_stored_hmm()
    #hmm = getHMM()
    print hmm.A
    print
    print hmm.B
    print hmm.pi
    print "HMM Created successfully"

    print "Loading previously stored RF model"
    RF = joblib.load('RFModel.pk')
    print "RF loaded successfully"
    
    # Feature Extractor
    FE = FeatureExtractor()
    FE.set_class(1)

    # Open socket and start listening for data
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    s.bind((HOST, PORT))
    s.listen(1)

    subactivities = get_subactivities()
    activities = get_activities()

    conn, addr = s.accept()
    while True:
        data = conn.recv(20000)
        #print data
        if data:
            #print data
            data = data.split('\n')

            # Remove extra newline
            del data[-1]

            csv_data = []
            for d in data:
                #print len(d)
                if len(d) >= 53:
                    d2 = d[14:].strip('"')
                    d3 = np.fromstring(d2, dtype=int, sep=',')
                    csv_data.append(d3.tolist()) 
            
            #print len(csv_data)
            if len(csv_data) > 0:
                csv_data = np.array(csv_data)
                #row,col = csv_data.shape
                #print row, col
                
                features = []
                for x in range(4):
                    f = FE.get_featurelist_from_nparray(csv_data, 2*x, 2*x + 1)
                    for y in f:
                        p = RF.predict(y[:25])[0]
                        if len(history[x]) >= HISTORY_LEN:
                            history[x].pop(0)
                            history_labels[x].pop(0)
                        history[x].append(p)
                        history_labels[x].append(subactivities[int(p)])
                
                for sub in subactivities:
                    print sub, get_subactivity_class(sub)
                

                for z in range(4):
                    hmm.mapB(history[z]) 
                    alpha = hmm.calculate_alpha(history[z])
                    #alpha_normalized = alpha
                    alpha_normalized = alpha.astype('float') / alpha.sum(axis=1)[:, np.newaxis]
                    
                    c = most_common(history[z])
                    
                    if len(history) > 12:
                        print z+1, history_labels[z][12:], subactivities[int(c)] 
                    else:
                        print z+1, history_labels[z][5:], subactivities[int(c)] 
                    print alpha_normalized[-1]
                    print

                print "\n--------------\n"
Example #39
from FeatureExtractor import *

'''
	run this script to perform feature extraction and store the vectors as files
'''

if __name__ == '__main__':

	ex = FeatureExtractor()

	ex.processAllDocs()
	print "Done processing documents"

	ex.build()
	ex.TFIDF()
	print "Done generating feature vectors"

	ex.saveAllVectors()
	ex.saveWordList()
	ex.saveFilenames()
	print "Features, wordList and filenames have been saved."
Example #40
    featureSetsToUse["statistics"] = True     # counting statistics
    featureSetsToUse["frequency"] = True      # frequency statistics
    featureSetsToUse["hapax"] = True          # hapax count


    wf = open("deleteme.txt", 'w')

    wf.writelines("Sentence   |||   proper noun percentage, word variance, prp, wordCount\n")
    wf.writelines("========================================================================\n\n")

    for data in positives:
        wf.writelines(data.chunk + "\n")
        # wf.writelines(str(data.features))

        # wf.writelines("\n")
        feats = FeatureExtractor.langFeatures(data, featureSetsToUse)

        for i in feats.keys():
            wf.writelines(i + " ")
            wf.writelines(str(feats[i]))
            wf.writelines("   ")

        wf.writelines("\n")

    wf.writelines("========================================================================\n")

    for data in negatives:
        wf.writelines(data.chunk + "\n")
        # wf.writelines(str(data.features))
        # wf.writelines("\n")
        feats = FeatureExtractor.langFeatures(data, featureSetsToUse)
Example #41
def main():

    # print "Loading previously stored HMM..."
    # hmm = get_stored_hmm()
    # #hmm = getHMM()
    # print hmm.A
    # print
    # print hmm.B
    # print hmm.pi
    # print "HMM Created successfully"
    tlist = []
    print "Loading previously stored RF model"
    RF = joblib.load('RFModel.pk')
    print "RF loaded successfully"

    # Feature Extractor
    FE = FeatureExtractor()
    FE.set_class(1)

    # Open socket and start listening for data
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    s.bind((HOST, PORT))
    s.listen(1)

    subactivities = get_subactivities()
    activities = get_activities()

    conn, addr = s.accept()
    while True:
        data = conn.recv(20000)
        #print data

        if data:
            start = time.clock()
            #print data
            data = data.split('\n')

            # Remove extra newline
            del data[-1]

            csv_data = []
            for d in data:
                #print len(d)
                if len(d) >= 53:
                    d2 = d[14:].strip('"')
                    d3 = np.fromstring(d2, dtype=int, sep=',')
                    csv_data.append(d3.tolist())

            #print len(csv_data)
            if len(csv_data) > 0:
                csv_data = np.array(csv_data)
                #row,col = csv_data.shape
                #print row, col

                features = []
                for x in range(4):
                    f = FE.get_featurelist_from_nparray(csv_data, 2*x+1, 2*x + 2)
                    for y in f:
                        row,col = y.shape
                        p = RF.predict(y[:col-2])[0]
                        if len(history[x]) >= HISTORY_LEN:
                            history[x].pop(0)
                            history_labels[x].pop(0)
                        history[x].append(p)
                        history_labels[x].append(subactivities[int(p)])

                for sub in subactivities:
                    print sub, get_subactivity_class(sub)

            elapsed = (time.clock() - start)
            tlist.append(elapsed)
            print tlist
Example #42
def main():
        print "\nFetching Emails...\n"
        #gui.interfaceFetchEmails()
        print "Extracting Features...\n"
        fe.extractFeatures()
Example #43
def main(args):
    print(args)
    train = load_from_tsv(args.train_file)
    dev = load_from_tsv(args.dev_file)
    test = load_from_tsv(args.test_file)
    print("ntrain: %d, ndev: %d, ntest: %d" % (len(train), len(dev), len(test)))

    classnames = list(set(map(lambda tweet: tweet.label, train)))
    
    window_size = 20

    print("computing features (train)")
    fx = FeatureExtractor(["word2vec"], word2vec_model=args.word2vec_model)
    train_y = []
    train_x = []
    for tw in train:
        label = classnames.index(tw.label)
        m1,m2 = fx.process_word2vec_noagg(tw, window_size)
        train_x.append(m1)
        train_y.append(label)
        if m2 is not None:
            train_x.append(m2)
            train_y.append(label)
    train_y = np.asarray(train_y)
    train_x = np.asarray(train_x)
    print("train_x: ", train_x.shape)

    print("computing features (dev)")
    dev_y = []
    dev_x = []
    for tw in dev:
        label = classnames.index(tw.label)
        m1,m2 = fx.process_word2vec_noagg(tw, window_size)
        dev_x.append(m1)
        dev_y.append(label)
        if m2 is not None:
            dev_x.append(m2)
            dev_y.append(label)
    dev_y = np.asarray(dev_y)
    dev_x = np.asarray(dev_x)

    print("computing features (test)")
    test_y = []
    test_x = []
    for tw in test:
        label = classnames.index(tw.label)
        m1,m2 = fx.process_word2vec_noagg(tw, window_size)
        test_x.append(m1)
        test_y.append(label)
        if m2 is not None:
            test_x.append(m2)
            test_y.append(label)
    test_y = np.asarray(test_y)
    test_x = np.asarray(test_x)
    print("done")

    nclasses = len(classnames)
    train_data = (train_x, train_y)
    dev_data = (dev_x, dev_y)
    test_data = (test_x, test_y)

    neural_net.train_cnn(train_data, dev_data, test_data, nclasses, window_size=window_size)
Example #44
def main():
    ef.login()
    fe.extractFeatures()
Example #45
__author__ = 'hafiz'
import numpy as np
from FeatureExtractor import *
from drawfigure import *
from Filter.datafilter import *
feobj = FeatureExtractor()

def get_actual_velocity(frames):
    v = (50* frames * 3)/(3.14*58)
    return v

def check_fft(y):
    import matplotlib.pyplot as plt
    t = np.arange(len(y))
    sp = np.fft.fft(y)
    freq = np.fft.fftfreq(t.shape[-1])
    plt.plot(freq, sp.real, freq, sp.imag)

    plt.show()

def check_fft1(y):

    from pylab import plot, show, title, xlabel, ylabel, subplot
    from scipy import fft, arange


    import numpy as np
    import pylab as pl
    rate = 200.0
    t = np.arange(0, len(y), 1/rate)
    x = np.sin(2*np.pi*4*t) + np.sin(2*np.pi*7*t) + np.random.randn(len(t))*0.2
Example #46
def validate(doc, name):

	f= open(name,"w")

	frequencies0 = FeatureExtractor.frequency(doc[:2]) #frequency count smoothed by 1
	frequencies1 = FeatureExtractor.augmented_frequency(frequencies0) # augmented frequencies taking into account document size
	frequencies = FeatureExtractor.idf(frequencies1) # idfs
	total = frequencies["**prob**"]
	totals = sum(total)

	priors =[0.33, 0.33, 0.328] # based on number of documents


	a = ["C2","IKEA_EN","IKEA_IT"]
	correct = 0
	number = 0

	tpC2      = 0
	tpIKEA_EN = 0
	tpIKEA_IT = 0
	fpC2      = 0
	fpIKEA_EN = 0
	fpIKEA_IT = 0
	fnC2      = 0
	fnIKEA_EN = 0
	fnIKEA_IT = 0


	with open(doc[2],"r") as mefile:

		for line in mefile:
			lines = line.split('\t')
			ID = lines[1]
			words = lines[4].replace("<s>","").replace("</s>","").split(" ")
			pC2 = 0
			pIKEA_IT = 0
			pIKEA_EN = 0
			for word in words:
				
				if word in frequencies:
					pC2 += math.log((frequencies[word][0]))
					pIKEA_EN += math.log((frequencies[word][1]))
					pIKEA_IT += math.log((frequencies[word][2]))

				else:
					pC2 += math.log(0.5)
					pIKEA_EN += math.log(0.5)
					pIKEA_IT += math.log(0.5)
			
			b = [pC2+math.log(priors[0]),pIKEA_EN+math.log(priors[1]),pIKEA_IT+math.log(priors[2])]
			# other possibilities
			# d = [pC2,pIKEA_EN,pIKEA_IT] # without priors
			# c = [-pC2*priors[0],-pIKEA_EN*priors[1],-pIKEA_IT*priors[2]] # multiplying by priors
			
			

			proposal = a[b.index(max(b))]
			f.write(proposal + "\t" + ID + "\n")

			# calculate precision, recall, f1
			# count true positives, false positives, false negatives
			print proposal
			

			if ID == proposal:
				if ID == "C2":
					tpC2+=1
				elif ID == "IKEA_EN":
					tpIKEA_EN +=1
				elif ID == "IKEA_IT":
					tpIKEA_IT += 1
				correct += 1
			else:
				if ID == "C2":
					fnC2+=1
				elif ID == "IKEA_EN":
					fnIKEA_EN +=1
				elif ID == "IKEA_IT":
					fnIKEA_IT += 1
				if proposal == "C2":
					fpC2+=1
				elif proposal == "IKEA_EN":
					fpIKEA_EN +=1
				elif proposal == "IKEA_IT":
					fpIKEA_IT += 1

			number +=1

	print fnC2
	precisionC2 = tpC2 / ( tpC2 + fpC2 )
	precisionIKEA_IT = tpIKEA_IT / ( tpIKEA_IT + fpIKEA_IT)
	precisionIKEA_EN = tpIKEA_EN / ( tpIKEA_EN + fpIKEA_EN)
	precisions = [precisionC2, precisionIKEA_EN,precisionIKEA_IT]
	recallC2 = tpC2 / ( tpC2 + fnC2 )
	recallIKEA_IT = tpIKEA_IT / ( tpIKEA_IT + fnIKEA_IT)
	recallIKEA_EN = tpIKEA_EN / ( tpIKEA_EN + fnIKEA_EN)
	recalls = [recallC2,recallIKEA_EN,recallIKEA_IT]

	avgpre = sum(precisions)/3
	avgrec = sum(recalls)/3



	f.write("\n\ncorrect: " + str(correct) + "out of" + str(number))
	f.write("\nprecision: " + str(avgpre))
	f.write("\nrecall: " + str(avgrec))
	f.write("\nF1: " + str( 2* ((avgpre*avgrec) / (avgpre + avgrec)) ))

	f.close()
Example #47
__author__ = 'hafiz'
from FeatureExtractor import *
from RF import *
from subactivities import  *
import shutil

if __name__=='__main__':
    fnam = "collection7-30"

    activity_name = get_activities()
    fobj = FeatureExtractor()
    framelengths = [200, 500, 1000, 1500, 2000, 2500, 3000, 3500, 4000, 4500, 5000]

    fp = open("Data/"+fnam+"/"+activity_name[0]+".csv", 'w')
    for fln in framelengths:
        sys.argv = ["FeatureExtractor.py", "Data/"+fnam+"/train" ,"train"]

        stp = int(fln/2)
        fobj.set_frameslength(fln,stp)
        fobj.main()

        sys.argv= ["python FeatureExtractor.py", "Data/"+fnam+"/eval", "test"]
        fobj.main()

        rfObj = RFClassifier()
        # rfObj.main()
        r = str(fln)
        rs = []
        rs = rfObj.main_fusion()
        for a in rs:
            r=r+","+str(a)
Example #48
def predictor(model, authorId, summary):
    featureExtractor = FeatureExtractor()
    featureExtractor.authorFinder(authorId)
    featureExtractor.similarityFinder(summary)
    return model.predict(featureExtractor.X)
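
A minimal usage sketch for predictor (the fitted model and the argument values are hypothetical; any estimator with a predict() method, e.g. from scikit-learn, would fit the call):

# 'model' is assumed to be a previously fitted estimator with a predict() method.
label = predictor(model, 42, "A short plot summary to compare against the author's previous work.")
print(label)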
Example #49
def splitFile(md, splitBySentence):
    rf = open(md.filename, "r")

    # skip first 6 lines since they aren't important
    talk = "\n".join(rf.readlines()[6:])

    # remove the Audio: Laughing and the applause
    talk = talk.replace("(Applause)", "")
    talk = talk.replace("(Audio: Laughing)", "")
    talk = talk.replace("-- (Laughter) --", " (Laughter) ")
    talk = talk.replace("-- (Laughter) ", " (Laughter) ")

    # remove hyphens for clarity
    talk = talk.replace("-", " ")

    # grab the frequency distribution
    talkFrequency = nltk.FreqDist(word_tokenize(talk.lower()))
    hapaxes = talkFrequency.hapaxes()
    hapaxes = [w for w in hapaxes if w.isalpha()]

    chunks = []
    if splitBySentence:
        chunks = sent_tokenize(talk)  # the talk turned into sentences
    else:
        chunks = talk.split("\n")  # the talk is split by paragraph

    passedChunks = []  # the pure (laughs removed) chunks passed
    passedWords = [["TS", "TS", "TS"]]  # prev chunks broken into stemmed/CC wds
    passedPOS = []  # passed chunks broken into POS
    numChunks = len(chunks)  # the number of chunks in the talk
    chunksSinceLastLaugh = 0  # the number of chunks since last laugh
    laughCount = 0  # the laughs counted so far
    positives = []  # all of the positives
    negatives = []  # all of the negatives
    previousSentiment = {"Polarity": 0}  # the previous chunk's sentiment

    for i in range(numChunks):
        # create a FeatureCollection for each chunk
        features = FeatureCollection.FeatureCollection(md.name)

        # if there is laughter in the chunk and it's not at the beginning OR
        # it is at the start of the next (if there is one) chunk
        if ("(Laughter)" in chunks[i][3:]) or (i != numChunks - 1 and ("(Laughter)" in chunks[i + 1][:12])):
            features.positive = True
        else:
            features.positive = False

        # remove the laughter from the chunk
        curChunk = chunks[i].replace("(Laughter)", "")
        features.chunk = curChunk

        # end here if there is nothing in the chunk besides laughter
        if len(curChunk) > 1:
            # increase the distance since the last laugh
            chunksSinceLastLaugh += 1

            features.features["depth"] = i / numChunks  # Depth
            features.features["laughsUntilNow"] = laughCount  # Laugh Count Before This
            features.features["chunksSinceLaugh"] = chunksSinceLastLaugh  # Chunks since lastlaugh

            # put this chunk at the end of the passed chunks list
            passedChunks.append(curChunk)

            # get sentiment
            features.sentimentFeats = FeatureExtractor.getSentiment(curChunk, previousSentiment)
            previousSentiment = features.sentimentFeats
            features.sentimentFeats["swearing"] = False

            # analyze sentence structure
            # (maxD, maxST, avgDepth, avgSubTrees) = getSubtreeFeatures(curChunk)
            # features["max_depth"] = maxD
            # features["max_subtree"] = maxST
            # features["avg_depth"] = avgDepth
            # features["avg_subtree"] = avgSubTrees

            # get Parts of speech features
            words = word_tokenize(curChunk)
            (_, pos) = FeatureExtractor.getPOS(words)
            passedPOS.append(pos)
            features.POS = pos

            # get length of chunk
            features.features["length"] = len(words)

            # replace quantites and years and some person names
            entity_chunk = entity_recognition(curChunk)
            features.features["statistic_count"] = entity_chunk.count("__Statistic__")

            # tokenize the words
            curChunk = curChunk.lower()
            words = word_tokenize(curChunk)
            talklen = len(words)

            # store the word vector
            if splitBySentence:
                features.wordVector = [[w for w in words if w not in stop and w.isalpha()]]
            else:
                for s in sent_tokenize(curChunk):
                    wordlist = [w for w in word_tokenize(s) if w not in stop and w.isalpha()]
                    features.wordVector.append(wordlist)

            # set the last 3 words
            features.prev3Words = passedWords[-1][-3:]

            # case collapse and stem words, get the variance, and hapax count
            variousWords = {}
            numHapax = 0
            for j in range(talklen):
                # move following line below if variance by stemmed words
                variousWords[words[j]] = True
                if words[j] in hapaxes:
                    numHapax += 1

                if words[j] in swears:
                    features.sentimentFeats["swearing"] = True
                    # words[j] = "SWEARWORD"     # seems to have lowered accuracy

                words[j] = stemmer.stem(words[j])

            features.features["hapax_count"] = numHapax

            # calculate word variance
            if talklen > 0:
                features.wordVariance = len(variousWords) / talklen
            else:
                features.wordVariance = 0

            # get words as features
            passedWords.append((words + ["EOS"]))
            features.words = [w for w in words if w not in stop]  # word_tokenize(curChunk)

            # if the chunk was positive then we need to:
            # reset the distance since last laugh and increment laugh count
            if features.positive:
                laughCount += 1
                chunksSinceLastLaugh = 0
                positives.append(features)
            else:
                negatives.append(features)

    rf.close()

    return [positives, negatives]
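
A minimal usage sketch for splitFile (the metadata class and the transcript path are hypothetical; the module-level helpers the function uses, such as FeatureExtractor, FeatureCollection, entity_recognition, stop, swears and stemmer, are assumed to be set up as in the snippet above):

class TalkMetadata(object):
    def __init__(self, name, filename):
        self.name = name
        self.filename = filename

md = TalkMetadata("Sample Talk", "transcripts/sample_talk.txt")
positives, negatives = splitFile(md, splitBySentence=True)
print(len(positives), "laughter chunks,", len(negatives), "other chunks")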
Example #50
Example:
    python BatchFeatureExtractor.py -i semcor/semcor3.0 -o semcor_features -f l-3_l+1_t-1_t+2_tw_twt''')

parser.add_argument('-i', '--input', help='Input file name', required=True)
parser.add_argument('-o', '--output', help='Output file name', required=True)
parser.add_argument('-f', '--features', help='Feature names', required=True)
args = parser.parse_args()

## show values ##
srcdir = args.input
srcdirpath = os.path.abspath(srcdir)

dstdir = args.output
dstdirpath = os.path.abspath(dstdir)

featurenames = args.features

retout = ""
if not os.path.exists(dstdirpath):
    os.makedirs(dstdirpath)
    if os.path.exists(srcdirpath):
        retout = __process__(srcdirpath)
    else:
        print("The source directory is not exist!")
else:
    print("The destination directory is already exist!")

print("Extracting the features ...")
FeatureExtractor.__featureextractor__(retout, dstdirpath, featurenames)
Example #51
from FeatureExtractor import *
from QClassifier import *
from numpy import asarray

if __name__ == '__main__':

    logging.basicConfig(level = logging.DEBUG,
                format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
                datefmt='%a, %d %b %Y %H:%M:%S',
                filename='qclassifier.log',
                filemode='w')
    reload(sys)
    sys.setdefaultencoding('utf8')

    logging.info('start to extract features')
    extractor = FeatureExtractor()
    extractor.load(path = '../data/pair.xml')
    features = extractor.extract_features()
    labels = extractor.get_labels()
    assert(len(labels) == len(features))

    logging.info('split data into training data & test data')
    train_percentage = 0.8
    mid = int(len(features) * (1 - train_percentage))
    test_x, train_x = features[:mid], features[mid:]
    test_y, train_y = labels[:mid], labels[mid:]
    vectorizer = FeatureHasher(input_type = 'string', non_negative = True)
    train_x = vectorizer.transform(train_x)
    test_x = vectorizer.transform(test_x)
    # train_x = vectorizer.transform(train_x)
    # test_x = vectorizer.transform(test_x)