def structured_edge_detection(im_file):
    """Run the structured-edge-detection binary on an image.

    Args:
        im_file: path to the input image file.

    Returns:
        The grayscale edge map produced by the `edges` binary, loaded
        via cv2 (None if the output could not be read).
    """
    bin_dir = os.path.join(config.bin_dir(), 'structured_edge_detection')
    out_dir = mkdtemp()
    output = os.path.join(out_dir, 'output.jpg')
    try:
        run_cmd([
            os.path.join(bin_dir, 'edges'),
            '-i', im_file,
            '-o', output,
            '-m', os.path.join(bin_dir, 'model.yml'),
        ])
        edges = cv2.imread(output, cv2.CV_LOAD_IMAGE_GRAYSCALE)
    finally:
        # Fix: the mkdtemp() scratch dir and output image were previously
        # never removed, leaking temp space on every call.
        if os.path.exists(output):
            os.unlink(output)
        os.rmdir(out_dir)
    return edges
def compute_descriptors_BOW(self):
    """Compute spatial SURF descriptors via the bag-of-visual-words pipeline.

    Runs feature extraction, PCA computation, PCA projection and vocabulary
    building in sequence, then returns the spatial scene descriptors built
    from the intermediate XML files.
    """
    binaries = config.bin_dir()
    work_dir = tempfile.mkdtemp()

    # Scratch artifacts live in the temp dir; PCA and vocab are kept
    # in the final output directory.
    extracted = os.path.join(work_dir, 'extract.xml')
    projected = os.path.join(work_dir, 'projected.xml')
    pca_out = os.path.join(self.final_dir, 'pca.xml')
    vocab_out = os.path.join(self.final_dir, 'vocab.xml')

    logger.info("extracting features")
    bovw_functions.feature_extract(binaries, self.infofile, extracted,
                                   self.params)
    logger.info("computing PCA")
    bovw_functions.pca_computation(binaries, extracted,
                                   self.params['pca_dimensions'], pca_out)
    logger.info("projecting features")
    bovw_functions.pca_projection(binaries, extracted, pca_out, projected)
    logger.info("computing vocabulary")
    bovw_functions.vocab_kms(binaries, projected, self.params['vocab_size'],
                             vocab_out)

    logger.info("computing spatial scene (SURF) descriptors")
    levels = self.scene_params['train_detector_params']['num_levels']
    return compute_spatial_features_from_xml(extracted, projected, vocab_out,
                                             levels)
def start_ner_server(self):
    """Launch the Stanford NER server subprocess (no-op if already started).

    Spawns the Java NER server listening on NER_PORT and blocks until it
    responds, via self._wait_for_server().
    """
    if self.server is not None:
        # Server already running; nothing to do.
        return
    ner_dir = os.path.join(config.bin_dir(), 'ner')
    classifier = 'classifiers/english.conll.4class.caseless.distsim.crf.ser.gz'
    cmd = [
        'java', '-mx1000m', '-cp', 'stanford-ner.jar',
        'edu.stanford.nlp.ie.NERServer',
        '-loadClassifier', classifier,
        '-port', str(NER_PORT),
        '-outputFormat', 'inlineXML',
    ]
    logger.info("Starting NER server process")
    self.server = subprocess.Popen(cmd, cwd=ner_dir,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE)
    self._wait_for_server()
def face_subspace(self):
    """Build the face subspace by running the aff_face_subspace binary.

    Reads self.feature_file and writes self.subspace_file, using the
    n1/n2 mode counts configured on this instance.
    """
    binary = os.path.join(config.bin_dir(), 'aff_face_subspace')
    run_cmd([
        binary,
        self.feature_file,
        self.subspace_file,
        '-n1', str(self.n1),
        '-n2', str(self.n2),
    ])
def predict_2step(self, temp_subspace_file, projected_file):
    """Run two-step face-subspace prediction.

    Invokes the aff_face_subspace_predict_2step binary on the projected
    features and the (temporary) subspace copy, then deletes the temporary
    subspace file.

    Returns the path of the output labels XML file.
    """
    labels_path = os.path.join(self.model_dir, 'op_labels.xml')
    predictor = os.path.join(config.bin_dir(),
                             'aff_face_subspace_predict_2step')
    run_cmd([
        predictor,
        projected_file,
        temp_subspace_file,
        labels_path,
        str(self.nmodes2),
        str(self.topn),
        str(self.thresh),
    ])
    # The temporary subspace copy is only needed by the binary.
    os.unlink(temp_subspace_file)
    return labels_path
def start_server_process(self):
    """Spawn the Java topic-model server and block until it reports READY."""
    work_dir = os.path.join(config.bin_dir(), 'topic_model')
    cmd = [
        'java', '-Xmx4096m', '-jar', 'TopicModelFast.jar',
        self.model_files_dir, self.output_pipe, self.input_pipe,
    ]
    logger.info("starting java server process")
    self.server = subprocess.Popen(cmd, cwd=work_dir)
    logger.info("done starting java server process")
    logger.info("waiting for models to load")
    # The server sends ['READY'] over the pipe once its models are loaded.
    message = self.receive_message_from_server()
    assert message == ['READY'], message
    logger.info("done waiting for models to load")
def video_hist(video_path, video_cls=-1):
    """Calculate motion vector histogram of video.

    Args:
        video_path: path to the video file.
        video_cls: class label passed through to the histogram tool
            (-1 when unknown).

    Returns:
        List of floats: the histogram values, with the leading label column
        and the trailing (video_id, timestamp) columns stripped.
    """
    histfile = scratch_path('.hist')
    with delete_file(histfile):
        params = dict(video=os.path.abspath(video_path),
                      video_cls=video_cls,
                      histfile=histfile,
                      bin_dir=config.bin_dir())
        run_cmd(FFMPEG_CMD, params)
        # Fix: use open() as a context manager instead of the Py2-only
        # file() builtin, which also leaked the file handle.
        with open(histfile) as fh:
            hist = [float(v) for v in fh.read().strip().split(',')]
    # first col is label, last 2 cols are meant for (video_id, timestamp)
    # and should be ignored
    return hist[1:-2]
def start_server(cls):
    """Start the LDA mallet server unless one is already listening.

    Appends the server's stdout/stderr to log_dir()/lda_server.log and
    blocks until the server answers (via cls._wait_for_server).
    """
    # Check if server already up.
    if cls._poll_server():
        logger.info("Server already up")
        return
    topic_model_dir = os.path.join(config.bin_dir(), 'topic_model')
    cmd = [
        'java', '-Xmx4096m', '-jar', 'LDAServer.jar',
        '--port', str(cls.MALLET_SERVER_PORT),
    ]
    server_log = os.path.join(config.log_dir(), 'lda_server.log')
    log_handle = open(server_log, 'a')
    try:
        subprocess.Popen(cmd, cwd=topic_model_dir, stdout=log_handle,
                         stderr=subprocess.STDOUT)
    finally:
        # Fix: Popen gives the child its own duplicate of the descriptor,
        # so the parent's copy must be closed here — previously it leaked
        # one fd per server start.
        log_handle.close()
    cls._wait_for_server()
def subspace_project(self, test_features): projected_file = os.path.join(self.model_dir, 'proj.xml') # creating copy of subspace file since we over write the same file temp_subspace_file = os.path.join( self.model_dir, 'temp_subspace_file_%s.xml' % datetime.utcnow()) shutil.copy(self.subspace_file, temp_subspace_file) # create a feature file feature_file = os.path.join(self.model_dir, 'test_features.xml') la = np.asarray([[-1] * test_features.shape[0]]) cv_feats = np.transpose(np.concatenate( (la, test_features.transpose()))) mat_to_save = cv_feats.transpose().copy() cv.Save(feature_file, cv.fromarray(mat_to_save)) # call binary bin_dir = config.bin_dir() run_cmd([ os.path.join(bin_dir, 'aff_face_subspace_project'), feature_file, temp_subspace_file, projected_file ]) return temp_subspace_file, projected_file
def extract_text(self, audio_path, text_path):
    """Run the speech model on an audio chunk to generate its transcript.

    On ASR failure an empty transcript file is written instead of raising,
    since occasional chunk failures are expected.
    """
    params = dict(bin_dir=config.bin_dir(),
                  raw=audio_path,
                  text=text_path,
                  hmm=self.hmm,
                  lm=self.lm,
                  dic=self.dic)
    try:
        run_cmd(TEXT_EXTRACT_CMD, params)
    except VideoProcessingError:
        # ASR occasionally fails with an error finding the start node or
        # similar. run_cmd already logged the stdout/stderr of the failed
        # process, so just leave an empty transcript for this chunk.
        logger.error("Failed running ASR on chunk for video %s", self.video)
        open(text_path, 'w').close()
def generate_transcript(self):
    """Process our video file and write its transcript to self.transcript_path.

    Splits the extracted audio into fixed-length clips, runs ASR on each
    clip, and appends the per-clip text to the transcript. Videos with no
    audio stream get an empty transcript.
    """
    self.grab_s3_files()
    # Process the audio in 10-second clips.
    clip_length = 10
    duration = video_duration(self.video_path)
    chunks = int(math.ceil(duration / float(clip_length)))
    audio_path = os.path.join(self.base_dir, 'audio.raw')
    audio_clip_path = os.path.join(self.base_dir, 'clip.raw')
    text_clip_path = os.path.join(self.base_dir, 'clip.txt')
    logger.info("generating transcript")
    try:
        extract_audio(self.video_path, audio_path)
    except VideoProcessingError as e:
        # Only swallow the specific "no audio stream" failure; anything
        # else is a real error.
        if 'does not contain any stream' not in str(e):
            raise
        logger.error("Video %s has no audio stream", self.video)
        # Empty transcript because we have no audio.
        open(self.transcript_path, 'w').close()
        return
    for chunk in xrange(chunks):
        start = chunk * clip_length
        params = {
            'infile': audio_path,
            'outfile': audio_clip_path,
            'secs': start,
            'clip_length': clip_length,
            'bin_dir': config.bin_dir(),
        }
        # Cut the next clip out of the full audio, transcribe it, and
        # append the text; clip files are removed after each iteration.
        run_cmd(FFMPEG_EXTRACT_AUDIO_CLIP, params)
        self.extract_text(audio_clip_path, text_clip_path)
        self.append_to_transcript(text_clip_path)
        os.unlink(audio_clip_path)
        os.unlink(text_clip_path)
    logger.info("done generating transcript")
def run_aff_pca(self, features):
    """Call the aff_face_pca binary for fast PCA computation in C.

    Writes the features (with a leading id column) to an OpenCV XML file
    in a fresh temp dir and returns the path of the learned-PCA XML file.
    """
    # aff_face_pca expects an extra id column at the start, so prepend a
    # zero column before transposing into the layout the binary reads.
    id_col = np.zeros((features.shape[0], 1))
    stacked = np.concatenate((id_col, features), axis=1).transpose().copy()

    work_dir = mkdtemp()
    feature_file = os.path.join(work_dir, 'features.xml')
    cv.Save(feature_file, cv.fromarray(stacked))

    pca_file = os.path.join(work_dir, 'learned_pca.xml')
    run_cmd([
        os.path.join(config.bin_dir(), 'aff_face_pca'),
        feature_file, pca_file,
        '-n', str(self.ndims),
    ])
    return pca_file
from logging import getLogger from scipy.sparse import csr_matrix import numpy as np import os import tempfile from affine import config from affine.model import LdaDetector from ..topic_model import * from .lda_client import LdaClient logger = getLogger(__name__) INFER_LDA_JAR = os.path.join(config.bin_dir(), 'topic_model', 'InferLDA.jar') lda_model_lookup = {} def temp_path(): fd, path = tempfile.mkstemp() os.close(fd) return path def process_page(page, detectors): """Run lda detectors on webpage text""" logger.info("Running LDA detection on page %d", page.id) global lda_model_lookup lda_model_lookup = {} detectors = set(detectors) detector_ids_to_delete = set()
class TopicTrainer(object):
    """End-to-end trainer for an LDA topic model with a classifier on top.

    Driven entirely by ``config_dict``: converts training/testing JSON into
    mallet's input format, trains LDA topics with mallet, fits a Bernoulli
    Naive Bayes classifier on the per-document topic distributions, and
    evaluates the result on held-out data.
    """

    # mallet launcher inside our binaries directory.
    MALLET_BIN = os.path.join(config.bin_dir(), 'topic_model', 'mallet')
    # A token is a run of 3+ word characters (unicode-aware; Py2 ur-literal).
    RGX_PAT = re.compile(ur'\w\w\w+', re.UNICODE)

    def __init__(self, config_dict):
        # Inputs are one JSON document per line, so line counts are
        # document counts.
        self.config_dict = config_dict
        self.n_pos_train = TopicTrainer.file_line_counter(
            config_dict['pos_train_json'])
        self.n_neg_train = TopicTrainer.file_line_counter(
            config_dict['neg_train_json'])
        self.n_pos_test = TopicTrainer.file_line_counter(
            config_dict['pos_test_json'])
        self.n_neg_test = TopicTrainer.file_line_counter(
            config_dict['neg_test_json'])

    @staticmethod
    def file_line_counter(infile):
        """Return the number of lines in ``infile``.

        NOTE(review): raises NameError on an empty file (the loop variable
        is never bound) and the handle is never closed — confirm inputs
        are always non-empty.
        """
        for i, _ in enumerate(open(infile)):
            continue
        return i + 1

    @staticmethod
    def create_mallet_config_files(mallet_config_dict):
        """Write ``mallet_config_dict`` to a temp file via ConfigObj.

        Returns the temp file path; callers unlink it when done.
        """
        h, mallet_config_file = mkstemp()
        os.close(h)
        config_obj = ConfigObj(mallet_config_dict)
        config_obj.filename = mallet_config_file
        config_obj.write()
        return mallet_config_file

    @staticmethod
    def write_mallet_input_file(pos_json, neg_json, outfile, vocab_set,
                                include_related=False):
        """Create a mallet-compatible input file from the two JSON files.

        Positive documents are written first, then negatives — downstream
        code (doc_topics_to_libsvm) depends on that ordering.
        """
        with open(outfile, 'w') as fo:
            for ll in open(pos_json):
                jsn = ll.strip()
                one_line = TopicTrainer.json_to_text_line(jsn, include_related)
                fo.write(
                    TopicTrainer.preprocess_text(one_line, vocab_set).encode(
                        'utf-8') + '\n')
            for ll in open(neg_json):
                jsn = ll.strip()
                one_line = TopicTrainer.json_to_text_line(jsn, include_related)
                fo.write(
                    TopicTrainer.preprocess_text(one_line, vocab_set).encode(
                        'utf-8') + '\n')

    def set_vocabulary(self):
        """Build self.vocab_set and self.stop_set.

        Starts from the general vocabulary resource, then adds any token
        occurring more than once in the positive training set that is not
        already in the vocabulary or stop list.
        """
        general_vocab = self.config_dict['general_vocab']
        stop_file = self.config_dict['stop_file']
        # Fetch both resource files from S3 into the working directory.
        TopicTrainer.get_resource_file(general_vocab)
        TopicTrainer.get_resource_file(stop_file)
        self.vocab_set = set(
            open(general_vocab).read().decode('utf-8').splitlines())
        self.stop_set = set(
            open(stop_file).read().decode('utf-8').splitlines())
        freq_table = {}
        exc_vocab = self.vocab_set | self.stop_set
        # Add vocabulary from positive examples for the new category.
        for ll in open(self.config_dict['pos_train_json']):
            jsn = ll.strip()
            one_line = TopicTrainer.json_to_text_line(
                jsn, include_related=self.config_dict['include_related'])
            tokens = TopicTrainer.tokenize_text(one_line)
            for token in tokens:
                if token not in exc_vocab:
                    freq_table[token] = freq_table.get(token, 0) + 1
        # Keep only tokens seen at least twice.
        for token in freq_table:
            if freq_table[token] > 1:
                self.vocab_set.add(token)

    @staticmethod
    def preprocess_text(one_line, vocab_set):
        """Tokenize ``one_line``, keeping only tokens present in ``vocab_set``."""
        tokens = TopicTrainer.tokenize_text(one_line)
        in_vocab = [w for w in tokens if w in vocab_set]
        return ' '.join(in_vocab)

    @staticmethod
    def tokenize_text(one_line):
        """Lowercase, tokenize and lightly stem ``one_line``.

        Returns tokens of length >= 3 after plural-only Porter stemming.
        """
        pst = nltk.PorterStemmer()
        all_tokens = TopicTrainer.RGX_PAT.findall(one_line.lower())
        stemmed_tokens = []
        for token in all_tokens:
            # Only stem plurals (words ending in 's').
            if token.endswith('s'):
                token = pst.stem(token)
            # Stemming can shrink a token below 3 letters; drop those.
            if len(token) >= 3:
                stemmed_tokens.append(token)
        return stemmed_tokens

    @staticmethod
    def get_resource_file(resource_file):
        """Download ``resource_file``'s tarball from S3 and extract it here.

        Asserts the expected file exists after extraction.
        """
        bucket = config.s3_detector_bucket()
        tarball_name = resource_file + '.tar.gz'
        s3client.download_from_s3(bucket, tarball_name, tarball_name)
        with tarfile.open(tarball_name, 'r:*') as tar:
            # NOTE(review): extractall() on an untrusted tarball can write
            # outside the cwd (path traversal) — confirm the S3 bucket
            # contents are trusted.
            tar.extractall()
        assert os.path.isfile(resource_file)

    @staticmethod
    def json_to_text_line(jsn, include_related=False):
        """Flatten a YoutubeVideoText JSON record into one tab-separated line.

        Joins title, description and comments; related-video text is
        appended only when ``include_related`` is true.
        """
        yvt = YoutubeVideoText.to_object(jsn)
        if include_related:
            st = '\t'.join([
                yvt.video_title, '%s' % yvt.video_description,
                ' '.join(['%s' % i for i in yvt.video_comments]),
                ' '.join(yvt.related_videos_text)
            ])
        else:
            st = '\t'.join([
                yvt.video_title, '%s' % yvt.video_description,
                ' '.join(['%s' % i for i in yvt.video_comments])
            ])
        return st

    def train_tm(self):
        """Train the topic model: vocabulary -> mallet input -> mallet train."""
        logger.info('Setting vocabulary and stopwords')
        self.set_vocabulary()
        logger.info('Converting training json into mallet data')
        TopicTrainer.write_mallet_input_file(
            self.config_dict['pos_train_json'],
            self.config_dict['neg_train_json'],
            self.config_dict['mallet_import']['input'], self.vocab_set,
            include_related=self.config_dict['include_related'])
        TopicTrainer.mallet_import_and_train(self.config_dict)
        # Persist the final vocabulary for later inference runs.
        logger.info('Creating vocab file')
        with open(self.config_dict['vocab_file'], 'w') as fo:
            for w in self.vocab_set:
                fo.write(w.encode('utf-8') + '\n')

    @staticmethod
    def mallet_import_and_train(config_dict):
        '''
        Generic method that takes a config dict as input and runs the LDA
        algorithm. The config dict must contain the keys 'mallet_import'
        and 'mallet_train' for the mallet specific import and training
        parameters.
        '''
        import_config_file = TopicTrainer.create_mallet_config_files(
            config_dict['mallet_import'])
        logger.info('Running mallet import')
        run_cmd([
            TopicTrainer.MALLET_BIN, 'import-file', '--config',
            import_config_file
        ], timeout=None)
        train_config_file = TopicTrainer.create_mallet_config_files(
            config_dict['mallet_train'])
        logger.info('Running mallet training')
        run_cmd([
            TopicTrainer.MALLET_BIN, 'train-topics', '--config',
            train_config_file
        ], timeout=None)
        logger.info('Done training topic models')
        os.unlink(import_config_file)
        os.unlink(train_config_file)
        # Write a smaller pipe file: re-import an empty file through the
        # training pipe so later inference can reuse the same pipe cheaply.
        logger.info('Creating pipe file')
        h, tmp_file = mkstemp()
        os.close(h)
        run_cmd([
            TopicTrainer.MALLET_BIN, 'import-file', '--input', tmp_file,
            '--use-pipe-from', config_dict['mallet_import']['output'],
            '--output', config_dict['pipe_file']
        ], timeout=None)
        os.unlink(tmp_file)

    @staticmethod
    def doc_topics_to_libsvm(doc_topics_file, output_file, n_pos):
        ''' Requires that the first n_pos lines are positive examples'''
        fo = open(output_file, "w")
        fi = open(doc_topics_file)
        # Skip the mallet header line.
        fi.readline()
        for lnum, l in enumerate(fi):
            ll = l.split()
            # Columns alternate topic-id / proportion starting at index 2;
            # sort pairs by topic id for libsvm ordering.
            a = zip(ll[2::2], ll[3::2])
            a.sort(key=lambda x: int(x[0]))
            b = [x[0] + ":" + x[1] for x in a]
            # First n_pos documents are the positives by construction.
            if lnum < n_pos:
                lbl = '1'
            else:
                lbl = '0'
            fo.write('%s %s\n' % (lbl, " ".join(b)))
        fi.close()
        fo.close()

    def train_classifier(self):
        """Fit a Bernoulli Naive Bayes model on the training topic vectors."""
        libsvm_file = self.config_dict['classifier_params']['libsvm_file']
        TopicTrainer.doc_topics_to_libsvm(
            self.config_dict['mallet_train']['output-doc-topics'],
            libsvm_file, self.n_pos_train)
        num_topics = self.config_dict['mallet_train']['num-topics']
        x_train, y_train = load_svmlight_file(libsvm_file, num_topics,
                                              zero_based=True)
        # Topic proportions are binarized at bin_thresh before fitting.
        bnb = BernoulliNB(
            binarize=self.config_dict['classifier_params']['bin_thresh'])
        logger.info('Training classifier')
        bnb.fit(x_train, y_train)
        pickle.dump(
            bnb,
            open(self.config_dict['classifier_params']['model_file'], "wb"))
        logger.info('Done training classifier')

    def check_model(self):
        """Evaluate the trained model on held-out data and write stats.

        Imports the test set through the saved pipe, infers topics, runs
        either manual threshold matching or the pickled classifier, and
        writes precision/recall to the model stats file.
        """
        h, mallet_input_file = mkstemp()
        os.close(h)
        logger.info('Converting testing json into mallet data')
        # Hold out data should never include related text.
        TopicTrainer.write_mallet_input_file(self.config_dict['pos_test_json'],
                                             self.config_dict['neg_test_json'],
                                             mallet_input_file, self.vocab_set)
        pipe_file = self.config_dict['pipe_file']
        output = self.config_dict['mallet_import']['output']
        logger.info('Running mallet import')
        run_cmd([
            TopicTrainer.MALLET_BIN, 'import-file', '--input',
            mallet_input_file, '--use-pipe-from', pipe_file, '--output', output
        ], timeout=None)
        h, output_doc_topics = mkstemp()
        os.close(h)
        infer_config_dict = {}
        infer_config_dict['output-doc-topics'] = output_doc_topics
        infer_config_dict['inferencer'] = self.config_dict['mallet_train'][
            'inferencer-filename']
        infer_config_dict['doc-topics-max'] = self.config_dict['mallet_train'][
            'doc-topics-max']
        infer_config_dict['input'] = output
        infer_config_file = TopicTrainer.create_mallet_config_files(
            infer_config_dict)
        logger.info('Running mallet inference')
        run_cmd([
            TopicTrainer.MALLET_BIN, 'infer-topics', '--config',
            infer_config_file
        ], timeout=None)
        # Check prediction accuracy.
        h, libsvm_file = mkstemp()
        os.close(h)
        num_topics = self.config_dict['mallet_train']['num-topics']
        TopicTrainer.doc_topics_to_libsvm(output_doc_topics, libsvm_file,
                                          self.n_pos_test)
        x_test, _ = load_svmlight_file(libsvm_file, num_topics,
                                       zero_based=True)
        h, prediction_file = mkstemp()
        os.close(h)
        logger.info('Running classifier prediction')
        # Manual matching if topic_thresholds are provided.
        if len(self.config_dict['topic_thresholds']):
            TopicTrainer.manual_prediction(
                x_test, self.config_dict['topic_thresholds'], prediction_file)
            self.write_model_stats(prediction_file,
                                   model_name='Manually matched')
        else:
            TopicTrainer.model_prediction(
                x_test, self.config_dict['classifier_params']['model_file'],
                prediction_file)
            self.write_model_stats(prediction_file)
        os.unlink(mallet_input_file)
        os.unlink(output_doc_topics)
        os.unlink(libsvm_file)
        os.unlink(prediction_file)

    @staticmethod
    def model_prediction(x_test, model_file, prediction_file):
        """Predict with the pickled classifier; write one 0/1 label per line."""
        classifier = pickle.load(open(model_file, "rb"))
        y_pred = classifier.predict(x_test)
        with open(prediction_file, "w") as fo:
            for i in y_pred:
                fo.write('%s\n' % int(i))

    @staticmethod
    def manual_prediction(x_test, topic_thresholds, prediction_file):
        """Label a document positive if any (topic, threshold) pair is met."""
        y_pred = []
        for xx in x_test:
            pred = 0
            for t, v in topic_thresholds:
                if xx[0, t] >= v:
                    pred = 1
                    break
            y_pred.append(pred)
        with open(prediction_file, "w") as fo:
            for i in y_pred:
                fo.write('%s\n' % int(i))

    def write_model_stats(self, prediction_file, model_name='Naive Bayes'):
        """Compute precision/recall from predictions and write the stats file."""
        tp, tn, fp, fn = TopicTrainer.get_acc_numbers(prediction_file,
                                                      self.n_pos_test)
        # 'or 1' guards against division by zero when a class is empty.
        precision = float(tp) / ((tp + fp) or 1)
        recall = float(tp) / ((tp + fn) or 1)
        with open(self.config_dict['model_stats'], 'w') as fo:
            fo.write('%s Model Stats\n' % model_name)
            fo.write('positive training docs = %d\n' % self.n_pos_train)
            fo.write('negative training docs = %d\n' % self.n_neg_train)
            fo.write('positive testing docs = %d\n' % self.n_pos_test)
            fo.write('negative testing docs = %d\n' % self.n_neg_test)
            fo.write('TPs, TNs, FPs, FNs = (%d, %d, %d, %d)\n' % (tp, tn, fp,
                                                                  fn))
            fo.write('Precision = %f\n' % precision)
            fo.write('Recall = %f\n' % recall)

    @staticmethod
    def get_acc_numbers(prediction_file, n_pos):
        """Count TP/TN/FP/FN, assuming the first n_pos lines are positives."""
        tp = tn = fp = fn = 0
        with open(prediction_file) as fi:
            for lnum, ll in enumerate(fi):
                p = int(ll.strip())
                if lnum < n_pos:
                    if p == 1:
                        tp += 1
                    else:
                        fn += 1
                else:
                    if p == 1:
                        fp += 1
                    else:
                        tn += 1
        return tp, tn, fp, fn
def _run_extraction(self, infofile, extract_filename):
    """Run the face feature-extraction command for the given info file."""
    args = (config.bin_dir(), infofile.name, extract_filename)
    run_cmd(FACE_EXTRACT_CMD % args)