Example #1
def structured_edge_detection(im_file):
    bin_dir = os.path.join(config.bin_dir(), 'structured_edge_detection')
    output = os.path.join(mkdtemp(), 'output.jpg')
    run_cmd([os.path.join(bin_dir, 'edges'), '-i', im_file, '-o',
             output, '-m', os.path.join(bin_dir, 'model.yml')])
    edges = cv2.imread(output, cv2.CV_LOAD_IMAGE_GRAYSCALE)
    return edges
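
Note: cv2.CV_LOAD_IMAGE_GRAYSCALE is the OpenCV 2.x name for the grayscale imread flag; OpenCV 3+ removed it in favour of cv2.IMREAD_GRAYSCALE. A minimal compatibility sketch (illustrative only; output is the path written by the snippet above):

import cv2

# Use the 2.x constant when available, otherwise fall back to the 3.x name.
grayscale_flag = getattr(cv2, 'CV_LOAD_IMAGE_GRAYSCALE', cv2.IMREAD_GRAYSCALE)
edges = cv2.imread(output, grayscale_flag)
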
    def compute_descriptors_BOW(self):
        """ Computes descriptors using SURF and the BOW framework
        """
        bin_dir = config.bin_dir()
        temp_dir = tempfile.mkdtemp()
        extract_filename = os.path.join(temp_dir, 'extract.xml')
        pca_filename = os.path.join(self.final_dir, 'pca.xml')
        projected_filename = os.path.join(temp_dir, 'projected.xml')
        vocab_filename = os.path.join(self.final_dir, 'vocab.xml')
        # consider removing unnecessary files and variables
        logger.info("extracting features")
        bovw_functions.feature_extract(bin_dir, self.infofile,
                                       extract_filename, self.params)
        logger.info("computing PCA")
        bovw_functions.pca_computation(bin_dir, extract_filename,
                                       self.params['pca_dimensions'],
                                       pca_filename)
        logger.info("projecting features")
        bovw_functions.pca_projection(bin_dir, extract_filename, pca_filename,
                                      projected_filename)
        logger.info("computing vocabulary")
        bovw_functions.vocab_kms(bin_dir, projected_filename,
                                 self.params['vocab_size'], vocab_filename)
        logger.info("computing spatial scene (SURF) descriptors")
        num_levels = self.scene_params['train_detector_params']['num_levels']
        return compute_spatial_features_from_xml(extract_filename,
                                                 projected_filename,
                                                 vocab_filename, num_levels)
Example #3
    def start_ner_server(self):
        if self.server is not None:
            return
        ner_dir = os.path.join(config.bin_dir(), 'ner')
        cmd = [
            'java', '-mx1000m', '-cp', 'stanford-ner.jar',
            'edu.stanford.nlp.ie.NERServer', '-loadClassifier',
            'classifiers/english.conll.4class.caseless.distsim.crf.ser.gz',
            '-port', str(NER_PORT), '-outputFormat', 'inlineXML'
        ]
        logger.info("Starting NER server process")
        self.server = subprocess.Popen(cmd, cwd=ner_dir,
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.PIPE)
        self._wait_for_server()
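
_wait_for_server is not part of this snippet; one possible sketch, assuming the server counts as ready once its TCP port accepts a connection (the method body, host, and timeout below are assumptions, not the project's actual implementation):

    def _wait_for_server(self, timeout=60):
        # Hypothetical helper: poll the NER port until a TCP connection succeeds.
        import socket
        import time
        deadline = time.time() + timeout
        while time.time() < deadline:
            try:
                socket.create_connection(('localhost', NER_PORT), timeout=1).close()
                return
            except socket.error:
                time.sleep(1)
        raise RuntimeError('NER server did not come up within %d seconds' % timeout)
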
    def face_subspace(self):
        bin_dir = config.bin_dir()
        run_cmd([
            os.path.join(bin_dir, 'aff_face_subspace'), self.feature_file,
            self.subspace_file, '-n1',
            str(self.n1), '-n2',
            str(self.n2)
        ])
    def predict_2step(self, temp_subspace_file, projected_file):
        op_labels_file = os.path.join(self.model_dir, 'op_labels.xml')
        bin_dir = config.bin_dir()
        run_cmd([
            os.path.join(bin_dir, 'aff_face_subspace_predict_2step'),
            projected_file, temp_subspace_file, op_labels_file,
            str(self.nmodes2),
            str(self.topn),
            str(self.thresh)
        ])
        os.unlink(temp_subspace_file)
        return op_labels_file
Example #6
    def start_server_process(self):
        cmd = [
            'java', '-Xmx4096m', '-jar', 'TopicModelFast.jar',
            self.model_files_dir, self.output_pipe, self.input_pipe
        ]
        topic_model_dir = os.path.join(config.bin_dir(), 'topic_model')

        logger.info("starting java server process")
        self.server = subprocess.Popen(cmd, cwd=topic_model_dir)
        logger.info("done starting java server process")

        logger.info("waiting for models to load")
        message = self.receive_message_from_server()
        assert message == ['READY'], message
        logger.info("done waiting for models to load")
Example #7
def video_hist(video_path, video_cls=-1):
    """Calculate motion vector histogram of video"""
    histfile = scratch_path('.hist')
    with delete_file(histfile):
        video_path = os.path.abspath(video_path)
        params = dict(video=video_path,
                      video_cls=video_cls,
                      histfile=histfile,
                      bin_dir=config.bin_dir())
        run_cmd(FFMPEG_CMD, params)
        hist_str = file(histfile).read()
        hist = map(float, hist_str.strip().split(','))
        # first col is the label; last 2 cols are (video_id, timestamp) and
        # should be ignored
        return hist[1:-2]
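
delete_file and scratch_path are helpers from the surrounding project; a hypothetical sketch of the delete_file context manager as used above (name and semantics are inferred from usage, not taken from the real source):

import os
from contextlib import contextmanager

@contextmanager
def delete_file(path):
    # Hypothetical: hand back the path and remove the file afterwards,
    # whether or not the body raised.
    try:
        yield path
    finally:
        if os.path.exists(path):
            os.unlink(path)
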
    def start_server(cls):
        # check if server already up
        if cls._poll_server():
            logger.info("Server already up")
            return

        topic_model_dir = os.path.join(config.bin_dir(), 'topic_model')
        cmd = [
            'java', '-Xmx4096m', '-jar', 'LDAServer.jar', '--port',
            str(cls.MALLET_SERVER_PORT)
        ]
        server_log = os.path.join(config.log_dir(), 'lda_server.log')
        log_handle = open(server_log, 'a')
        subprocess.Popen(cmd,
                         cwd=topic_model_dir,
                         stdout=log_handle,
                         stderr=subprocess.STDOUT)
        cls._wait_for_server()
    def subspace_project(self, test_features):
        projected_file = os.path.join(self.model_dir, 'proj.xml')
        # create a copy of the subspace file, since the binary overwrites it
        temp_subspace_file = os.path.join(
            self.model_dir, 'temp_subspace_file_%s.xml' % datetime.utcnow())
        shutil.copy(self.subspace_file, temp_subspace_file)
        # create a feature file with dummy (-1) labels prepended
        feature_file = os.path.join(self.model_dir, 'test_features.xml')
        la = np.asarray([[-1] * test_features.shape[0]])
        cv_feats = np.transpose(np.concatenate(
            (la, test_features.transpose())))
        mat_to_save = cv_feats.transpose().copy()
        cv.Save(feature_file, cv.fromarray(mat_to_save))
        # call the projection binary
        bin_dir = config.bin_dir()
        run_cmd([
            os.path.join(bin_dir, 'aff_face_subspace_project'), feature_file,
            temp_subspace_file, projected_file
        ])
        return temp_subspace_file, projected_file
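
For reference, the transpose round-trip in subspace_project amounts to stacking one row of -1 dummy labels on top of the transposed features before saving; a small sketch, assuming test_features is an N x D numpy array:

import numpy as np

N, D = 4, 3
test_features = np.random.rand(N, D)
labels = -np.ones((1, N))
# Same layout as mat_to_save above: shape (D + 1, N), first row holds the -1
# labels, one column per sample.
mat_to_save = np.vstack([labels, test_features.T]).copy()
assert mat_to_save.shape == (D + 1, N)
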
Example #10
    def extract_text(self, audio_path, text_path):
        """Run speech model on a audio to generate transcript"""
        params = {
            'bin_dir': config.bin_dir(),
            'raw': audio_path,
            'text': text_path,
            'hmm': self.hmm,
            'lm': self.lm,
            'dic': self.dic,
        }

        try:
            run_cmd(TEXT_EXTRACT_CMD, params)
        except VideoProcessingError:
            # ASR occasionally fails with an error finding the start node or similar.
            # In that case, save an empty file as the transcript for the chunk.
            # run_cmd already logged the stdout/stderr from the failed process.
            logger.error("Failed running ASR on chunk for video %s",
                         self.video)
            open(text_path, 'w').close()
Example #11
    def generate_transcript(self):
        """Process our video file and write its transcript to self.transcript_path"""
        self.grab_s3_files()

        clip_length = 10
        duration = video_duration(self.video_path)
        chunks = int(math.ceil(duration / float(clip_length)))

        audio_path = os.path.join(self.base_dir, 'audio.raw')
        audio_clip_path = os.path.join(self.base_dir, 'clip.raw')
        text_clip_path = os.path.join(self.base_dir, 'clip.txt')

        logger.info("generating transcript")

        try:
            extract_audio(self.video_path, audio_path)
        except VideoProcessingError as e:
            if 'does not contain any stream' not in str(e):
                raise
            logger.error("Video %s has no audio stream", self.video)
            # Empty transcript because we have no audio
            open(self.transcript_path, 'w').close()
            return

        for chunk in xrange(chunks):
            start = chunk * clip_length
            params = {
                'infile': audio_path,
                'outfile': audio_clip_path,
                'secs': start,
                'clip_length': clip_length,
                'bin_dir': config.bin_dir(),
            }
            run_cmd(FFMPEG_EXTRACT_AUDIO_CLIP, params)

            self.extract_text(audio_clip_path, text_clip_path)
            self.append_to_transcript(text_clip_path)

            os.unlink(audio_clip_path)
            os.unlink(text_clip_path)
        logger.info("done generating transcript")
Example #12
    def run_aff_pca(self, features):
        '''call aff_face_pca binary for fast PCA computation in C'''
        # aff_face_pca expects an extra id column at the start,
        # so prepend a placeholder column of zeros
        id_col = np.zeros((features.shape[0], 1))
        temp_feats = np.concatenate((id_col, features), axis=1).\
                transpose().copy()
        temp_dir = mkdtemp()
        feature_file = os.path.join(temp_dir, 'features.xml')
        cv_mat = cv.fromarray(temp_feats)
        cv.Save(feature_file, cv_mat)

        pca_file = os.path.join(temp_dir, 'learned_pca.xml')
        bin_path = config.bin_dir()
        cmd = [
            os.path.join(bin_path, 'aff_face_pca'), feature_file, pca_file,
            '-n',
            str(self.ndims)
        ]
        run_cmd(cmd)
        return pca_file
Example #13
from logging import getLogger
from scipy.sparse import csr_matrix
import numpy as np
import os
import tempfile

from affine import config
from affine.model import LdaDetector
from ..topic_model import *
from .lda_client import LdaClient

logger = getLogger(__name__)

INFER_LDA_JAR = os.path.join(config.bin_dir(), 'topic_model', 'InferLDA.jar')
lda_model_lookup = {}


def temp_path():
    fd, path = tempfile.mkstemp()
    os.close(fd)
    return path


def process_page(page, detectors):
    """Run lda detectors on webpage text"""
    logger.info("Running LDA detection on page %d", page.id)
    global lda_model_lookup
    lda_model_lookup = {}

    detectors = set(detectors)
    detector_ids_to_delete = set()
Example #14
class TopicTrainer(object):

    MALLET_BIN = os.path.join(config.bin_dir(), 'topic_model', 'mallet')
    RGX_PAT = re.compile(ur'\w\w\w+', re.UNICODE)

    def __init__(self, config_dict):
        self.config_dict = config_dict
        self.n_pos_train = TopicTrainer.file_line_counter(
            config_dict['pos_train_json'])
        self.n_neg_train = TopicTrainer.file_line_counter(
            config_dict['neg_train_json'])
        self.n_pos_test = TopicTrainer.file_line_counter(
            config_dict['pos_test_json'])
        self.n_neg_test = TopicTrainer.file_line_counter(
            config_dict['neg_test_json'])

    @staticmethod
    def file_line_counter(infile):
        i = -1  # handle empty files, where the loop never assigns i
        for i, _ in enumerate(open(infile)):
            continue
        return i + 1

    @staticmethod
    def create_mallet_config_files(mallet_config_dict):
        h, mallet_config_file = mkstemp()
        os.close(h)
        config_obj = ConfigObj(mallet_config_dict)
        config_obj.filename = mallet_config_file
        config_obj.write()
        return mallet_config_file

    @staticmethod
    def write_mallet_input_file(pos_json,
                                neg_json,
                                outfile,
                                vocab_set,
                                include_related=False):
        """ Creates mallet compatible file from json file"""
        with open(outfile, 'w') as fo:
            for ll in open(pos_json):
                jsn = ll.strip()
                one_line = TopicTrainer.json_to_text_line(jsn, include_related)
                fo.write(
                    TopicTrainer.preprocess_text(one_line, vocab_set).encode(
                        'utf-8') + '\n')
            for ll in open(neg_json):
                jsn = ll.strip()
                one_line = TopicTrainer.json_to_text_line(jsn, include_related)
                fo.write(
                    TopicTrainer.preprocess_text(one_line, vocab_set).encode(
                        'utf-8') + '\n')

    def set_vocabulary(self):
        general_vocab = self.config_dict['general_vocab']
        stop_file = self.config_dict['stop_file']
        TopicTrainer.get_resource_file(general_vocab)
        TopicTrainer.get_resource_file(stop_file)
        self.vocab_set = set(
            open(general_vocab).read().decode('utf-8').splitlines())
        self.stop_set = set(
            open(stop_file).read().decode('utf-8').splitlines())

        freq_table = {}
        exc_vocab = self.vocab_set | self.stop_set
        # Add vocabulary from positive examples for new category
        for ll in open(self.config_dict['pos_train_json']):
            jsn = ll.strip()
            one_line = TopicTrainer.json_to_text_line(
                jsn, include_related=self.config_dict['include_related'])
            tokens = TopicTrainer.tokenize_text(one_line)
            for token in tokens:
                if token not in exc_vocab:
                    freq_table[token] = freq_table.get(token, 0) + 1
        for token in freq_table:
            if freq_table[token] > 1:
                self.vocab_set.add(token)

    @staticmethod
    def preprocess_text(one_line, vocab_set):
        tokens = TopicTrainer.tokenize_text(one_line)
        in_vocab = [w for w in tokens if w in vocab_set]
        return ' '.join(in_vocab)

    @staticmethod
    def tokenize_text(one_line):
        pst = nltk.PorterStemmer()
        all_tokens = TopicTrainer.RGX_PAT.findall(one_line.lower())
        stemmed_tokens = []
        for token in all_tokens:
            # only stem plurals
            if token.endswith('s'):
                token = pst.stem(token)
            # stemming can shorten a token below 3 letters; keep only tokens still >= 3
            if len(token) >= 3:
                stemmed_tokens.append(token)
        return stemmed_tokens

    @staticmethod
    def get_resource_file(resource_file):
        bucket = config.s3_detector_bucket()
        tarball_name = resource_file + '.tar.gz'
        s3client.download_from_s3(bucket, tarball_name, tarball_name)
        with tarfile.open(tarball_name, 'r:*') as tar:
            tar.extractall()
        assert os.path.isfile(resource_file)

    @staticmethod
    def json_to_text_line(jsn, include_related=False):
        yvt = YoutubeVideoText.to_object(jsn)
        if include_related:
            st = '\t'.join([
                yvt.video_title,
                '%s' % yvt.video_description,
                ' '.join(['%s' % i for i in yvt.video_comments]),
                ' '.join(yvt.related_videos_text)
            ])
        else:
            st = '\t'.join([
                yvt.video_title,
                '%s' % yvt.video_description,
                ' '.join(['%s' % i for i in yvt.video_comments])
            ])
        return st

    def train_tm(self):
        logger.info('Setting vocabulary and stopwords')
        self.set_vocabulary()
        logger.info('Converting training json into mallet data')
        TopicTrainer.write_mallet_input_file(
            self.config_dict['pos_train_json'],
            self.config_dict['neg_train_json'],
            self.config_dict['mallet_import']['input'],
            self.vocab_set,
            include_related=self.config_dict['include_related'])
        TopicTrainer.mallet_import_and_train(self.config_dict)
        # write smaller pipe file
        logger.info('Creating vocab file')
        with open(self.config_dict['vocab_file'], 'w') as fo:
            for w in self.vocab_set:
                fo.write(w.encode('utf-8') + '\n')

    @staticmethod
    def mallet_import_and_train(config_dict):
        ''' generic method that takes a config dict as input and runs the LDA algorithm.
        The config dict must contain the keys 'mallet_import' and 'mallet_train'
        for the mallet specific import and training parameters
        '''
        import_config_file = TopicTrainer.create_mallet_config_files(
            config_dict['mallet_import'])
        logger.info('Running mallet import')
        run_cmd([
            TopicTrainer.MALLET_BIN, 'import-file', '--config',
            import_config_file
        ],
                timeout=None)
        train_config_file = TopicTrainer.create_mallet_config_files(
            config_dict['mallet_train'])
        logger.info('Running mallet training')
        run_cmd([
            TopicTrainer.MALLET_BIN, 'train-topics', '--config',
            train_config_file
        ],
                timeout=None)
        logger.info('Done training topic models')
        os.unlink(import_config_file)
        os.unlink(train_config_file)
        # write smaller pipe file
        logger.info('Creating pipe file')
        h, tmp_file = mkstemp()
        os.close(h)
        run_cmd([
            TopicTrainer.MALLET_BIN, 'import-file', '--input', tmp_file,
            '--use-pipe-from', config_dict['mallet_import']['output'],
            '--output', config_dict['pipe_file']
        ],
                timeout=None)
        os.unlink(tmp_file)

    @staticmethod
    def doc_topics_to_libsvm(doc_topics_file, output_file, n_pos):
        ''' Requires that the first n_pos lines are positive examples'''
        fo = open(output_file, "w")
        fi = open(doc_topics_file)
        #skip header
        fi.readline()
        for lnum, l in enumerate(fi):
            ll = l.split()
            a = zip(ll[2::2], ll[3::2])
            a.sort(key=lambda x: int(x[0]))
            b = [x[0] + ":" + x[1] for x in a]
            if lnum < n_pos:
                lbl = '1'
            else:
                lbl = '0'
            fo.write('%s %s\n' % (lbl, " ".join(b)))
        fi.close()
        fo.close()

    def train_classifier(self):
        libsvm_file = self.config_dict['classifier_params']['libsvm_file']
        TopicTrainer.doc_topics_to_libsvm(
            self.config_dict['mallet_train']['output-doc-topics'], libsvm_file,
            self.n_pos_train)
        num_topics = self.config_dict['mallet_train']['num-topics']
        x_train, y_train = load_svmlight_file(libsvm_file,
                                              num_topics,
                                              zero_based=True)
        bnb = BernoulliNB(
            binarize=self.config_dict['classifier_params']['bin_thresh'])
        logger.info('Training classifier')
        bnb.fit(x_train, y_train)
        pickle.dump(
            bnb, open(self.config_dict['classifier_params']['model_file'],
                      "wb"))
        logger.info('Done training classifier')

    def check_model(self):
        h, mallet_input_file = mkstemp()
        os.close(h)
        logger.info('Converting testing json into mallet data')
        # Hold out data should never include related text
        TopicTrainer.write_mallet_input_file(self.config_dict['pos_test_json'],
                                             self.config_dict['neg_test_json'],
                                             mallet_input_file, self.vocab_set)
        pipe_file = self.config_dict['pipe_file']
        output = self.config_dict['mallet_import']['output']
        logger.info('Running mallet import')
        run_cmd([
            TopicTrainer.MALLET_BIN, 'import-file', '--input',
            mallet_input_file, '--use-pipe-from', pipe_file, '--output', output
        ],
                timeout=None)

        h, output_doc_topics = mkstemp()
        os.close(h)
        infer_config_dict = {}
        infer_config_dict['output-doc-topics'] = output_doc_topics
        infer_config_dict['inferencer'] = self.config_dict['mallet_train'][
            'inferencer-filename']
        infer_config_dict['doc-topics-max'] = self.config_dict['mallet_train'][
            'doc-topics-max']
        infer_config_dict['input'] = output

        infer_config_file = TopicTrainer.create_mallet_config_files(
            infer_config_dict)
        logger.info('Running mallet inference')
        run_cmd([
            TopicTrainer.MALLET_BIN, 'infer-topics', '--config',
            infer_config_file
        ],
                timeout=None)

        # Check prediction accuracy
        h, libsvm_file = mkstemp()
        os.close(h)
        num_topics = self.config_dict['mallet_train']['num-topics']
        TopicTrainer.doc_topics_to_libsvm(output_doc_topics, libsvm_file,
                                          self.n_pos_test)
        x_test, _ = load_svmlight_file(libsvm_file,
                                       num_topics,
                                       zero_based=True)
        h, prediction_file = mkstemp()
        os.close(h)
        logger.info('Running classifier prediction')
        # manual matching if topic_thresholds provided
        if len(self.config_dict['topic_thresholds']):
            TopicTrainer.manual_prediction(
                x_test, self.config_dict['topic_thresholds'], prediction_file)
            self.write_model_stats(prediction_file,
                                   model_name='Manually matched')
        else:
            TopicTrainer.model_prediction(
                x_test, self.config_dict['classifier_params']['model_file'],
                prediction_file)
            self.write_model_stats(prediction_file)

        os.unlink(mallet_input_file)
        os.unlink(output_doc_topics)
        os.unlink(libsvm_file)
        os.unlink(prediction_file)

    @staticmethod
    def model_prediction(x_test, model_file, prediction_file):
        classifier = pickle.load(open(model_file, "rb"))
        y_pred = classifier.predict(x_test)
        with open(prediction_file, "w") as fo:
            for i in y_pred:
                fo.write('%s\n' % int(i))

    @staticmethod
    def manual_prediction(x_test, topic_thresholds, prediction_file):
        y_pred = []
        for xx in x_test:
            pred = 0
            for t, v in topic_thresholds:
                if xx[0, t] >= v:
                    pred = 1
                    break
            y_pred.append(pred)
        with open(prediction_file, "w") as fo:
            for i in y_pred:
                fo.write('%s\n' % int(i))

    def write_model_stats(self, prediction_file, model_name='Naive Bayes'):
        tp, tn, fp, fn = TopicTrainer.get_acc_numbers(prediction_file,
                                                      self.n_pos_test)
        precision = float(tp) / ((tp + fp) or 1)
        recall = float(tp) / ((tp + fn) or 1)
        with open(self.config_dict['model_stats'], 'w') as fo:
            fo.write('%s Model Stats\n' % model_name)
            fo.write('positive training docs = %d\n' % self.n_pos_train)
            fo.write('negative training docs = %d\n' % self.n_neg_train)
            fo.write('positive testing docs = %d\n' % self.n_pos_test)
            fo.write('negative testing docs = %d\n' % self.n_neg_test)
            fo.write('TPs, TNs, FPs, FNs = (%d, %d, %d, %d)\n' %
                     (tp, tn, fp, fn))
            fo.write('Precision = %f\n' % precision)
            fo.write('Recall = %f\n' % recall)

    @staticmethod
    def get_acc_numbers(prediction_file, n_pos):
        tp = tn = fp = fn = 0
        with open(prediction_file) as fi:
            for lnum, ll in enumerate(fi):
                p = int(ll.strip())
                if lnum < n_pos:
                    if p == 1:
                        tp += 1
                    else:
                        fn += 1
                else:
                    if p == 1:
                        fp += 1
                    else:
                        tn += 1
        return tp, tn, fp, fn
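
For reference, doc_topics_to_libsvm above parses MALLET's --output-doc-topics rows as alternating topic/proportion columns after the doc id and name; a small illustration with a made-up line in the sparse layout the parser expects:

line = '0 dummy.txt 5 0.60 1 0.25 3 0.15'  # made-up doc-topics row
ll = line.split()
pairs = sorted(zip(ll[2::2], ll[3::2]), key=lambda x: int(x[0]))
print ' '.join('%s:%s' % p for p in pairs)
# prints '1:0.25 3:0.15 5:0.60'; doc_topics_to_libsvm prefixes the 1/0 label.
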
Example #15
    def _run_extraction(self, infofile, extract_filename):
        cmd = FACE_EXTRACT_CMD % (config.bin_dir(), infofile.name,
                                  extract_filename)
        run_cmd(cmd)
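
run_cmd and config.bin_dir come from the affine project and are not shown in these snippets. A hypothetical stand-in for run_cmd that matches how the examples call it (a command given as a list or as a %-template plus a params dict, raising VideoProcessingError on failure); everything here is inferred from usage, not from the real helper:

import subprocess

class VideoProcessingError(Exception):
    # Stand-in for the project's exception of the same name.
    pass

def run_cmd(cmd, params=None, timeout=None):
    # Hypothetical sketch only: fill a '%(key)s' template when a params dict is
    # given, run the command, and raise on a non-zero exit status.
    # (The real helper's timeout handling is not reproduced here.)
    if params is not None:
        cmd = cmd % params
    use_shell = isinstance(cmd, basestring)
    proc = subprocess.Popen(cmd, shell=use_shell,
                            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    out, err = proc.communicate()
    if proc.returncode != 0:
        raise VideoProcessingError('command failed: %r\n%s%s' % (cmd, out, err))
    return out, err
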