Exemple #1
0
def classify_files():
    """
    Classify the rows of an uploaded spreadsheet with one of the models.

    The POST method allows API clients to use their model(s).
    The GET method allows API clients to get a list of available models
    (no GET handling is present in the code shown here).

    On POST, the uploaded ``file`` part must have a content type listed in
    ``ACCEPTED_MIMETYPES``.  The JSON request body must provide ``textcol``
    (forwarded to ``sh.get_spreadsheet_rows``) and ``model`` (the key used
    to look up the classifier in ``IDEANETS``).  Any error raised while
    handling the upload is logged and turned into a "failure" payload.

    :return: nothing in the code shown; the outcome is stored in the local
        ``data`` as ``{"status": ..., "data": ...}``.
    """

    if request.method == "POST":
        # Log that we got a request
        # NOTE(review): routine event logged at ERROR level -- confirm intent.
        app.logger.error("Got a file POST request")
        # Extract and validate credentials.
        app_id, app_key = check_credentials()

        try:
            submitted_file = request.files.get('file')
            ctype = submitted_file.content_type
            if ctype not in ACCEPTED_MIMETYPES:
                app.logger.error("Unsupported Media Type: %s", ctype)
                # NOTE: this exception is also caught by the handler below.
                raise UnsupportedMediaType

            data = json.loads(request.data)
            validate_input_file(submitted_file, ctype)
            # Look the key up directly: a request without 'textcol' now
            # fails with an explicit KeyError instead of a NameError on an
            # unbound local further down.
            textcol = data['textcol']
            header, rows = sh.get_spreadsheet_rows(submitted_file,
                                                   textcol,
                                                   dedupe=True)

            model = data['model']
            ideanet = IDEANETS[model]
            predictions = ideanet.classify(rows)
            data = {"status": "success", "data": predictions}

        except Exception:
            # Catch Exception rather than everything so SystemExit and
            # KeyboardInterrupt still propagate, and record the traceback
            # so failures are diagnosable.
            app.logger.exception("File classification failed")
            data = {"status": "failure", "data": {}}
Exemple #2
0
def classify_files():
    """
    Classify the rows of an uploaded spreadsheet with one of the models.

    The POST method allows API clients to use their model(s).
    The GET method allows API clients to get a list of available models
    (no GET handling is present in the code shown here).

    On POST, the uploaded ``file`` part must have a content type listed in
    ``ACCEPTED_MIMETYPES``.  The JSON request body must provide ``textcol``
    (forwarded to ``sh.get_spreadsheet_rows``) and ``model`` (the key used
    to look up the classifier in ``IDEANETS``).  Any error raised while
    handling the upload is logged and turned into a "failure" payload.

    :return: nothing in the code shown; the outcome is stored in the local
        ``data`` as ``{"status": ..., "data": ...}``.
    """

    if request.method == "POST":
        # Log that we got a request
        # NOTE(review): routine event logged at ERROR level -- confirm intent.
        app.logger.error("Got a file POST request")
        # Extract and validate credentials.
        app_id, app_key = check_credentials()

        try:
            submitted_file = request.files.get('file')
            ctype = submitted_file.content_type
            if ctype not in ACCEPTED_MIMETYPES:
                app.logger.error("Unsupported Media Type: %s", ctype)
                # NOTE: this exception is also caught by the handler below.
                raise UnsupportedMediaType

            data = json.loads(request.data)
            validate_input_file(submitted_file, ctype)
            # Look the key up directly: a request without 'textcol' now
            # fails with an explicit KeyError instead of a NameError on an
            # unbound local further down.
            textcol = data['textcol']
            header, rows = sh.get_spreadsheet_rows(submitted_file, textcol, dedupe=True)

            model = data['model']
            ideanet = IDEANETS[model]
            predictions = ideanet.classify(rows)
            data = {"status": "success",
                    "data": predictions}

        except Exception:
            # Catch Exception rather than everything so SystemExit and
            # KeyboardInterrupt still propagate, and record the traceback
            # so failures are diagnosable.
            app.logger.exception("File classification failed")
            data = {"status": "failure",
                    "data": {}}
    def preprocess(self):
        """
        Build the training, validation and test sets from the input rows.

        Rows are taken from ``self._raw_rows`` when it is set; otherwise
        they are read from ``self._data_directory`` / ``self._data_file``
        with ``sh.get_spreadsheet_rows``.  The text and label columns are
        stringified, a dictionary is built from the sentences, the rows are
        split per class into training and test data, and a validation set
        is split off the training data.

        :return: ``self``, with ``train_set``, ``valid_set`` and
            ``test_set`` assigned as two-element tuples.
        """

        # For Synapsify Core output, the comments are in the first column
        #   and the sentiment is in the 6th column
        # Identity test: ``== None`` is evaluated element-wise by array-like
        # row containers and then cannot be used as a plain condition.
        if self._raw_rows is None:
            data_path = os.path.join(self._data_directory, self._data_file)
            _, rows = sh.get_spreadsheet_rows(data_path,
                                              self._text_col,
                                              dedupe=self._dedupe)
        else:
            rows = self._raw_rows

        sentences = [str(row[self._text_col]) for row in rows]
        classes = [str(row[self._label_col]) for row in rows]
        self._DICTIONARY = self._build_dict(sentences)

        if self._class_type == "Sentiment":
            # Grab the indices for the Core sentiment
            self._train_xx = self._get_rand_sentiment_indices(
                classes, self._train_size, [])
            self._test_xx = self._get_rand_sentiment_indices(
                classes, self._test_size, self._train_xx)
            self._trXX = self._get_sentiment_indices(
                [rows[r] for r in self._train_xx], self._label_col, [])
            self._teXX = self._get_sentiment_indices(
                [rows[r] for r in self._test_xx], self._label_col,
                self._train_xx)

            # Munge training and test sets for the classes provided
            train = self._munge_class_freqs(
                sentences, [self._trXX['neg'], self._trXX['pos']])
            test = self._munge_class_freqs(
                sentences, [self._teXX['neg'], self._teXX['pos']])
        else:
            # Presumably populates self._trXX / self._teXX, which are read
            # just below -- verify against _get_rand_indices.
            self._get_rand_indices(classes)

            # Munge training and test sets for the classes provided,
            # ordering the classes by their integer label.
            train_labels = np.unique(
                [int(key) for key in self._trXX.keys()]).tolist()
            train_classes = [self._trXX[str(label)] for label in train_labels]
            test_labels = np.unique(
                [int(key) for key in self._teXX.keys()]).tolist()
            test_classes = [self._teXX[str(label)] for label in test_labels]
            train = self._munge_class_freqs(sentences, train_classes)
            test = self._munge_class_freqs(sentences, test_classes)

        # Split training into a validation set per the model parameter
        valid_set_x, valid_set_y, train_set_x, train_set_y = \
            self._split_train_w_valid_set(train)

        # Remove unknown words
        train_set_x = self._remove_unk(train_set_x)
        valid_set_x = self._remove_unk(valid_set_x)
        test_set_x = self._remove_unk(test[0])

        self.train_set = (train_set_x, train_set_y)
        self.valid_set = (valid_set_x, valid_set_y)
        self.test_set = (test_set_x, test[1])

        return self

# if __name__ == '__main__':
#     directory = sys.argv[1]
#     filename  = sys.argv[2]
#     textcol = 0
#     if len(sys.argv)>2: textcol = int(sys.argv[3])
#     sentcol = 5
#     if len(sys.argv)>3: sentcol = int(sys.argv[4])
#
#     ### I added the following lines of code here
#     train_size = int(sys.argv[5])
#     test_size = int(sys.argv[6])
#     ### The main function should take six parameters instead of four.
#     ### The original call was:
#     ### main(directory, filename, textcol, sentcol)
#     load(directory, filename, textcol, sentcol, train_size, test_size)