def store_transformation(self, input_file, output_file, dict_file,
                             tfidf_file):
        """
        Apply the transformation model on the given hash documents.
        Store transformed 'features' in file.

        Every document streamed from 'input_file' that has both a 'content'
        and an 'id' key gets a 'features' entry computed by
        self.transform_doc and is pushed to 'output_file'. Documents missing
        either key are skipped.

        The TF-IDF model is loaded from 'tfidf_file' unless this model is
        named 'LDA'; in that case None is handed to self.transform_doc.

        Raises CaughtException when opening the output or transforming /
        storing a document fails.
        """

        self.check_model()

        utils.check_file_readable(dict_file)
        cdictionary = dictionary.load(dict_file)

        tfidf_model = None
        if self.name != 'LDA':
            utils.check_file_readable(tfidf_file)
            tfidf_model = self.TRANSFORMERS['TFIDF'].load(tfidf_file)

        sc = StreamCorpus(input_file)

        # Bind the name before entering the try block: if PushCorpus() itself
        # raises, the 'finally' clause must not fail on an unassigned local
        # (which would replace the CaughtException with UnboundLocalError).
        pc = None

        try:
            pc = PushCorpus(output_file)

            for doc in sc:
                if 'content' in doc and 'id' in doc:
                    doc['features'] = self.transform_doc(
                        cdictionary, tfidf_model, doc['content'], doc['id'])
                    pc.add(doc)
        except Exception as e:
            # keep the original error attached as the explicit cause
            raise CaughtException(
                "Exception encountered when storing transformed documents: {}".
                format(e)) from e
        else:
            self.logger.info("Stored {} documents to file".format(pc.size))
        finally:
            # only close the output stream if it was actually opened
            if pc is not None:
                pc.close_stream()
Ejemplo n.º 2
0
    def preprocessed_file(self, category, subset, ptype):
        """
        Look up the preprocessed data file registered in self.paths
        for the given category, preprocessing type and subset,
        run the readability check on it and return it
        """

        preprocessed = self.paths['data'][category]['preprocessed']
        path = preprocessed[ptype][subset]

        utils.check_file_readable(path)

        return path
Ejemplo n.º 3
0
    def __init__(self, input_file):
        """Validate the input filename and remember it"""

        super().__init__()

        # anything but a string is a configuration mistake
        if not isinstance(input_file, str):
            message = "Given parameter {} is not String".format(input_file)
            raise ConfigError(message)

        utils.check_file_readable(input_file)

        self.filename = input_file
    def __init__(self, data_files, categories):
        """Load classified data files and list of categories

        'data_files' and 'categories' may be any iterables; both are copied
        into lists. Each data file goes through
        utils.check_file_readable before anything is stored.
        """

        super().__init__()

        # Materialize first: a one-shot iterable (e.g. a generator) would be
        # exhausted by the check loop below and then be stored as an
        # empty list.
        files = list(data_files)

        for filename in files:
            utils.check_file_readable(filename)

        self.stats = {}
        self.data_files = files
        self.categories = list(categories)
Ejemplo n.º 5
0
    def __init__(self, data_files, categories):
        """Load classified data files and list of categories

        'data_files' and 'categories' may be any iterables; both are copied
        into lists. Each data file goes through
        utils.check_file_readable, then the list of files is logged.
        """

        super().__init__()

        # Materialize first: a one-shot iterable (e.g. a generator) would be
        # exhausted by the check loop below, logged as an opaque object and
        # then stored as an empty list.
        files = list(data_files)

        for filename in files:
            utils.check_file_readable(filename)

        self.logger.info("Evaluate predictions on {} files".format(files))

        self.data_files = files
        self.categories = list(categories)
Ejemplo n.º 6
0
def read_dictionary(dict_file):
    """Build the word_id -> word mapping from a text dictionary file.

    Only lines made of exactly three whitespace-separated tokens are used:
    the first token becomes the integer key, the second one the word.
    All other lines are ignored.
    """

    utils.check_file_readable(dict_file)

    with open(dict_file, 'r') as handle:
        rows = (line.split() for line in handle)
        return {int(row[0]): str(row[1]) for row in rows if len(row) == 3}
Ejemplo n.º 7
0
    def transformed_files(self, ttype, ptype, subset):
        """
        Collect, for every category in self.classes, the transformed
        'subset' data file; exit when no file was collected
        """

        def checked_file(category):
            # resolve the file of one category and run the readability check
            path = self.transformed_file(category, subset, ttype, ptype)
            utils.check_file_readable(path)
            return path

        files = [checked_file(category) for category in self.classes]

        if len(files) == 0:
            exit("Empty list of transformed files")

        return files
Ejemplo n.º 8
0
    def __init__(self, model_file):
        """Load classifier model from binary file

        The file is unpickled into self.model; self.name is set to the class
        name of the loaded object and self.categories to its 'classes_'
        attribute.

        Raises CaughtException (chained to the original error) when
        unpickling fails.
        """

        super().__init__()

        utils.check_file_readable(model_file)

        self.model = None
        with open(model_file, 'rb') as icstream:
            try:
                # SECURITY: pickle.load can run arbitrary code while
                # deserializing; only use model files from a trusted source.
                self.model = pickle.load(icstream)
            except Exception as e:
                # keep the original error attached as the explicit cause
                raise CaughtException(
                    "Exception encountered when loading the classifier: {}".
                    format(e)) from e

        self.name = type(self.model).__name__
        self.categories = self.model.classes_

        self.logger.info("Loaded already-trained {} classifier model "
                         "from '{}' file".format(self.name, model_file))
Ejemplo n.º 9
0
    def __init__(self, input_files):
        """Set up one document generator per input filename"""

        super().__init__()

        # only a real list of filenames is accepted
        if not isinstance(input_files, list):
            raise ConfigError('Given parameter is not a List')

        for path in input_files:
            utils.check_file_readable(path)

        # per-file counts, as reported by count_file_lines
        self.ndocs = list(map(count_file_lines, input_files))
        self.logger.info("Available data for training: {}".format(self.ndocs))

        # one loop_doc generator per input file, in the same order
        self.generators = list(map(loop_doc, input_files))

        # reading position across the files starts at zero
        self.stop_index = 0
    def setUp(self):
        """Load the example dictionary and models, prepare a sample text"""

        here = os.path.dirname(__file__)

        # the example binaries sit next to this test module
        dict_file, tfidf_file, lsi_file = [
            os.path.join(here, name)
            for name in ('example_dictionary.bin',
                         'example_model_tfidf.bin',
                         'example_model_lsi.bin')
        ]

        for path in (dict_file, tfidf_file, lsi_file):
            utils.check_file_readable(path)

        self.dictionary = dictionary.load(dict_file)
        self.tfidf_model = LoadTransformer('TFIDF', tfidf_file).model
        self.transformer = LoadTransformer('LSI', lsi_file)

        # sample document text used by the tests
        self.doc = (
            'le paris saint germain a tenté de faire venir fernando  '
            "torres au tout début de l' été selon des informations révélées "
            "par marca ce dimanche alors en fin de contrat avec l' atlético "
            "de madrid l' attaquant espagnol a dîné en compagnie de ses "
            "agents et d' olivier létang le directeur sportif adjoint "
            "parisien dans un restaurant madrilène à quelques jours de la "
            "finale de la ligue des champions perdue face au real madrid t "
            "a b à laurent blanc alors l' entraîneur du psg avait validé "
            "son profil et comptait bien lui accorder un temps de jeu "
            "conséquent sauf qu' entre temps torres a prolongé son contrat "
            'avec son club de cœur et que blanc a été viré '
            "de son poste d' entraîneur"
        )
    def __init__(self, model_name, model_file):
        """Load the named transformation model from its file"""

        super().__init__()

        # model names are matched case-insensitively against TRANSFORMERS
        name = model_name.upper()
        if name not in self.TRANSFORMERS:
            raise ConfigError("Unknown model name '{}'. Choose from {}".format(
                model_name, self.TRANSFORMERS.keys()))

        utils.check_file_readable(model_file)

        self.name = name
        self.model = self.TRANSFORMERS[name].load(model_file)

        # zero topics unless the loaded model exposes 'num_topics'
        self.ntopics = getattr(self.model, 'num_topics', 0)

        # for LSI the size of the first projection row takes precedence
        if self.name == "LSI":
            projected = self.model.projection.u[0].size
            if self.ntopics != projected:
                self.ntopics = projected

        self.logger.info("Loaded {} transformation model".format(self.name))
Ejemplo n.º 12
0
    def store_prediction(self, input_file, output_file):
        """
        Test the classifier on 'untagged' documents.
        Store prediction category and prediction probability in file.

        Returns immediately (None) when self.prediction_checkups() is falsy.
        Otherwise every document streamed from 'input_file' that has a
        'features' key is classified. When the prediction is a dict holding
        both 'category' and 'probas', these are stored on the document as
        'season' and 'season_prob' and the document is pushed to
        'output_file'. Other documents are skipped.

        Raises CaughtException when opening the output or classifying /
        storing a document fails.
        """

        if not self.prediction_checkups():
            return

        utils.check_file_readable(input_file)
        utils.create_path(output_file)

        sc = StreamCorpus(input_file)

        # Bind the name before entering the try block: if PushCorpus() itself
        # raises, the 'finally' clause must not fail on an unassigned local
        # (which would replace the CaughtException with UnboundLocalError).
        pc = None

        try:
            pc = PushCorpus(output_file)

            for doc in sc:
                if 'features' in doc:
                    prediction = self.classify_doc(doc['features'])

                    if isinstance(prediction, dict) and \
                        'category' in prediction and \
                        'probas' in prediction:

                        doc['season'] = prediction['category']
                        doc['season_prob'] = prediction['probas']
                        pc.add(doc)
        except Exception as e:
            # keep the original error attached as the explicit cause
            raise CaughtException(
                "Exception encountered when storing classified documents: {}".
                format(e)) from e
        else:
            self.logger.info("Stored {} documents to file".format(pc.size))
        finally:
            # only close the output stream if it was actually opened
            if pc is not None:
                pc.close_stream()
Ejemplo n.º 13
0
def load(input_file):
    """Open 'input_file' as a gensim MmCorpus after the readability check"""

    utils.check_file_readable(input_file)

    corpus = MmCorpus(input_file)
    return corpus
Ejemplo n.º 14
0
    def __init__(self, input_files):
        """Keep a list copy of the filenames and check each of them"""
        self.files = [*input_files]

        for path in self.files:
            utils.check_file_readable(path)
Ejemplo n.º 15
0
def load(input_file=None):
    """Load a Dictionary from 'input_file' after the readability check"""
    utils.check_file_readable(input_file)

    loaded = Dictionary.load(input_file)
    return loaded
    def setUp(self):
        """Initialize test models

        Loads two classifiers from the 'example_lr.bin' and
        'example_mlp.bin' files located next to this module and stores a
        hard-coded list of floats in self.feat.
        """

        # both example model files are resolved relative to this module
        lr_file = os.path.join(os.path.dirname(__file__), 'example_lr.bin')
        mlp_file = os.path.join(os.path.dirname(__file__), 'example_mlp.bin')

        utils.check_file_readable(lr_file)
        utils.check_file_readable(mlp_file)

        self.lr_classifier = LoadClassifier(lr_file)
        self.mlp_classifier = LoadClassifier(mlp_file)

        # Fixed literal fixture; presumably the feature vector of one
        # document to feed to the classifiers above -- confirm against the
        # test methods that use it.
        self.feat = [
            0.161531769201, 0.0140297826703, 0.0255560597156, -0.0378792749395,
            -0.0534373074091, -0.00442695702176, -0.0145726661416,
            -0.0135339214573, 0.00370849097259, -0.023365562023,
            -0.0204149539189, 0.00740013314176, 0.0011409333045,
            -0.0359891795518, -0.10528163057, -0.020820165763, 0.0260065298225,
            -0.0285843292604, -0.0242631624939, -0.0667152697512,
            -0.129731128191, -0.00250289160245, -0.0321873575339,
            0.0156709096722, 0.00522902539173, 0.0225510685233,
            0.0377300689389, -0.0623191020915, -0.00252699484615,
            -0.00546746110117, 0.00921293089226, -0.0197730708768,
            0.119248050222, -0.0337028539119, 0.0640832361864,
            0.00495561427469, 0.0632896271369, -0.0167445284842,
            0.0459508845981, 0.0146199750494, -0.00885269024362,
            -0.00106873626308, -0.00847909888575, -0.00814447199232,
            0.0431372137187, 0.0239704280962, 0.0132186271151, 0.0394858228613,
            -0.0376448610484, -0.0140965431805, -0.0231934611707,
            -0.0277701152849, -0.0198615694835, -0.0171543749744,
            -0.0450425705443, -0.0208902837069, 0.0129278180077,
            -0.00853653435838, 0.00636921362624, -0.0371416961916,
            0.00597606734547, 0.0192830041109, -0.0198882191064,
            0.0314912783499, 0.0297489053825, -0.00166879609153,
            -0.0274309679029, 0.0177512106212, -0.0143893391685,
            0.0217575502022, -0.00612024516371, -0.0192703073851,
            -0.0352074591222, 0.0402669671933, -0.0286486046708,
            0.00962997395818, -0.00603262005354, 0.0138339701008,
            0.00149504621782, -0.0191123277592, 0.045561398082,
            -0.0186188782724, 0.000849809608471, -0.00564146594277,
            0.00604086228272, -0.00632908533126, 0.0253858519915,
            0.0217586229684, -0.00109366749121, 0.0201549503405,
            -0.02454644563, 0.0237495595514, -0.000532335628175,
            -0.0162696114632, -0.0134469771657, -0.0253166383449,
            0.034601411305, 0.0498026591865, 0.0147863382594, 0.00386504623682,
            0.0390216596728, -0.00752600517262, -0.0056334421425,
            -0.00703149668339, 0.00373049639187, -0.0170113080794,
            0.031063424262, 0.0070188894023, 0.0343821575866, -0.0296663417,
            -0.0312533247171, 0.00228560238568, -0.0200869188374,
            -0.0465861845003, 0.0074148532855, 0.00778328908485,
            -0.00239365097105, 0.0211159963864, -0.0101738550143,
            0.0221153313227, 0.0120745140318, 0.0424117786748, 0.0165123207562,
            0.0253632881829, -0.00550136829826, -0.0046292213177,
            -0.00315092809438, 0.0301160712503, 0.0286312131224,
            -0.00819090713118, 0.0315111166998, -0.0521589858068,
            -0.0185275914154, -0.0358262298568, -0.00163053741214,
            -0.0287416500401, 0.0688113221837, -0.0023933144002,
            -0.00661116226021, -0.0542106744341, -0.042784808944,
            0.0173215712105, -0.0392196811015, 0.0207339212476,
            -0.045454334704, 0.0202154719455, 0.0148105214512,
            -0.0230338012693, -0.00534712562521, 0.00555274339065,
            0.0615223702941, -0.0200281754886, -0.00964783123172,
            -0.0201680715981, 0.0208189487588, 0.0628993663653,
            0.0416043175912, 0.0156599702638, -0.0322385656012,
            -0.00482255604814, 0.00254091812215, 0.0152901086879,
            0.0125741794363, 0.0232558694026, 0.00977020134127,
            -0.0398008707529, -0.0268964956381, -0.0527130463474,
            -0.0045785862603, -8.81968703034e-05, 0.0017808194297,
            0.0146972449831, -0.0137502902062, 0.0158736394279,
            -0.0144204195937, -0.0440793011077, 0.0375157611199,
            -0.0096092059652, 0.0218906905165, 0.0113600669209,
            -0.00797593406881, -0.0214742360207, -0.0616331922445,
            0.0487977233648, 0.0330232714136, -0.00752373395462,
            -0.0259014603795, 0.0103778013875, 0.0239906506747,
            0.0274886993092, 0.0506120174333, 0.00933604771703,
            0.0128722536389, -0.0386741243019, -0.0118985580667,
            0.0011901461201, -0.0113642014771, 0.0315076351745,
            -0.0281116305346, 0.00891916810051, -0.0147883250651,
            0.00791639698605, 0.00577264543299, 0.0299472318902,
            -0.0123017107691, 0.032010269977, 0.0192380685421, 0.0164582142479,
            -0.0396823525632, 0.0113088908103, 0.000822630160239,
            -0.0116380734661, 0.00471031440102, 0.00026584501753,
            0.0303592525458, -0.00817147089427, -0.0135122516922,
            -0.011605032986, 0.000867605642953, 0.00356259474523,
            -0.0235009916937, 0.0134360592212, 0.0341675841633,
            0.0037336383048, -0.0208792924897, 0.0075391599431,
            0.0224859524829, 0.0249201303198, -0.0146172814441,
            0.00406734321071, 0.0284210897642, -0.00847374310539,
            -0.0114720316915, 0.00500263694258, 0.01710240779, 0.0224897147971,
            -0.0165403397364, 0.0151036858467, 0.000461956295288,
            -0.0347643848776, -0.032390318598, 0.00450104543445,
            0.0306297994042, 0.0232547549099, 0.0366407833988,
            -0.0198361948654, 0.00208582107582, -0.00264045939376,
            -0.0184410263176, -0.00599621175053, 0.00565406205562,
            0.00757286425264, 0.00812967830282, -0.0256624787586,
            -0.0125366291289, -0.0216738674731, -0.00673287071219,
            0.00241703352636, -0.0251406132051, 0.00625283173327,
            -0.000327143380255, -0.00424268394253, 0.0432970311364,
            -0.00291893736096, -0.00374888213984, -0.0305832174626,
            0.018082182327, -0.0180131334066, -0.00272588309469,
            0.0281102383074, 0.0327907474723, 0.0484505299177,
            -0.0395410396888, -0.000570024697794, 0.0294096917174,
            -0.00845491482633, -0.00239607065153, -0.00302720417788,
            -0.0317094611294, -0.0197224314562, 0.0282222047311,
            0.0166029197893, -0.0213904366637, 0.0349518921965,
            0.0189417489144, -0.0135698765814, 0.0269692942335,
            -0.00598382574847, -5.3245989314e-05, 0.017780171752,
            -0.0215918489997, 0.00190475192889, 0.0269761121872,
            0.055998928314, -0.0320044967321, -0.015664074917, 0.0138504464738,
            0.0120902940124, 0.0156187886592, -0.00421459494215
        ]