def store_transformation(self, input_file, output_file, dict_file, tfidf_file):
    """
    Apply the transformation model on the given hash documents.
    Store transformed 'features' in file.

    The dictionary is loaded from ``dict_file``. Unless this model's name is
    'LDA', a TFIDF model is also loaded from ``tfidf_file``; for 'LDA' the
    TFIDF model passed to ``transform_doc`` is None.

    Raises:
        CaughtException: wraps any exception raised while opening the output
            corpus or while transforming/storing documents.
    """
    self.check_model()
    utils.check_file_readable(dict_file)
    cdictionary = dictionary.load(dict_file)

    tfidf_model = None
    if self.name != 'LDA':
        utils.check_file_readable(tfidf_file)
        tfidf_model = self.TRANSFORMERS['TFIDF'].load(tfidf_file)

    sc = StreamCorpus(input_file)
    # Bind the name before entering the try block: if PushCorpus() itself
    # fails, the finally clause must not trip over an unbound variable and
    # hide the real error.
    pc = None
    try:
        pc = PushCorpus(output_file)
        for doc in sc:
            # NOTE(review): documents lacking 'content' or 'id' are skipped
            # rather than copied through unchanged - confirm this is intended.
            if 'content' in doc and 'id' in doc:
                doc['features'] = self.transform_doc(
                    cdictionary, tfidf_model, doc['content'], doc['id'])
                pc.add(doc)
    except Exception as e:
        # Keep the original exception attached as the explicit cause.
        raise CaughtException(
            "Exception encountered when storing transformed documents: {}".
            format(e)) from e
    else:
        self.logger.info("Stored {} documents to file".format(pc.size))
    finally:
        if pc is not None:
            pc.close_stream()
def preprocessed_file(self, category, subset, ptype):
    """
    Look up the preprocessed data file for the given category, subset and
    preprocessing type, check that it is readable and return its path.
    """
    per_category = self.paths['data'][category]
    path = per_category['preprocessed'][ptype][subset]
    utils.check_file_readable(path)
    return path
def __init__(self, input_file):
    """Store the input filename after validating it.

    Raises ConfigError when ``input_file`` is not a string; otherwise the
    file is checked for readability and kept in ``self.filename``.
    """
    super().__init__()
    if isinstance(input_file, str):
        utils.check_file_readable(input_file)
        self.filename = input_file
    else:
        raise ConfigError(
            "Given parameter {} is not String".format(input_file))
def __init__(self, data_files, categories):
    """Load classified data files and list of categories.

    Args:
        data_files: iterable of file paths; each one is checked for
            readability. Any iterable is accepted, including one-shot
            iterators.
        categories: iterable of categories, stored as a list.
    """
    super().__init__()
    # Materialise first: validating a generator directly would exhaust it
    # and leave self.data_files empty.
    files = list(data_files)
    for filename in files:
        utils.check_file_readable(filename)
    self.stats = {}
    self.data_files = files
    self.categories = list(categories)
def __init__(self, data_files, categories):
    """Load classified data files and list of categories.

    Args:
        data_files: iterable of file paths; each one is checked for
            readability. Any iterable is accepted, including one-shot
            iterators.
        categories: iterable of categories, stored as a list.
    """
    super().__init__()
    # Materialise first: validating a generator directly would exhaust it
    # and leave self.data_files empty.
    files = list(data_files)
    for filename in files:
        utils.check_file_readable(filename)
    # The log message deliberately formats the argument as it was passed in.
    self.logger.info("Evaluate predictions on {} files".format(data_files))
    self.data_files = files
    self.categories = list(categories)
def read_dictionary(dict_file):
    """Read the word_id -> word dictionary from a text file.

    Only lines made of exactly three whitespace-separated tokens are used:
    the first token is parsed as the integer word id, the second is the
    word, and the third is ignored. Every other line is skipped.

    Returns:
        dict mapping int word id to word string.

    Raises:
        ValueError: if the first token of a three-token line is not an
            integer.
    """
    utils.check_file_readable(dict_file)
    word_dictionary = {}
    # Decode explicitly instead of relying on the platform's locale default,
    # so non-ASCII words are read identically everywhere.
    # NOTE(review): assumes the file was written as UTF-8 - confirm against
    # whatever produces it.
    with open(dict_file, 'r', encoding='utf-8') as stream:
        for line in stream:
            tokens = line.split()
            if len(tokens) == 3:
                word_dictionary[int(tokens[0])] = tokens[1]
    return word_dictionary
def transformed_files(self, ttype, ptype, subset):
    """
    Return the list of transformed 'subset' data files for each category.

    One file is resolved per entry of ``self.classes`` through
    ``self.transformed_file`` and checked for readability.

    Raises:
        SystemExit: with the message "Empty list of transformed files" when
            no file was collected.
    """
    # Local import keeps this block self-contained. sys.exit replaces the
    # `exit` helper injected by the `site` module, which is intended for the
    # interactive shell and is not guaranteed to exist in every runtime.
    import sys

    files = []
    for category in self.classes:
        filename = self.transformed_file(category, subset, ttype, ptype)
        utils.check_file_readable(filename)
        files.append(filename)
    if not files:
        sys.exit("Empty list of transformed files")
    return files
def __init__(self, model_file):
    """Load classifier model from binary file.

    The file is unpickled into ``self.model``; ``self.name`` is set to the
    model's class name and ``self.categories`` to its ``classes_`` attribute.

    Raises:
        CaughtException: wraps any exception raised while unpickling.
    """
    super().__init__()
    utils.check_file_readable(model_file)
    self.model = None
    with open(model_file, 'rb') as icstream:
        try:
            # SECURITY: unpickling can execute arbitrary code. Only load
            # model files that come from a trusted source.
            self.model = pickle.load(icstream)
        except Exception as e:
            # Keep the original exception attached as the explicit cause.
            raise CaughtException(
                "Exception encountered when loading the classifier: {}".
                format(e)) from e
    self.name = type(self.model).__name__
    self.categories = self.model.classes_
    self.logger.info("Loaded already-trained {} classifier model "
                     "from '{}' file".format(self.name, model_file))
def __init__(self, input_files):
    """Set up one document generator per input file.

    ``input_files`` must be a list (ConfigError otherwise); every file is
    checked for readability before anything else is prepared.
    """
    super().__init__()
    if not isinstance(input_files, list):
        raise ConfigError('Given parameter is not a List')
    for path in input_files:
        utils.check_file_readable(path)

    # number of documents available in each file
    self.ndocs = list(map(count_file_lines, input_files))
    self.logger.info("Available data for training: {}".format(self.ndocs))

    # one generator per input file, each yielding one document at a time
    self.generators = list(map(loop_doc, input_files))

    # position at which reading from the files stopped
    self.stop_index = 0
def setUp(self):
    """Load the example dictionary, TFIDF model and LSI transformer, and
    prepare a sample document for the tests."""
    here = os.path.dirname(__file__)
    dict_file, tfidf_file, lsi_file = (
        os.path.join(here, name)
        for name in ('example_dictionary.bin',
                     'example_model_tfidf.bin',
                     'example_model_lsi.bin'))
    for path in (dict_file, tfidf_file, lsi_file):
        utils.check_file_readable(path)

    self.dictionary = dictionary.load(dict_file)
    self.tfidf_model = LoadTransformer('TFIDF', tfidf_file).model
    self.transformer = LoadTransformer('LSI', lsi_file)
    # Adjacent literals are concatenated by the parser into a single string.
    self.doc = (
        'le paris saint germain a tenté de faire venir fernando '
        "torres au tout début de l' été selon des informations révélées "
        "par marca ce dimanche alors en fin de contrat avec l' atlético "
        "de madrid l' attaquant espagnol a dîné en compagnie de ses "
        "agents et d' olivier létang le directeur sportif adjoint "
        "parisien dans un restaurant madrilène à quelques jours de la "
        "finale de la ligue des champions perdue face au real madrid t "
        "a b à laurent blanc alors l' entraîneur du psg avait validé "
        "son profil et comptait bien lui accorder un temps de jeu "
        "conséquent sauf qu' entre temps torres a prolongé son contrat "
        'avec son club de cœur et que blanc a été viré '
        "de son poste d' entraîneur"
    )
def __init__(self, model_name, model_file):
    """Load the named transformation model from ``model_file``.

    The upper-cased ``model_name`` must be a key of ``self.TRANSFORMERS``
    (ConfigError otherwise). ``self.ntopics`` is taken from the model's
    ``num_topics`` when present, else 0; for "LSI" it is then aligned with
    the size of the first row of the model's projection matrix ``u``.
    """
    super().__init__()
    key = model_name.upper()
    if key not in self.TRANSFORMERS:
        raise ConfigError("Unknown model name '{}'. Choose from {}".format(
            model_name, self.TRANSFORMERS.keys()))
    utils.check_file_readable(model_file)

    self.name = key
    self.model = self.TRANSFORMERS[key].load(model_file)
    self.ntopics = getattr(self.model, 'num_topics', 0)
    if self.name == "LSI":
        projected = self.model.projection.u[0].size
        if self.ntopics != projected:
            self.ntopics = projected
    self.logger.info("Loaded {} transformation model".format(self.name))
def store_prediction(self, input_file, output_file):
    """
    Test the classifier on 'untagged' documents.
    Store prediction category and prediction probability in file.

    Returns None without doing anything when ``self.prediction_checkups()``
    is falsy. Otherwise each streamed document carrying 'features' is
    classified; a well-formed prediction is written into the document as
    'season' (category) and 'season_prob' (probabilities).

    Raises:
        CaughtException: wraps any exception raised while opening the output
            corpus or while classifying/storing documents.
    """
    if not self.prediction_checkups():
        return
    utils.check_file_readable(input_file)
    utils.create_path(output_file)

    sc = StreamCorpus(input_file)
    # Bind the name before entering the try block: if PushCorpus() itself
    # fails, the finally clause must not trip over an unbound variable and
    # hide the real error.
    pc = None
    try:
        pc = PushCorpus(output_file)
        for doc in sc:
            if 'features' in doc:
                prediction = self.classify_doc(doc['features'])
                # NOTE(review): only documents with a well-formed prediction
                # are written out - confirm this is intended.
                if isinstance(prediction, dict) and \
                        'category' in prediction and \
                        'probas' in prediction:
                    doc['season'] = prediction['category']
                    doc['season_prob'] = prediction['probas']
                    pc.add(doc)
    except Exception as e:
        # Keep the original exception attached as the explicit cause.
        raise CaughtException(
            "Exception encountered when storing classified documents: {}".
            format(e)) from e
    else:
        self.logger.info("Stored {} documents to file".format(pc.size))
    finally:
        if pc is not None:
            pc.close_stream()
def load(input_file):
    """Check that ``input_file`` is readable and open it as a gensim
    MmCorpus."""
    utils.check_file_readable(input_file)
    corpus = MmCorpus(input_file)
    return corpus
def __init__(self, input_files):
    """Keep a list copy of the input filenames in ``self.files`` and check
    that each one is readable."""
    paths = list(input_files)
    self.files = paths
    for path in paths:
        utils.check_file_readable(path)
def load(input_file=None):
    """Check that ``input_file`` is readable and load a Dictionary from it."""
    utils.check_file_readable(input_file)
    loaded = Dictionary.load(input_file)
    return loaded
def setUp(self):
    """Load the example LR and MLP classifiers and prepare a sample feature
    vector for the tests."""
    fixtures = os.path.dirname(__file__)
    lr_file = os.path.join(fixtures, 'example_lr.bin')
    mlp_file = os.path.join(fixtures, 'example_mlp.bin')
    for path in (lr_file, mlp_file):
        utils.check_file_readable(path)

    self.lr_classifier = LoadClassifier(lr_file)
    self.mlp_classifier = LoadClassifier(mlp_file)
    self.feat = [
        0.161531769201, 0.0140297826703, 0.0255560597156, -0.0378792749395,
        -0.0534373074091, -0.00442695702176, -0.0145726661416, -0.0135339214573,
        0.00370849097259, -0.023365562023, -0.0204149539189, 0.00740013314176,
        0.0011409333045, -0.0359891795518, -0.10528163057, -0.020820165763,
        0.0260065298225, -0.0285843292604, -0.0242631624939, -0.0667152697512,
        -0.129731128191, -0.00250289160245, -0.0321873575339, 0.0156709096722,
        0.00522902539173, 0.0225510685233, 0.0377300689389, -0.0623191020915,
        -0.00252699484615, -0.00546746110117, 0.00921293089226, -0.0197730708768,
        0.119248050222, -0.0337028539119, 0.0640832361864, 0.00495561427469,
        0.0632896271369, -0.0167445284842, 0.0459508845981, 0.0146199750494,
        -0.00885269024362, -0.00106873626308, -0.00847909888575, -0.00814447199232,
        0.0431372137187, 0.0239704280962, 0.0132186271151, 0.0394858228613,
        -0.0376448610484, -0.0140965431805, -0.0231934611707, -0.0277701152849,
        -0.0198615694835, -0.0171543749744, -0.0450425705443, -0.0208902837069,
        0.0129278180077, -0.00853653435838, 0.00636921362624, -0.0371416961916,
        0.00597606734547, 0.0192830041109, -0.0198882191064, 0.0314912783499,
        0.0297489053825, -0.00166879609153, -0.0274309679029, 0.0177512106212,
        -0.0143893391685, 0.0217575502022, -0.00612024516371, -0.0192703073851,
        -0.0352074591222, 0.0402669671933, -0.0286486046708, 0.00962997395818,
        -0.00603262005354, 0.0138339701008, 0.00149504621782, -0.0191123277592,
        0.045561398082, -0.0186188782724, 0.000849809608471, -0.00564146594277,
        0.00604086228272, -0.00632908533126, 0.0253858519915, 0.0217586229684,
        -0.00109366749121, 0.0201549503405, -0.02454644563, 0.0237495595514,
        -0.000532335628175, -0.0162696114632, -0.0134469771657, -0.0253166383449,
        0.034601411305, 0.0498026591865, 0.0147863382594, 0.00386504623682,
        0.0390216596728, -0.00752600517262, -0.0056334421425, -0.00703149668339,
        0.00373049639187, -0.0170113080794, 0.031063424262, 0.0070188894023,
        0.0343821575866, -0.0296663417, -0.0312533247171, 0.00228560238568,
        -0.0200869188374, -0.0465861845003, 0.0074148532855, 0.00778328908485,
        -0.00239365097105, 0.0211159963864, -0.0101738550143, 0.0221153313227,
        0.0120745140318, 0.0424117786748, 0.0165123207562, 0.0253632881829,
        -0.00550136829826, -0.0046292213177, -0.00315092809438, 0.0301160712503,
        0.0286312131224, -0.00819090713118, 0.0315111166998, -0.0521589858068,
        -0.0185275914154, -0.0358262298568, -0.00163053741214, -0.0287416500401,
        0.0688113221837, -0.0023933144002, -0.00661116226021, -0.0542106744341,
        -0.042784808944, 0.0173215712105, -0.0392196811015, 0.0207339212476,
        -0.045454334704, 0.0202154719455, 0.0148105214512, -0.0230338012693,
        -0.00534712562521, 0.00555274339065, 0.0615223702941, -0.0200281754886,
        -0.00964783123172, -0.0201680715981, 0.0208189487588, 0.0628993663653,
        0.0416043175912, 0.0156599702638, -0.0322385656012, -0.00482255604814,
        0.00254091812215, 0.0152901086879, 0.0125741794363, 0.0232558694026,
        0.00977020134127, -0.0398008707529, -0.0268964956381, -0.0527130463474,
        -0.0045785862603, -8.81968703034e-05, 0.0017808194297, 0.0146972449831,
        -0.0137502902062, 0.0158736394279, -0.0144204195937, -0.0440793011077,
        0.0375157611199, -0.0096092059652, 0.0218906905165, 0.0113600669209,
        -0.00797593406881, -0.0214742360207, -0.0616331922445, 0.0487977233648,
        0.0330232714136, -0.00752373395462, -0.0259014603795, 0.0103778013875,
        0.0239906506747, 0.0274886993092, 0.0506120174333, 0.00933604771703,
        0.0128722536389, -0.0386741243019, -0.0118985580667, 0.0011901461201,
        -0.0113642014771, 0.0315076351745, -0.0281116305346, 0.00891916810051,
        -0.0147883250651, 0.00791639698605, 0.00577264543299, 0.0299472318902,
        -0.0123017107691,
        0.032010269977, 0.0192380685421, 0.0164582142479, -0.0396823525632,
        0.0113088908103, 0.000822630160239, -0.0116380734661, 0.00471031440102,
        0.00026584501753, 0.0303592525458, -0.00817147089427, -0.0135122516922,
        -0.011605032986, 0.000867605642953, 0.00356259474523, -0.0235009916937,
        0.0134360592212, 0.0341675841633, 0.0037336383048, -0.0208792924897,
        0.0075391599431, 0.0224859524829, 0.0249201303198, -0.0146172814441,
        0.00406734321071, 0.0284210897642, -0.00847374310539, -0.0114720316915,
        0.00500263694258, 0.01710240779, 0.0224897147971, -0.0165403397364,
        0.0151036858467, 0.000461956295288, -0.0347643848776, -0.032390318598,
        0.00450104543445, 0.0306297994042, 0.0232547549099, 0.0366407833988,
        -0.0198361948654, 0.00208582107582, -0.00264045939376, -0.0184410263176,
        -0.00599621175053, 0.00565406205562, 0.00757286425264, 0.00812967830282,
        -0.0256624787586, -0.0125366291289, -0.0216738674731, -0.00673287071219,
        0.00241703352636, -0.0251406132051, 0.00625283173327, -0.000327143380255,
        -0.00424268394253, 0.0432970311364, -0.00291893736096, -0.00374888213984,
        -0.0305832174626, 0.018082182327, -0.0180131334066, -0.00272588309469,
        0.0281102383074, 0.0327907474723, 0.0484505299177, -0.0395410396888,
        -0.000570024697794, 0.0294096917174, -0.00845491482633, -0.00239607065153,
        -0.00302720417788, -0.0317094611294, -0.0197224314562, 0.0282222047311,
        0.0166029197893, -0.0213904366637, 0.0349518921965, 0.0189417489144,
        -0.0135698765814, 0.0269692942335, -0.00598382574847, -5.3245989314e-05,
        0.017780171752, -0.0215918489997, 0.00190475192889, 0.0269761121872,
        0.055998928314, -0.0320044967321, -0.015664074917, 0.0138504464738,
        0.0120902940124, 0.0156187886592, -0.00421459494215
    ]