def __init__(self, policy_cls, env_id, args):
    logging.getLogger("tensorflow").setLevel(logging.ERROR)
    self.args = args
    self.env = gym.make(env_id, **args2envkwargs(args))
    self.policy_with_value = policy_cls(self.args)
    self.iteration = 0
    if self.args.mode == 'training':
        self.log_dir = self.args.log_dir + '/evaluator'
    else:
        self.log_dir = self.args.test_log_dir
    if not os.path.exists(self.log_dir):
        os.makedirs(self.log_dir)

    self.preprocessor = Preprocessor((self.args.obs_dim, ),
                                     self.args.obs_preprocess_type,
                                     self.args.reward_preprocess_type,
                                     self.args.obs_scale,
                                     self.args.reward_scale,
                                     self.args.reward_shift,
                                     gamma=self.args.gamma)

    self.writer = self.tf.summary.create_file_writer(self.log_dir)
    self.stats = {}
    self.eval_timer = TimerStat()
    self.eval_times = 0
def read(self, filename=None, preprocess=True, **defines):
    """Preprocess, read and parse itp file *filename*.

    Any keywords in *defines* are used to modify the default preprocessor
    variables (see
    :meth:`gromacs.fileformats.preprocessor.Preprocessor.parse` for details).
    Setting *preprocess* = ``False`` skips the preprocessing step.
    """
    self._init_filename(filename)

    if preprocess:
        kwargs = self.defines.copy()
        kwargs['commentchar'] = self.commentchar
        kwargs['clean'] = True
        ppitp = Preprocessor(self.real_filename, **kwargs)
        ppitp.parse(**defines)
        itp = ppitp.StringIO()
    else:
        itp = open(self.real_filename)

    try:
        stream = OneLineBuffer(itp.next)
        self.parse(stream)
    finally:
        itp.close()
def processData(self):
    """ The purpose of this method is to process both train/test raw data """
    # Load the preprocessor
    preprocessor = Preprocessor()

    if self.train:
        filename = self.parameters['data-path'] + self.parameters['train-data-filename']
    else:
        filename = self.parameters['data-path'] + self.parameters['test-data-filename']

    # read the required file
    data_df = pd.read_json(path_or_buf=filename, lines=True)

    # concatenate response and last 'n' contexts together
    data_df['CONTEXT'] = data_df['context'].apply(
        lambda x: ' '.join(x[-self.n_last_context:]))
    data_df['text'] = data_df['CONTEXT'] + ' ' + data_df['response']
    data_df['text'] = data_df['text'].apply(
        lambda x: preprocessor.process_text_bert(x))

    # save the processed data
    if self.train:
        filename = self.parameters['processed-data-path'] + self.parameters['processed-train-data-filename']
        data_df[['text', 'label']].to_csv(filename)
    else:
        filename = self.parameters['processed-data-path'] + self.parameters['processed-test-data-filename']
        data_df[['text']].to_csv(filename)
    return
def load_data(batch_size):
    '''
    Loads training, validation and test data from resources.
    '''
    data_path = os.path.abspath(
        os.path.join(os.path.dirname(__file__), '../resources'))
    data_config = {
        "training": {"size": 0.8},
        "test": {"size": 0.1},
        "validation": {"size": 0.1}
    }
    p = Preprocessor(base_path=data_path, datasets=data_config)
    train_data = p.generate_images('training', shuffle=True, batch_size=batch_size)
    valid_data = p.generate_images('validation', shuffle=False, batch_size=batch_size)
    class_weights = p.get_class_weights()
    return train_data, valid_data, class_weights
def run_imputation(df):
    """
    Fill in missing numeric values using Kalman Filtering, and fill in missing
    string values by drawing from each column's distribution of unique values.
    """
    # Create the datetime index
    time_cols = ['year', 'month', 'day', 'hour']
    df["timestamp"] = pd.to_datetime(df[time_cols])
    df.set_index("timestamp", inplace=True)
    df.drop(columns=time_cols, inplace=True)

    # Check if there are null values in the dataset and get the columns
    nulls = df.isnull().sum()
    null_cols = nulls[nulls > 0].index.values
    numeric_null_cols = get_numeric_null_cols(df, null_cols)
    obj_null_cols = get_string_null_cols(df, null_cols)

    # Use Kalman Filtering to impute missing numeric values
    for col in numeric_null_cols:
        prep = Preprocessor()
        arr = prep.kalman_impute(df[col])
        df[col] = arr
        # Backfill any missing data at the beginning of the array
        if df[col].isnull().sum():
            df[col].fillna(method="bfill", inplace=True)

    # Random draw based on distribution of unique vals in each column
    for col in obj_null_cols:
        arr = fill_missing_strings(df[col])
        df[col] = arr

    return df
def testTennisOrIris(trainDataFile, testDataFile, attrDataFile):
    data = Preprocessor(trainDataFile, testDataFile, attrDataFile)
    data.loadData()
    trainData = data.getMatrix(data.getTrainData())
    testData = data.getMatrix(data.getTestData())

    numInput = data.getNumInput()
    numOutput = len(data.getClasses())
    numHidden = 3
    seed = 4
    learningRate = 0.1
    maxEpochs = 5000
    momentum = 0.0

    print("Generating neural network: %d-%d-%d" % (numInput, numHidden, numOutput))
    nn = NeuralNetwork(numInput, numHidden, numOutput, seed)
    nn.train(trainData, maxEpochs, learningRate, momentum)
    print("Training complete")

    # accTrain = nn.accuracy(trainData)
    accTest = nn.accuracy(testData)
    # print("\nAccuracy on train data = %0.4f " % accTrain)
    print("Accuracy on test data = %0.4f " % accTest)
def __init__(self):
    self._rows = 0
    self._cols = 0
    self._params = {}
    self._model = LogisticRegression(max_iter=20)
    self._preprocessor = Preprocessor()
    self.init_params()
def test_search_lines(self):
    """Various test cases for preprocessor.search_lines."""
    preprocessor = Preprocessor(self.empty_dataframe, self.config, '_INFO_')

    # These lines should get deleted
    no_matches = ['not matching text', 'should get deleted']
    self.assertFalse(preprocessor.search_lines(no_matches))

    # The first line should be kept since it explicitly matches the REs in
    # USEFUL_INFORMATION. The second line is removed: although it matches
    # 'error', it does not match USEFUL_INFORMATION.
    single_match = [
        'this error is USEFUL_INFORMATION', 'but this error is not'
    ]
    self.assertEqual(preprocessor.search_lines(single_match),
                     ['this error is USEFUL_INFORMATION'])

    # Both lines should be kept since both are explicitly matched
    # by both regular expressions in search_lines
    multi_match = [
        'this error is USEFUL_INFORMATION',
        'of course that error is USEFUL_INFORMATION!'
    ]
    self.assertEqual(preprocessor.search_lines(multi_match), multi_match)
def __init__(self, left_filename, right_filename, directory, config):
    super(TestPredictionCallback, self).__init__()
    self.directory = directory

    # Crop and expand dims to batch = 1
    crop_start_row = config['crop_start_row']
    crop_start_col = config['crop_start_col']
    crop_stop_row = crop_start_row + config['crop_height']
    crop_stop_col = crop_start_col + config['crop_width']

    preprocessor = Preprocessor()

    img = img_to_array(load_img(left_filename), data_format='channels_first')
    img = img[:, crop_start_row:crop_stop_row, crop_start_col:crop_stop_col]
    left_img = preprocessor.resize_img(img, [
        config['channels'], config['resized_height'], config['resized_width']
    ])

    img = img_to_array(load_img(right_filename), data_format='channels_first')
    img = img[:, crop_start_row:crop_stop_row, crop_start_col:crop_stop_col]
    right_img = preprocessor.resize_img(img, [
        config['channels'], config['resized_height'], config['resized_width']
    ])

    self.left_img = np.expand_dims(left_img, axis=0)
    self.right_img = np.expand_dims(right_img, axis=0)
def test_restore_matrix_2(self):
    missing_value = -999999
    pre = Preprocessor(missing_value=missing_value)
    threshold = 1e-5
    header = ['col1', 'col2']
    x = np.random.randint(low=0, high=2, size=(5, 2)).astype(str)
    v = np.full(shape=x.shape, fill_value=False)

    m = pre.get_metadata(arr=x, header=header)
    obj_d = pre.get_discretized_matrix(arr=x, meta=m, header=header,
                                       require_missing=True)
    obj_r = pre.restore_matrix(arr=obj_d['x'], meta=m, header=obj_d['header'])

    for i in range(x.shape[0]):
        for j in range(x.shape[1]):
            if m[j]['type'] == 'count' or m[j]['type'] == 'continuous':
                if abs(float(x[i, j]) - float(obj_r['x'][i, j])) < threshold:
                    v[i, j] = True
            else:
                if x[i, j] == obj_r['x'][i, j]:
                    v[i, j] = True

    assert v.all()
def test_restore_matrix_4(self):
    missing_value = -999999
    pre = Preprocessor(missing_value=missing_value)
    threshold = 1e-5
    obj_f = self.create_multimodal_object(n=1000)
    v = np.full(shape=obj_f['x'].shape, fill_value=False)

    m = pre.get_metadata(obj_f['x'], obj_f['header'])
    obj_d = pre.get_discretized_matrix(arr=obj_f['x'], meta=m,
                                       header=obj_f['header'],
                                       require_missing=True)
    obj_r = pre.restore_matrix(arr=obj_d['x'], meta=m, header=obj_d['header'])

    for i in range(obj_f['x'].shape[0]):
        for j in range(obj_f['x'].shape[1]):
            if m[j]['type'] == 'count' or m[j]['type'] == 'continuous':
                if abs(float(obj_f['x'][i, j]) - float(obj_r['x'][i, j])) < threshold:
                    v[i, j] = True
            else:
                if obj_f['x'][i, j] == obj_r['x'][i, j]:
                    v[i, j] = True

    assert v.all()
def test_get_variable_type_constant_str(self):
    pre = Preprocessor(missing_value=-999999)
    x = np.full(fill_value='hello world', shape=10000)
    var_type = pre.get_variable_type(arr=x, label='my_feature')
    assert var_type == 'constant'

def test_get_variable_type_continuous(self):
    pre = Preprocessor(missing_value=-999999)
    x = np.random.random(1000)
    var_type = pre.get_variable_type(arr=x, label='my_feature')
    assert var_type == 'continuous'

def test_get_variable_type_constant_num(self):
    pre = Preprocessor(missing_value=-999999)
    x = np.zeros(shape=1000)
    var_type = pre.get_variable_type(arr=x, label='my_feature')
    assert var_type == 'constant'
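# A minimal sketch of the kind of heuristic the get_variable_type tests above
# imply: a single unique value means 'constant', a float-valued array with many
# unique non-integer values means 'continuous'. The 'count' and 'categorical'
# labels mirror the types seen in the restore_matrix tests. This is an
# illustration only, not the actual Preprocessor.get_variable_type implementation.
import numpy as np

def infer_variable_type(arr):
    values = np.unique(arr)
    if len(values) == 1:
        return 'constant'
    try:
        as_float = values.astype(float)
    except ValueError:
        return 'categorical'
    if np.allclose(as_float, np.round(as_float)):
        return 'count'
    return 'continuous'

assert infer_variable_type(np.zeros(1000)) == 'constant'
assert infer_variable_type(np.random.random(1000)) == 'continuous'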
def main(_):
    pre_processor = Preprocessor()
    pre_processor.set_train_test_data(0.8)

    model = Model('winner_predict_model')
    model.learning_rate = 0.01
    model.sess = tf.Session()
    model.builder(team_input_size=pre_processor.team_input_size,
                  player_input_size=pre_processor.player_input_size,
                  output_size=pre_processor.output_size,
                  model_name='model_builder')
    model.run_train(train_epoch=5000,
                    train_x_home_team=pre_processor.train_x_home_team,
                    train_x_away_team=pre_processor.train_x_away_team,
                    train_x_home_player=pre_processor.train_x_home_player,
                    train_x_away_player=pre_processor.train_x_away_player,
                    train_y=pre_processor.train_y,
                    keep_prob=0.7,
                    print_num=500)
    model.run_test(test_x_home_team=pre_processor.test_x_home_team,
                   test_x_away_team=pre_processor.test_x_away_team,
                   test_x_home_player=pre_processor.test_x_home_player,
                   test_x_away_player=pre_processor.test_x_away_player,
                   test_y=pre_processor.test_y)
    model.closer()
def __init__(self, conf_path, template_path):
    configuration = Configuration(conf_path)
    self.preprocessor = Preprocessor(configuration)
    self.poco_processor = PocoProcessor(configuration)
    self.ros_mapper_processor = RosMapperProcessor(configuration)
    self.ros_msg_processor = RosMsgProcessor(configuration)
    self.dds_mapper_processor = DdsMapperProcessor(configuration)
    self.dds_idl_processor = DdsIdlProcessor(configuration)
    self.zmq_serializer_processor = ZmqSerializerProcessor(configuration)
    self.node_handler_processor = NodeHandlerProcessor(configuration)

    env = Environment(loader=FileSystemLoader(template_path))
    self.poco_template = env.get_template('poco_template.h')
    self.ros_mapper_template = env.get_template('ros_mapper_template.h')
    self.ros_msg_template = env.get_template('ros_template.msg')
    self.dds_mapper_template = env.get_template('dds_mapper_template.h')
    self.dds_idl_template = env.get_template('dds_template.idl')
    self.zmq_serializer_template = env.get_template('zmq_serializer_template.h')
    self.node_handler_template = env.get_template('node_handler_template.js')
def test_parent_class(self):
    configuration = Configuration(CONF_PATH)
    preprocessor = Preprocessor(configuration)
    ros_msg_processor = RosMsgProcessor(configuration)

    class_definition_dict = {}

    kidl_file = "class_with_ros_mdlw_and_parent_class.yaml"
    with open("%s%s" % (INCLUDE_PATH, kidl_file), 'r') as stream:
        try:
            class_definition_data = yaml.load(stream, Loader=yaml.FullLoader)
        except yaml.YAMLError as exc:
            print(exc)
    class_definition = preprocessor.process(class_definition_data, False)
    class_definition_dict[class_definition.class_name] = class_definition

    kidl_file = "basic_class_with_ros_mdlw.yaml"
    with open("%s%s" % (INCLUDE_PATH, kidl_file), 'r') as stream:
        try:
            class_definition_data = yaml.load(stream, Loader=yaml.FullLoader)
        except yaml.YAMLError as exc:
            print(exc)
    class_definition = preprocessor.process(class_definition_data, False)
    class_definition_dict[class_definition.class_name] = class_definition

    ros_msg_definition = ros_msg_processor.process(
        'kpsr::codegen::ClassWithParentClass', class_definition_dict)
    print(ros_msg_definition)
def preprocessData(self, train_data, test_data):
    # Preprocessor
    preprocessor = Preprocessor()

    # Make preprocessing path if it doesn't exist
    if not os.path.exists(self.preprocessing_path):
        os.mkdir(self.preprocessing_path)

    # Check if the preprocessed training artifact exists
    if os.path.exists(os.path.join(self.preprocessing_path, 'train_data.txt')):
        # Load train data if it does
        train_data = open(os.path.join(self.preprocessing_path,
                                       'train_data.txt')).read().splitlines()
    else:
        # Preprocess the data as specified in the config file
        for step in self.config['preprocessing']:
            train_data = preprocessor.process(step, train_data)
        # Save the training data artifact
        with open(os.path.join(self.preprocessing_path, 'train_data.txt'), 'w+') as f:
            # Write the array with each datapoint on a new line
            f.write('\n'.join(train_data))
            f.close()

    # Check if the preprocessed testing artifact exists
    if os.path.exists(os.path.join(self.preprocessing_path, 'test_data.txt')):
        # Load test data if it does
        test_data = open(os.path.join(self.preprocessing_path,
                                      'test_data.txt')).read().splitlines()
    else:
        # Preprocess the data as specified in the config file
        for step in self.config['preprocessing']:
            test_data = preprocessor.process(step, test_data)
        # Save the testing data artifact
        with open(os.path.join(self.preprocessing_path, 'test_data.txt'), 'w+') as f:
            # Write the array with each datapoint on a new line
            f.write('\n'.join(test_data))
            f.close()

    return train_data, test_data
def test_word_filter(self):
    """Tests pertaining to preprocessor.filter_words."""
    preprocessor = Preprocessor(self.empty_dataframe, self.config, '_INFO_')
    sample_string = 'Some error information here, testIgnoreWord'
    self.assertEqual(preprocessor.filter_words(sample_string),
                     'Some error information here, ')
def main(mode, other_args):
    if mode != 'build' and mode != 'detect':
        raise Exception('Unknown execution mode: {}'.format(mode))

    with open("config/config.yml", 'r') as ymlfile:
        cfg = yaml.load(ymlfile)

    td = TrendDetector(cfg)

    if mode == 'build':
        # Build model
        sl = SearchLoader(cfg)
        df = sl.load()
        pp = Preprocessor(df)
        agg_df = pp.run()
        td.build(agg_df)

        # Detect trending for all queries on the last day
        max_date = agg_df['date'].max()
        for _, row in agg_df[agg_df['date'] == max_date].iterrows():
            query = row['query']
            count = row['count']
            td.is_trending(query, count)
    else:  # 'detect' mode
        # Load model
        td.load_model()
        # Detect trending for the given query and search count
        query = other_args.query
        obs = other_args.obs
        td.is_trending(query, obs, verbose=True)
def test_basic(self):
    configuration = Configuration(CONF_PATH)
    preprocessor = Preprocessor(configuration)
    ros_msg_processor = RosMsgProcessor(configuration)

    class_definition_dict = {}

    kidl_file = "basic_class_with_ros_mdlw.yaml"
    with open("%s%s" % (INCLUDE_PATH, kidl_file), 'r') as stream:
        try:
            class_definition_data = yaml.load(stream, Loader=yaml.FullLoader)
        except yaml.YAMLError as exc:
            print(exc)
    class_definition = preprocessor.process(class_definition_data, False)
    class_definition_dict[class_definition.class_name] = class_definition

    ros_msg_definition = ros_msg_processor.process('BasicClass', class_definition_dict)

    env = Environment(loader=FileSystemLoader(TEMPLATE_PATH))
    template = env.get_template('ros_template.msg')
    print(template.render(definition=ros_msg_definition))
def externalVoodoo(input, output, linkTo, pathToRemoveFromIdentifier="", trace=False):
    inputLines = _readLinesOfFile(input)
    perFileSettings = PerFileSettings(inputLines)
    preprocessor = Preprocessor(linkTo, output, inputLines, pathToRemoveFromIdentifier)
    out = preprocessor.externalHeader()
    out += '#include "VoodooConfiguration.h"\n'
    out += '#include <VoodooCommon/Common.h>\n\n'
    out += "namespace External\n{\n\n"
    iterator = VoodooMultiplexerIterator(perFileSettings)
    iterator.process(input)
    out += iterator.iter()
    out += "\n}\n\n"
    out += preprocessor.externalSwitchToExpectation()
    out += '#include "VoodooCommon/All.h"\n\n'
    out += "namespace External\n{\n\n"
    out += iterator.expect()
    out += "\n}\n\n"
    out += preprocessor.externalFooter()
    return out
def main(testpath, path_to_result):
    # load config
    with open('config.json') as f:
        config = json.load(f)

    logging.info('loading embedding...')
    with open('embedding.pkl', 'rb') as f:
        embedding = pickle.load(f)
    # load embedding
    config['model_parameters']['embedding'] = embedding.vectors

    preprocessor = Preprocessor(None)
    # update embedding used by preprocessor
    preprocessor.embedding = embedding

    logging.info('Processing test data from {}'.format(testpath))
    # get dataset
    test = preprocessor.get_dataset(
        testpath, 6, {'n_positive': -1, 'n_negative': -1, 'shuffle': False})
    test.shuffle = False

    # make model
    PredictorClass = ExamplePredictor
    predictor = PredictorClass(metrics=[], **config['model_parameters'])

    # load model (log the file that is actually loaded)
    model_path = 'model.pkl.4'
    logging.info('loading model from {}'.format(model_path))
    predictor.load(model_path)

    logging.info('predicting...')
    predicts = predictor.predict_dataset(test, test.collate_fn)

    # save csv
    write_predict_csv(predicts, test, path_to_result)
def __init__(self, environment, agent, train, action_freq=1):
    self.env = environment
    self.agent = agent
    self.prep = Preprocessor(self.env.get_dim(Preprocessor.NB_STATE_HISTORY))
    self.trainer = agent.get_trainer() if train else None
    self.action_freq = action_freq
def validate(model: Model, loader: DataLoaderIAM, line_mode: bool) -> Tuple[float, float]:
    """Validates NN."""
    print('Validate NN')
    loader.validation_set()
    preprocessor = Preprocessor(get_img_size(line_mode), line_mode=line_mode)
    num_char_err = 0
    num_char_total = 0
    num_word_ok = 0
    num_word_total = 0
    while loader.has_next():
        iter_info = loader.get_iterator_info()
        print(f'Batch: {iter_info[0]} / {iter_info[1]}')
        batch = loader.get_next()
        batch = preprocessor.process_batch(batch)
        recognized, _ = model.infer_batch(batch)

        print('Ground truth -> Recognized')
        for i in range(len(recognized)):
            num_word_ok += 1 if batch.gt_texts[i] == recognized[i] else 0
            num_word_total += 1
            dist = editdistance.eval(recognized[i], batch.gt_texts[i])
            num_char_err += dist
            num_char_total += len(batch.gt_texts[i])
            print('[OK]' if dist == 0 else '[ERR:%d]' % dist,
                  '"' + batch.gt_texts[i] + '"', '->', '"' + recognized[i] + '"')

    # print validation result
    char_error_rate = num_char_err / num_char_total
    word_accuracy = num_word_ok / num_word_total
    print(f'Character error rate: {char_error_rate * 100.0}%. Word accuracy: {word_accuracy * 100.0}%.')
    return char_error_rate, word_accuracy
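# Hedged sketch of the character-error-rate aggregation used in validate()
# above: total edit distance divided by total ground-truth characters, using
# the same editdistance package. The strings below are made-up examples, not
# data from the IAM loader.
import editdistance

def char_error_rate(recognized, ground_truth):
    errors = sum(editdistance.eval(r, g) for r, g in zip(recognized, ground_truth))
    total = sum(len(g) for g in ground_truth)
    return errors / total if total else 0.0

print(char_error_rate(['hello', 'world'], ['hallo', 'world']))  # 0.1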
def voodoo(input, output, pathToRemoveFromIdentifier, voodooDBFile, includes,
           defines, preIncludes, trace=False):
    inputLines = _readLinesOfFile(input)
    perFileSettings = PerFileSettings(inputLines)
    preprocessor = Preprocessor(input, output, inputLines, pathToRemoveFromIdentifier)
    out = preprocessor.header()
    out += '#include <VoodooCommon/Common.h>\n\n'
    iterator = VoodooMultiplexerIterator(perFileSettings, voodooDBFile)
    iterator.process(input, includes=includes, defines=defines, preIncludes=preIncludes)
    out += iterator.iter()
    out += preprocessor.switchToExpectation()
    out += '#include "VoodooCommon/All.h"\n\n'
    out += iterator.expect()
    out += preprocessor.footer()
    return out
def __init__(self, data_dir, coord, symbol_list, year_range, symbol_first,
             data_win_len, receptive_field, queue_size=500):
    # system initialize
    self.db_manager = DBManager(data_dir)
    self.preprocessor = Preprocessor()
    self.coord = coord
    self.threads = []

    # processing params
    self.data_dir = data_dir
    self.symbol_list = symbol_list
    self.year_range = year_range
    self.symbol_first = symbol_first
    self.data_win_len = data_win_len
    self.receptive_field = receptive_field

    # queue setup
    self.trans_placeholder = tf.placeholder(dtype=tf.float32, shape=None)
    self.trans_queue = tf.PaddingFIFOQueue(queue_size, ['float32'], shapes=[(None, 1)])
    self.trans = self.trans_queue.enqueue([self.trans_placeholder])

    # for multithreading:
    self.yield_list = itertools.product(
        self.symbol_list, self.year_range) if self.symbol_first else itertools.product(
        self.year_range, self.symbol_list)
def load(self, filename):
    with open(filename, 'rb') as f:
        param_dict = pickle.load(f)

    self.min_word_counts = param_dict['min_word_counts']
    self.dtype = param_dict['dtype']
    self.max_df = param_dict['max_df']
    self.min_df = param_dict['min_df']
    self.vocabulary = param_dict['vocabulary']
    self.word_to_ind = param_dict['word_to_ind']
    self.ngram_range = param_dict['ngram_range']
    self.doc_cleaner_pattern = param_dict['doc_cleaner_pattern']
    self.token_pattern = param_dict['token_pattern']
    self.stop_words = param_dict['stop_words']
    self.document_cleaner_func = param_dict['document_cleaner_func']
    self.tokenizer_func = param_dict['tokenizer_func']
    self.token_cleaner_func = param_dict['token_cleaner_func']

    self.preprocessor = Preprocessor(
        doc_cleaner_pattern=self.doc_cleaner_pattern,
        token_pattern=self.token_pattern,
        document_cleaner_func=self.document_cleaner_func,
        tokenizer_func=self.tokenizer_func,
        token_cleaner_func=self.token_cleaner_func,
        stop_words=self.stop_words)
    self.preprocessor.fit()
def __init__(self, model_name="test.hdf5"):
    print('Starting test of {}'.format(model_name))
    models_path = path.abspath(path.join(__file__, "../../..")) + "/models/" + model_name
    print(models_path)
    self.model = load_model(models_path)
    self.preprocessor = Preprocessor()
def setUpClass(self):
    self.DEBUG = False
    self.METRICS = False
    self.data_api_impl = DataApi('../../../data/')
    self.cross_validator_impl = CrossValidator()
    self.preprocessor_impl = Preprocessor()
def process(self, file):
    Preprocessor.process(self, file)
    ir = InputReader(file)
    ir.read()
    cqpf = CQPFormat(ir.getText())
    pos = cqpf.getColumn(self.column)
    for i in range(2, len(pos)):  # ignore first two pos ...
        uni = (pos[i])[0:3]
        bi = (pos[i - 1])[0:3] + "_" + uni
        tri = (pos[i - 2])[0:3] + "_" + bi
        if uni not in self.unilexicon:
            self.unilexicon[uni] = 0
        self.unilexicon[uni] += 1
        if bi not in self.bilexicon:
            self.bilexicon[bi] = 0
        self.bilexicon[bi] += 1
        if tri not in self.trilexicon:
            self.trilexicon[tri] = 0
        self.trilexicon[tri] += 1
        self.count += 1
def test_with_builder(self):
    configuration = Configuration(CONF_PATH)
    preprocessor = Preprocessor(configuration)
    poco_processor = PocoProcessor(configuration)

    class_definition_dict = {}

    kidl_file = "basic_class_with_builder.yaml"
    with open("%s%s" % (INCLUDE_PATH, kidl_file), 'r') as stream:
        try:
            class_definition_data = yaml.load(stream, Loader=yaml.FullLoader)
        except yaml.YAMLError as exc:
            print(exc)
    class_definition = preprocessor.process(class_definition_data, False)
    class_definition_dict[class_definition.class_name] = class_definition

    poco_definition = poco_processor.process('BasicClassWithBuilder',
                                             class_definition_dict, '')

    env = Environment(loader=FileSystemLoader(TEMPLATE_PATH))
    template = env.get_template('poco_template.h')
    print(template.render(definition=poco_definition))
def __init__(self):
    self.unilexicon = {}
    self.bilexicon = {}
    self.trilexicon = {}
    self.count = 0
    self.column = 1
    Preprocessor.__init__(self)
def process(self, file):
    Preprocessor.process(self, file)
    ir = InputReader(file)
    ir.read()
    cqpf = CQPFormat(ir.getText())
    for word in cqpf.getColumn(self.column):
        if word not in self.lexicon:
            self.lexicon[word] = 0
        self.lexicon[word] += 1
        self.count += 1
def _test_bg_subtraction2(self):
    p = Preprocessor(10)
    s = p.load_npy('./test.npy')
    generator = DataPreparator("", "", 512)
    samples1 = len(s[0, :])
    snew, sbg = generator.bg_subtraction(s)
    samples2 = len(snew[0, :])
    self.assertGreater(samples1, samples2)
def voodooExpectHeader(input, output, pathToRemoveFromIdentifier, voodooDBFile,
                       includes, defines, preIncludes, trace=False):
    inputLines = _readLinesOfFile(input)
    perFileSettings = PerFileSettings(inputLines)
    preprocessor = Preprocessor(input, output, inputLines, pathToRemoveFromIdentifier)
    iterator = VoodooMultiplexerIterator(perFileSettings, voodooDBFile)
    iterator.process(input, includes=includes, defines=defines, preIncludes=preIncludes)
    out = preprocessor.headerOfHeader() + '\n'
    return out
def main(args):
    print("Athene Preprocessor v. 0.1")
    if len(args) > 2:
        source_filename, target_filename, include_filenames = parse_args(args)
        if "" == source_filename or "" == target_filename:
            print("arguments error")
            return
        preprocessor = Preprocessor(source_filename, target_filename, include_filenames)
        preprocessor.run()
        print("ok!")
    else:
        print("Usage:")
        print("\tathp -s source -t target [-i file file file]")
def track(self):
    print sys.argv
    cam = cv2.VideoCapture(int(sys.argv[1]))
    cam.set(cv2.cv.CV_CAP_PROP_FRAME_WIDTH, 640)
    cam.set(cv2.cv.CV_CAP_PROP_FRAME_HEIGHT, 480)
    self._initialize_windows()
    p = Preprocessor()
    hs = HandSegment()
    positions = []
    self.count = 0
    self.skip_frames = 0
    x, y, w, h = 0, 0, 0, 0
    prev_x, prev_y, prev_w, prev_h = 0, 0, 0, 0
    while True:
        frame = self.get_frame(cam)
        if type(frame) == type(None):
            continue
        p.process(frame)
        hand = self.get_biggest_hand(frame, prev_x, prev_y, prev_w, prev_h)
        # print hand
        if not hand == []:
            x, y, w, h = hand
            prev_x, prev_y, prev_w, prev_h = hand
            centerx = x + w / 2
            centery = y + h / 2
            # Drawing rectangle around the hand
            cv2.rectangle(frame, (x, y), (x + w, y + w), (0, 0, 0), 1)
            # pointerx, pointery = hs.get_pointer(frame, x, y, w, h)
            # cv2.imshow("pointer", frame[max(y-h, 0):y+h, x:x+w+w/4])
        else:
            x, y, w, h = -1, -1, prev_w, prev_h
            centerx = -1
            centery = -1
        positions.append([centerx, centery])
        # Action
        skip_frames = self.motion(positions, w, h)
        # Drawing line of motion
        self._draw_motion(frame, positions)
        cv2.imshow("display", frame)
        ch = 0xFF & cv2.waitKey(1)
        if ch == 27:
            break
    cv2.destroyAllWindows()
def main():
    # create lexer and parser instances:
    lexicon_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), "lexicon")
    grammar_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), "grammar")
    lexer = Lexer(lexicon_file, False)
    parser = Parser(grammar_file, lexer.lexicon_dict.keys())

    # run tests:
    for test in tests:
        # create preprocessor instance
        preprocessor_instance = Preprocessor(prefix + test)
        chunks = preprocessor_instance.get_chunks()
        ok = try_parse_program(chunks, lexer, parser)
        print("test " + test + " " + ("PASSED" if ok else "FAILED"))
def __init__(self, data, label, verbose=False, verbosity_level=5):
    print('Preprocessing data...')
    self.__prep = Prep()
    self.__data = []  # initialize before appending the processed items
    for item in data:
        self.__data.append(self.__prep.process(item))
    self.__label = label
    self.__precision = []
    self.__verbose = verbose
    self.__level = verbosity_level
def process(self, file):
    Preprocessor.process(self, file)
    ir = InputReader(file)
    ir.read()
    cqpf = CQPFormat(ir.getText())
    pos = cqpf.getColumn(self.column)
    for i in range(2, len(pos)):  # ignore first two pos ...
        uni = (pos[i])[0:3]
        bi = (pos[i - 1])[0:3] + "_" + uni
        tri = (pos[i - 2])[0:3] + "_" + bi
        self.counts[self.posdict[uni]][self.filecount] += 1
        self.counts[self.posdict[bi]][self.filecount] += 1
        self.counts[self.posdict[tri]][self.filecount] += 1
        self.count += 1
    for x in self.posnames:
        self.counts[self.posdict[x]][self.filecount] /= float(len(pos) - 3)
    self.filecount += 1
def contains_preprocessor_constructs(self):
    """Check if the file makes use of any preprocessor constructs.

    The test is done by running the file through the
    :class:`~gromacs.fileformats.preprocessor.Preprocessor` (while stripping
    all empty lines and all lines starting with a comment character). This is
    compared to the original file, stripped in the same manner. If the two
    stripped files differ from each other then the preprocessor altered the
    file, preprocessor directives must have been involved, and this function
    returns ``True``.

    .. versionadded: 0.3.1
    """
    from itertools import izip

    kwargs = self.defines.copy()
    kwargs['commentchar'] = self.commentchar
    kwargs['clean'] = True
    kwargs['strip'] = True
    ppitp = Preprocessor(self.real_filename, **kwargs)
    ppitp.parse()
    pp_lines = ppitp.StringIO().readlines()

    def strip_line(line):
        s = line.strip()
        return len(s) == 0 or s.startswith(self.commentchar)

    raw_lines = [line for line in open(self.real_filename) if not strip_line(line)]

    if len(pp_lines) != len(raw_lines):
        self.logger.debug("File %r is preprocessed (pp: %d vs raw %d lines (stripped))",
                          self.real_filename, len(pp_lines), len(raw_lines))
        return True
    for linenum, (raw, pp) in enumerate(izip(raw_lines, pp_lines)):
        if raw != pp:
            self.logger.debug("File %r is preprocessed. Difference at (stripped) line %d",
                              self.real_filename, linenum)
            self.logger.debug("preprocessed: %s", pp)
            self.logger.debug("original:     %s", raw)
            return True
    self.logger.debug("File %r does not appear to contain recognized preprocessing directives",
                      self.real_filename)
    return False
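# Hedged illustration of the comparison strategy described in the docstring
# above: strip blank and comment lines from both the raw and the preprocessed
# text, then report whether they differ. Plain in-memory strings stand in for
# real topology files; this is not the gromacs implementation itself.
def _strip(lines, commentchar=';'):
    return [l for l in lines if l.strip() and not l.strip().startswith(commentchar)]

raw = ['; topology', '#ifdef POSRES', '#include "posre.itp"', '#endif', '']
preprocessed = ['']  # with POSRES undefined, the #ifdef block drops out
print(_strip(raw) != _strip(preprocessed))  # True -> preprocessor constructs present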
def get_important_vars(cfg, dat):
    '''
    This method does Feature Selection.
    '''
    # Balances the dataset
    idxs_pos = dat[cfg['target']] == 1
    pos = dat[idxs_pos]
    neg = dat[dat[cfg['target']] == 0][1:sum(idxs_pos)]
    # Concatenates pos and neg, it's already shuffled
    sub_dat = pos.append(neg, ignore_index=True)

    # Imputes the data and fills in the missing values
    sub_dat = Preprocessor.fill_nans(sub_dat)

    # Changes categorical vars to a numerical form
    X = pd.get_dummies(sub_dat)

    #### Correlation-based Feature Selection ####
    # Computes correlation between cfg['target'] and the predictors
    target_corr = X.corr()[cfg['target']].copy()
    target_corr.sort(ascending=False)

    # Sorts and picks the first x features
    # TODO: get optimal x value automatically
    tmp = abs(target_corr).copy()
    tmp.sort(ascending=False)
    important_vars = [tmp.index[0]]
    important_vars.extend(list(tmp.index[2:52]))  # removes other target

    #### Variance-based Feature Selection ####
    #sel = VarianceThreshold(threshold = 0.005)
    #X_new = sel.fit_transform(X)

    #### Univariate Feature Selection ####
    #y = X.TARGET_B
    #X = X.drop("TARGET_B", axis = 1)
    #X_new = SelectKBest(chi2, k = 10).fit_transform(X.values, y.values)

    #### Tree-based Feature Selection ####
    #clf = ExtraTreesClassifier()
    #X_new = clf.fit(X.values, y.values).transform(X.values)
    #aux = dict(zip(X.columns, clf.feature_importances_))
    #important_vars = [i[0] for i in sorted(
    #    aux.items(), key = operator.itemgetter(0))]

    return important_vars
def _createWidgets(self):
    self.SetBackgroundColour((60, 60, 60))
    self.SetForegroundColour((230, 230, 230))
    self.processSysIncCb = wx.CheckBox(self, -1, u"Process #include <...> files")
    self.processSysIncCb.SetBackgroundColour((100, 100, 100))
    sysIncDirs, appIncDirs = Preprocessor.getDefaultIncDirs()
    self._createSysIncWidgets(sysIncDirs)
    self._createAppIncWidgets(appIncDirs)
    self._createPredefMacroWidgets()
    self._createSaveOptionWidgets()
def __init__(self, files_path='', classes={}, out_file='output.csv'):
    # self.mi_terms looks like this: {'term1': {'d': 3, 't': 4, 'mi': 2}, }
    self.mi_terms = {}
    # self.mi_classes looks like this: {'d': 3, 't': 4}
    self.mi_classes = {}
    self.total_terms_count = 0
    # Some configuration
    self.files_path = files_path
    self.out_file = out_file
    self.classes = classes
    self.files_prefixes = classes.keys()
    self.class_names = [classes[prefix] for prefix in classes]
    # For tokenizing, stemming, etc.
    self.prep = Preprocessor(pattern='\W+', lower=True, stem=False,
                             stemmer_name='porter', pos=False, ngram=1)
def __init__(self, valid_actions, run_id, display_screen, skip_frames, game_ROM):
    """
    Initialize ALE class. Creates the FIFO pipes, launches ./ale and does
    the "handshake" phase of communication

    @param display_screen: bool, whether to show the game on screen or not
    @param skip_frames: int, number of frames to skip in the game emulator
    @param game_ROM: location of the game binary to launch with ./ale
    """
    self.display_screen = display_screen
    self.skip_frames = skip_frames
    self.game_ROM = game_ROM
    self.run_id = run_id

    #: create FIFO pipes
    os.mkfifo("ale_fifo_out_%i" % self.run_id)
    os.mkfifo("ale_fifo_in_%i" % self.run_id)

    #: launch ALE with appropriate commands in the background
    command = ('./ale/ale -max_num_episodes 0 -game_controller fifo_named '
               '-disable_colour_averaging true -run_length_encoding false '
               '-frame_skip ' + str(self.skip_frames) + ' -run_id ' + str(self.run_id) +
               ' -display_screen ' + self.display_screen + " " + self.game_ROM + " &")
    os.system(command)
    os.system('ls -l ale_fifo_out_%i' % self.run_id)
    os.system('ls -l ale_fifo_in_%i' % self.run_id)

    #: open communication with pipes
    self.fin = open('ale_fifo_out_%i' % self.run_id)
    self.fout = open('ale_fifo_in_%i' % self.run_id, 'w')

    input = self.fin.readline()[:-1]
    size = input.split("-")  # saves the image sizes (160*210) for breakout

    #: first thing we send to ALE is the output options- we want to get only
    #  image data and episode info (hence the zeros)
    self.fout.write("1,0,0,1\n")
    self.fout.flush()  # send the lines written to pipe

    #: initialize the variables that we will start receiving from ./ale
    self.next_image = []
    self.game_over = True
    self.current_points = 0
    self.actions = [self.all_actions[i] for i in valid_actions]

    #: initialise preprocessor
    self.preprocessor = Preprocessor()
def _createPredefMacroWidgets(self):
    style = wx.LC_REPORT  # |wx.LC_VRULES |wx.LC_HRULES
    self.predefMacroLc = wx.ListCtrl(self, -1, style=style)
    self.predefMacroLc.InsertColumn(0, 'Name')
    self.predefMacroLc.InsertColumn(1, 'Value')
    self.predefMacroLc.SetBackgroundColour((30, 30, 30))
    self.predefMacroLc.SetForegroundColour((30, 30, 30))
    f = self.predefMacroLc.GetFont()
    f.SetFaceName("Monospace")
    self.predefMacroLc.SetFont(f)
    for name, val in sorted(Preprocessor.getPredefMacros().items(), key=lambda i: i[0]):
        idx = self.predefMacroLc.InsertStringItem(sys.maxint, name)
        self.predefMacroLc.SetStringItem(idx, 1, val)
        self.predefMacroLc.SetItemTextColour(idx, (255, 255, 255))
    self.predefAddBtn = wx.Button(self, -1, u"Add")
    self.predefEditBtn = wx.Button(self, -1, u"Edit")
    self.predefDelBtn = wx.Button(self, -1, u"Delete")
def __init__(self, label_path, label_bg_path, meta_path, training_description):
    self.label_path = label_path
    self.label_bg_path = label_bg_path
    self.meta_path = meta_path
    self.training_description = training_description
    if not os.path.isdir(self.training_description):
        os.mkdir(self.training_description)
    self.batch_size = 64
    self.queue_size = 2048
    self.nr_epoch = 10
    self.preprocessor = Preprocessor(10)
    self.augmenter = AugmentTransform(10, 10)
    self.inverse_labels = {}
    self.inverse_labels_bg = {}
    self.train_val_ratio = 0.1
def __init__(self, memory, display_screen="true", skip_frames=4,
             game_ROM='../libraries/ale/roms/breakout.bin'):
    """
    Initialize ALE class. Creates the FIFO pipes, launches ./ale and does
    the "handshake" phase of communication

    @param memory: memoryD, reference to the instance of class memoryD that
        collects all transitions in the game
    @param display_screen: bool, whether to show the game on screen or not
    @param skip_frames: int, number of frames to skip in the game emulator
    @param game_ROM: location of the game binary to launch with ./ale
    """
    self.display_screen = display_screen
    self.skip_frames = skip_frames
    self.memory = memory
    self.game_ROM = game_ROM

    #: create FIFO pipes
    os.system("mkfifo ale_fifo_out")
    os.system("mkfifo ale_fifo_in")

    #: launch ALE with appropriate commands in the background
    command = ('./../libraries/ale/ale -max_num_episodes 0 -game_controller fifo_named '
               '-disable_colour_averaging true -run_length_encoding false '
               '-frame_skip ' + str(self.skip_frames) +
               ' -display_screen ' + self.display_screen + " " + self.game_ROM + " &")
    os.system(command)

    #: open communication with pipes
    self.fin = open('ale_fifo_out')
    self.fout = open('ale_fifo_in', 'w')

    input = self.fin.readline()[:-1]
    size = input.split("-")  # saves the image sizes (160*210) for breakout

    #: first thing we send to ALE is the output options- we want to get only
    #  image data and episode info (hence the zeros)
    self.fout.write("1,0,0,1\n")
    self.fout.flush()  # send the lines written to pipe

    #: initialize the variables that we will start receiving from ./ale
    self.next_image = []
    self.game_over = True
    self.current_reward = 0

    #: initialise preprocessor
    self.preprocessor = Preprocessor()
def setUp(self):
    self.mock_metadata_helper = MagicMock(spec=MetadataHelper)

    self.mock_image_open_patcher = patch('preprocessor.Image.open')
    self.mock_image_open = self.mock_image_open_patcher.start()
    self.mock_image = MagicMock()
    self.mock_exif_data = 'a bunch of exif data'
    self.mock_image.info = {'exif': self.mock_exif_data}
    self.mock_image_open.return_value = self.mock_image

    self.mock_first_transposed_image = MagicMock()
    self.mock_image.transpose.return_value = self.mock_first_transposed_image
    self.mock_second_transposed_image = MagicMock()
    self.mock_first_transposed_image.transpose.return_value = self.mock_second_transposed_image

    self.test_model = Preprocessor(self.mock_metadata_helper)
def _load_text_preprocessor(self, args):
    """ Load the preprocessor for the context """
    self._update_status(6)
    self.text_preprocessor = Preprocessor()

    # Load preprocessors based on type of model
    ## TF-IDF:
    if "tfidf" in self.options["preproc_type"]:
        print "Loading TF-IDF model..."
        with open(args.tfidfmodel, "rb") as f_tfidf, open(args.svdmodel, "rb") as f_svd:
            tfidf_model = pkl.load(f_tfidf)
            svd_model = pkl.load(f_svd) if "with_svd" in self.options["preproc_params"] else None
        self.text_preprocessor.set_tfidf(tfidf_model, svd_model)

    ## Word2Vec:
    if "w2v" in self.options["preproc_type"]:
        print "Loading Word2Vec model..."
        w2v_model = Word2Vec.load_word2vec_format(args.w2vmodel, binary=True)
        self.text_preprocessor.set_w2v(w2v_model)

    ## Raw:
    if "raw" in self.options["preproc_type"]:
        print "Loading counter model..."
        with open(args.rawmodel, "rb") as f_raw:
            raw_model = pkl.load(f_raw)  # load from the open file handle, not the path string
        self.text_preprocessor.set_raw(raw_model)
    if check == 9:
        return True
    else:
        return False


if __name__ == '__main__':
    import sys, os

    logging.basicConfig(format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    logging.info("running %s" % " ".join(sys.argv))

    # check and process cmdline input
    program = os.path.basename(sys.argv[0])
    if len(sys.argv) < 5:
        print "Usage: python preprocessor.py -infile -synset_list -vocab_filename -outputfilename "
        sys.exit(1)

    infile = sys.argv[1]
    # synset filename
    S = sys.argv[2]
    # vocab filename
    F = sys.argv[3]
    outfile = sys.argv[4]

    from preprocessor import Preprocessor  # for pickle
    #from gensim.models import Preprocessor  # for pickle
    from gensim.models.word2vec import Text8Corpus

    sentences = Text8Corpus(infile)
    prep = Preprocessor(sentences, F, S)
    prep.prep_text(8, sentences, outfile)
class ALE:
    actions = [np.uint8(0), np.uint8(1), np.uint8(3), np.uint8(4), np.uint8(11), np.uint8(12)]
    current_points = 0
    next_screen = ""
    game_over = False
    skip_frames = None
    display_screen = "true"
    game_ROM = None
    fin = ""
    fout = ""
    preprocessor = None

    def __init__(self, display_screen, skip_frames, game_ROM):
        """
        Initialize ALE class. Creates the FIFO pipes, launches ./ale and does
        the "handshake" phase of communication

        @param display_screen: bool, whether to show the game on screen or not
        @param skip_frames: int, number of frames to skip in the game emulator
        @param game_ROM: location of the game binary to launch with ./ale
        """
        self.display_screen = display_screen
        self.skip_frames = skip_frames
        self.game_ROM = game_ROM

        #: create FIFO pipes
        os.system("mkfifo ale_fifo_out")
        os.system("mkfifo ale_fifo_in")

        #: launch ALE with appropriate commands in the background
        command = ('./../libraries/ale/ale -max_num_episodes 0 -game_controller fifo_named '
                   '-disable_colour_averaging true -run_length_encoding false '
                   '-frame_skip ' + str(self.skip_frames) +
                   ' -display_screen ' + self.display_screen + " " + self.game_ROM + " &")
        os.system(command)

        #: open communication with pipes
        self.fin = open('ale_fifo_out')
        self.fout = open('ale_fifo_in', 'w')

        input = self.fin.readline()[:-1]
        size = input.split("-")  # saves the image sizes (160*210) for breakout

        #: first thing we send to ALE is the output options- we want to get only
        #  image data and episode info (hence the zeros)
        self.fout.write("1,0,0,1\n")
        self.fout.flush()  # send the lines written to pipe

        #: initialize the variables that we will start receiving from ./ale
        self.next_image = []
        self.game_over = True
        self.current_points = 0

        #: initialise preprocessor
        self.preprocessor = Preprocessor()

    def new_game(self):
        """
        Start a new game when all lives are lost.
        """
        #: read from ALE: game screen + episode info
        self.next_image, episode_info = self.fin.readline()[:-2].split(":")
        self.game_over = bool(int(episode_info.split(",")[0]))
        self.current_points = int(episode_info.split(",")[1])

        #: send the first command
        # first command has to be 1,0 or 1,1, because the game starts when you press "fire!"
        self.fout.write("1,0\n")
        self.fout.flush()
        self.fin.readline()

        #: preprocess the image and add the image to memory D using a special add function
        #self.memory.add_first(self.preprocessor.process(self.next_image))
        return self.preprocessor.process(self.next_image)

    def end_game(self):
        """
        When all lives are lost, end_game adds the last frame to memory and
        resets the system
        """
        #: tell the memory that we lost
        # self.memory.add_last()  # this will be done in Main.py

        #: send reset command to ALE
        self.fout.write("45,45\n")
        self.fout.flush()
        self.game_over = False  # just in case, but new_game should do it anyway

    def move(self, action_index):
        """
        Sends action to ALE and reads the response

        @param action_index: int, the index of the chosen action in the list of available actions
        """
        #: Convert index to action
        action = self.actions[action_index]

        #: Generate a random number for the action of player B
        action_b = random.choice(range(255))

        #: Write and send to ALE
        self.fout.write(str(action) + "," + str(action_b) + "\n")
        #print "sent action to ALE: ", str(action)+",0"
        self.fout.flush()

        #: Read from ALE
        line = self.fin.readline()
        try:
            self.next_image, episode_info = line[:-2].split(":")
            #print "got correct info from ALE: image + ", episode_info
        except:
            print "got an error in reading stuff from ALE"
            traceback.print_exc()
            print line
            exit()
        self.game_over = bool(int(episode_info.split(",")[0]))
        self.current_points = int(episode_info.split(",")[1])

        return self.current_points, self.preprocessor.process(self.next_image)
from scipy.cluster.hierarchy import linkage, fcluster
from dionysus import PairwiseDistances, ExplicitDistances
import numpy as np


def bench_cluster(X, y, pca_n_comp):
    n = len(np.unique(y))
    pca = PCA(pca_n_comp)
    X_ = pca.fit_transform(X)
    sc = SpectralClustering(n)
    km = KMeans(n)
    sc_pred = sc.fit_predict(X_)
    km_pred = km.fit_predict(X_)
    distances = PairwiseDistances(X_.tolist())
    distances = ExplicitDistances(distances)
    singlel_pred = fcluster(linkage(ssd.squareform(distances.distances)), n, criterion='maxclust')
    print "single-linkage clustering prediction:", singlel_pred
    print "single-linkage clustering score:", adjusted_rand_score(y, singlel_pred), mutual_info_score(y, singlel_pred)
    print "spectral clustering prediction:", sc_pred
    print "spectral clustering score:", adjusted_rand_score(y, sc_pred), mutual_info_score(y, sc_pred)
    print "kmeans clustering prediction", km_pred
    print "kmeans clustering score:", adjusted_rand_score(y, km_pred), mutual_info_score(y, km_pred)
    print "ground truth labels", y


if __name__ == "__main__":
    funcs = [word_lengths_funcs, sentence_lengths_funcs, ratio_most_n_common_words,
             ratio_length_of_words_texts,
             lambda text: ratio_length_of_words_texts(text, 8, ge)]
    pp = Preprocessor(Prepreprocessor, funcs, use_tfidf=20)
    X, y = pp.process(['../data/abstracts/', '../data/sports', '../data/reviews'])
    bench_cluster(X, y, 3)
def classify(self):
    t1 = time.time()

    # Schedule a crawl job with the query
    try:
        crawler = Search(self.search_query)
        crawler.googleSearch()
    except Exception as e:
        print e
        print "Error in initializing Google search"
    t2 = time.time()
    print "Google search done in " + str(t2 - t1) + " secs"

    # Extract data crawled
    try:
        crawler.get_crawled_urls()
    except Exception as e:
        print e
        print "Error in extracting crawl data"
    t3 = time.time()
    print "Test data extraction done in " + str(t3 - t2) + " secs"

    # Preprocess test data
    try:
        preproc_test = Preprocessor(crawler.all_urls)
        preproc_test.preprocessor_main()
    except Exception as e:
        print e
        print "Error in preprocessing crawl data"
    t4 = time.time()
    print "Test data preprocessing done in " + str(t4 - t3) + " secs"

    # Send a search request to Dig server with the query
    dig_search = Dig_Search(self.search_query)
    dig_search.search_request()
    t5 = time.time()
    print "Dig Search done in " + str(t5 - t4) + " secs"

    # Extract results returned by search query
    dig_search.dig_extraction()
    t6 = time.time()
    print "Dig extraction done in " + str(t6 - t5) + " secs"

    # Preprocess the search results
    try:
        preproc_train = Preprocessor(dig_search.urls_dig)
        preproc_train.preprocessor_main()
        dig_search.filter_dig_result(preproc_train.data)
    except Exception as e:
        print e
        print "Error in preprocessing training data"
    t7 = time.time()
    print "Training data preprocessing done in " + str(t7 - t6) + " secs"

    # Compute tfidf vectors of data
    try:
        tfidf_train = Tfidf_Vectorize(dig_search.urls_dig)
        tfidf_train.tfidf_vectorize_train()
        tfidf_train.tfidf_vectorize_test(preproc_test.data)
    except Exception as e:
        print e
        print "Error in computing tfidf vectorization"
    t9 = time.time()
    print "Tfidf computation done in " + str(t9 - t7) + " secs"

    # Compute similarity of training data with its centroid vector
    try:
        sim_train = Similarity(tfidf_train.tfidf_centroid_train,
                               tfidf_train.features_train, tfidf_train.tfidf_train)
        similarity_train = sim_train.similarity_main()
    except Exception as e:
        print e
        print "Error in computing cosine similarity"
    t10 = time.time()
    print "Training data similarity computation done in " + str(t10 - t9) + " secs"

    # Compute similarity of test data with training data
    try:
        sim_test = Similarity(tfidf_train.tfidf_centroid_train,
                              tfidf_train.features_train, tfidf_train.tfidf_test)
        similarity_test = sim_test.similarity_main()
    except Exception as e:
        print e
        print "Error in computing cosine similarity"
    t11 = time.time()
    print "Similarity computation done in " + str(t11 - t10) + " secs"
    print "Total time = " + str(t11 - t1)

    evaluator = Evaluation(similarity_train, similarity_test)
    urls_classified = evaluator.compare_similarity(preproc_test)
    classified_output = self.formatOutput(urls_classified)
    return classified_output
def classify(self):
    t1 = time.time()

    # Schedule a crawl job with the query
    try:
        crawler = Search(self.search_query)
        crawler.googleSearch()
    except Exception as e:
        print "Error in initializing Google search"
    t2 = time.time()
    print "Google search done in " + str(t2 - t1) + " secs"

    # Extract data crawled
    try:
        crawler.get_crawled_urls()
    except Exception as e:
        print "Error in extracting crawl data"
    t3 = time.time()
    print "Test data extraction done in " + str(t3 - t2) + " secs"

    # Preprocess test data
    try:
        preproc_test = Preprocessor(crawler.all_urls)
        preproc_test.preprocessor_main()
    except Exception as e:
        print e
        print "Error in preprocessing crawl data"
    t4 = time.time()
    print "Test data preprocessing done in " + str(t4 - t3) + " secs"

    # Send a search request to Dig server with the query
    dig_search = Dig_Search(self.search_query)
    dig_search.search_request()
    t5 = time.time()
    print "Dig Search done in " + str(t5 - t4) + " secs"

    # Extract results returned by search query
    dig_search.dig_extraction()
    t6 = time.time()
    print "Dig extraction done in " + str(t6 - t5) + " secs"

    # Preprocess the search results
    try:
        preproc_train = Preprocessor(dig_search.urls_dig)
        preproc_train.preprocessor_main()
        dig_search.filter_dig_result(preproc_train.data)
    except Exception as e:
        print e
        print "Error in preprocessing training data"
    t7 = time.time()
    print "Training data preprocessing done in " + str(t7 - t6) + " secs"

    # Compute tfidf vectors of data
    try:
        tfidf_train = Tfidf_Vectorize(dig_search.urls_dig)
        tfidf_train.tfidf_vectorize_train()
        tfidf_train.tfidf_vectorize_test(preproc_test.data)
    except Exception as e:
        print e
        print "Error in computing tfidf vectorization"
    t9 = time.time()
    print "Tfidf computation done in " + str(t9 - t7) + " secs"

    # Compute similarity of training data with its centroid vector
    try:
        sim_train = Similarity(tfidf_train.tfidf_centroid_train,
                               tfidf_train.features_train, tfidf_train.tfidf_train)
        similarity_train = sim_train.similarity_main()
    except Exception as e:
        print e
        print "Error in computing cosine similarity"
    t10 = time.time()
    print "Training data similarity computation done in " + str(t10 - t9) + " secs"

    # Compute similarity of test data with training data
    try:
        sim_test = Similarity(tfidf_train.tfidf_centroid_train,
                              tfidf_train.features_train, tfidf_train.tfidf_test)
        similarity_test = sim_test.similarity_main()
    except Exception as e:
        print e
        print "Error in computing cosine similarity"
    t11 = time.time()
    print "Similarity computation done in " + str(t11 - t10) + " secs"
    print "Total time = " + str(t11 - t1)

    evaluator = Evaluation(similarity_train, similarity_test)
    similarity_count = evaluator.compare_similarity(preproc_test)
    avg_train_similarity = numpy.mean(similarity_train)
    epsilon = 0.4 * avg_train_similarity

    classifier_output = open("output/" + self.search_query.replace(' ', '_') + "2.html", "w")
    urls_classified = []

    tfidf_tr = tfidf_train.tfidf_centroid_train
    tfidf_tr = sorted(tfidf_tr, key=lambda tfidf: tfidf[1], reverse=True)

    for sim in similarity_count:
        url_desc = {}
        url_desc['Test_url'] = ("<a href='" + preproc_test.data[sim[0]]['url'] + "'>" +
                                preproc_test.data[sim[0]]['url'] + "</a>")
        if sim[1] >= (avg_train_similarity - epsilon):
            url_desc['Classifier Output'] = True
        else:
            url_desc['Classifier Output'] = False
        url_desc['Similarity Score'] = sim[1]
        url_desc['Average Training Similarity'] = avg_train_similarity
        tfidf_url = tfidf_train.tfidf_test[sim[0]]
        tfidf_url = sorted(tfidf_url, key=lambda tfidf: tfidf[1], reverse=True)
        url_desc['Top Test Keywords'] = ", ".join([tfidf[0] for tfidf in tfidf_url[0:20]])
        urls_classified.append(url_desc)

    _json2conv = {"": urls_classified}
    classifier_output.write(
        "<html><h2 align='center' style='text-decoration:underline'>Classifier Output</h2>"
        "<h2 align='center'>Query : " + self.search_query + "</h2>"
        "<h2 align='center'>Top Train Keywords : " +
        ", ".join([tfidf[0] for tfidf in tfidf_tr[0:20]]) + "</h2><body>" +
        json2html.convert(json=_json2conv,
                          table_attributes="border=2, cellspacing=0, cellpadding=5, text-align='center'") +
        "</body></html>")
    classifier_output.close()
def analyze(snd_pipe, db_path, pp_cfg, parser_cfg, srcFiles, use_pipeline=False,
            analyzer_process=1, pp_process=1, parser_process=1):
    db = DatabaseManager()
    pp_list = [Preprocessor(**pp_cfg)
               for i in range(pp_process if use_pipeline else analyzer_process)]
    parser_list = [Parser(**parser_cfg)
                   for i in range(parser_process if use_pipeline else analyzer_process)]
    numFiles = len(srcFiles)
    use_pipeline = use_pipeline

    t_0 = datetime.datetime.now()

    projInfo = {}
    projInfo['predefined'] = pp_list[0].preprocess_predef()

    task_queue = Queue()
    done_queue = Queue()

    for i, srcFile in enumerate(srcFiles):
        task_queue.put(srcFile)
    for i in range(len(pp_list)):
        task_queue.put('STOP')

    if not use_pipeline:
        analyzer_p_list = [Process(target=analyzer_worker, args=(pp, parser, task_queue, done_queue))
                           for pp, parser in zip(pp_list, parser_list)]
        for analyzer_p in analyzer_p_list:
            analyzer_p.start()
        for i, srcFile in enumerate(srcFiles):
            #print 'analyze: [%d/%d]' % (i,numFiles), srcFile
            projInfo[srcFile] = done_queue.get()
            snd_pipe.send((i, numFiles, srcFile))
            if snd_pipe.poll():
                for analyzer_p in analyzer_p_list:
                    analyzer_p.terminate()
                for analyzer_p in analyzer_p_list:
                    analyzer_p.join()
                Preprocessor.clearTokenCache()
                snd_pipe.send('STOPPED')
                print 'analyze: canceled'
                return
        for analyzer_p in analyzer_p_list:
            analyzer_p.join()
    else:
        pp_queue = Queue()
        pp_p_list = [Process(target=preprocessor_worker, args=(pp, task_queue, pp_queue))
                     for pp in pp_list]
        for pp_p in pp_p_list:
            pp_p.start()
        parser_p_list = [Process(target=parser_worker, args=(parser, pp_queue, done_queue))
                         for parser in parser_list]
        for parser_p in parser_p_list:
            parser_p.start()
        for i, srcFile in enumerate(srcFiles):
            #print 'analyze: [%d/%d]' % (i,numFiles), srcFile
            projInfo[srcFile] = done_queue.get()
            snd_pipe.send((i, numFiles, srcFile))
            if snd_pipe.poll():
                for pp_p in pp_p_list:
                    pp_p.terminate()
                for parser_p in parser_p_list:
                    parser_p.terminate()
                for pp_p in pp_p_list:
                    pp_p.join()
                for parser_p in parser_p_list:
                    parser_p.join()
                Preprocessor.clearTokenCache()
                snd_pipe.send('STOPPED')
                print 'analyze: canceled'
                return
        for i in range(len(parser_p_list)):
            pp_queue.put('STOP')
        for pp_p in pp_p_list:
            pp_p.join()
        for parser_p in parser_p_list:
            parser_p.join()

    t_1 = datetime.datetime.now()

    db.createDB(db_path)
    db.addData(projInfo)
    db.saveDB()
    db.closeDB()

    print 'analyze: done', t_1 - t_0
    snd_pipe.send((numFiles, numFiles, 'Generating Database ... done'))
def __init__(self):
    self.readNames()
    self.count = 0
    self.column = 1
    Preprocessor.__init__(self)