def __data_generation(self, indices):
    x = np.empty((self.batch_size, *self.img_size))
    if self.labels is not None:  # training phase
        if self.n_classes == 2:
            y = np.empty((self.batch_size, ), dtype=np.float32)
        else:
            y = np.empty((self.batch_size, self.n_classes), dtype=np.float32)
        for i, idx in enumerate(indices):
            image = Preprocessor.preprocess(self.img_dir + self.list_ids[idx] + ".dcm")
            if self.labels.iloc[idx]['any'] == 1:
                image = self.augment_funcs[random.randint(0, self.n_augment)](image)
            image = np.array(image)
            image = np.repeat(image[..., np.newaxis], 3, -1)
            x[i, ] = image
            if self.n_classes == 2:
                y[i, ] = self.labels.iloc[idx]['any']
            elif self.n_classes == 5:
                y[i, ] = self.labels.iloc[idx, 1:]
            else:
                y[i, ] = self.labels.iloc[idx]
        return x, y
    else:  # test phase
        for i, idx in enumerate(indices):
            image = Preprocessor.preprocess(self.img_dir + self.list_ids[idx] + ".dcm")
            image = np.repeat(image[..., np.newaxis], 3, -1)
            x[i, ] = image
        return x
def process_data(data):
    neutralIndices = []
    emptyIndices = []
    labels = data.iloc[:, 0].tolist()
    for i in range(len(labels)):
        if labels[i] == -2:
            data.at[i, 2] = -1
        elif labels[i] == 2:
            data.at[i, 2] = 1
        # elif labels[i] == 0:
        #     neutralIndices.append(i)
    # processed = data.drop(neutralIndices)
    processed = data
    # upsample class
    multiplier = upsample_multiplier(processed, -1, 1)
    isNeg = processed[2] == -1
    df_try = processed[isNeg]
    data2 = processed.append([df_try] * multiplier, ignore_index=True)
    multiplier = upsample_multiplier(processed, 0, 1)
    isNeut = processed[2] == 0
    df_try = processed[isNeut]
    data2 = data2.append([df_try] * multiplier, ignore_index=True)
    labels2 = data2.iloc[:, 0].tolist()
    # for i in range(len(labels2)):
    #     if labels2[i] == -1:
    #         data2.at[i, 2] = 0
    p = Preprocessor()
    texts = data2.iloc[:, 1].tolist()
    processed_texts = p.preprocess(texts)
    labels = data2.iloc[:, 0].tolist()
    write_csv(processed_texts, labels)
def main():
    pp = Preprocessor()
    print 'processing custom data, computing bows...'
    tdpath = 'dataset/test/sms-data'
    pp.process_custom_data(tdpath)
    fm = FeatureModel()
    print 'converting custom data to fvs...'
    fm.compute_custom_fv_matrix('custom')
    tdpath = 'bin_data/custom_fv.npy'
    cpath = 'bin_data/mnb-classifier.npy'
    data = np.load('bin_data/custom-data.npy').item()
    tester = Tester(tdpath, cpath)
    print 'predicting labels for custom data...'
    results = tester.predict_labels_for_custom_data(data)
    with open('output/results.txt', 'w') as textfile:
        for msg in results:
            line = '%s -> %s\n' % (msg, results[msg])
            textfile.write(line)
    print 'Results written to results.txt'
def __init__(self):
    self.preProcessor = Preprocessor()
    self.lstmSize = 256
    self.lstmLayers = 1
    self.batchSize = 500
    self.learningRate = 0.0005
    self.seqL = 100
def __init__(self, repo_path, model_type, use_translated_data, term_similarity_type, lang_code,
             link_threshold_interval=5, output_sub_dir="", print_result=False,
             github_projects_dir=default_git_dir):
    """
    :param repo_path: the repo path in github
    :param model_type: vsm, gvsm, lda
    :param use_translated_data: whether to use the translated data or not
    :param term_similarity_type: for gvsm only.
    :param link_threshold_interval: the sample rate for the threshold
    :param output_sub_dir: the subdirectory for results under Experiment2/result/. Experiments are
        grouped by the time the script was run.
    """
    self.git_projects_dir = github_projects_dir
    self.use_translated_data = use_translated_data
    self.model_type = model_type
    self.repo_path = repo_path
    self.lang_code = lang_code
    self.data_dir = os.path.join(self.git_projects_dir, repo_path)
    self.preprocessor = Preprocessor()
    self.preprocessed_dataset()  # Create clean tokens if they do not exist
    self.link_threshold_interval = link_threshold_interval
    self.term_similarity_type = term_similarity_type
    self.output_sub_dir = output_sub_dir
def runModel(self, testSize, debug):
    self.trainVectorizer()
    d = self.getXy('train.tsv')
    if debug:
        X_train, X_test, y_train, y_test = train_test_split(
            d['X'], d['y'], test_size=testSize, random_state=5)
    else:
        X_train = d['X']
        y_train = d['y']
        d_test = self.getXy('test.tsv')
        X_test = d_test['X']
        urlid = d_test['urlid']
    self.fit(X_train, y_train)
    print "10 Fold CV Score: ", np.mean(
        cross_val_score(self.model, d['X'], d['y'], cv=10, scoring='roc_auc'))
    y_predicted = self.predict(X_test)
    if debug:
        print 'Topic Model AUC Score: %f' % roc_auc_score(y_test, y_predicted)
    else:
        Pre = Preprocessor()
        Pre.generateSubmission('submission_12.csv', urlid, y_predicted)
    P.figure()
    P.hist(y_predicted, bins=100)
    P.show()
def integrate_images(self):
    self.judge_user_input_or_not()
    image_type_list = list()
    # image_type_list.append(self.var_char1.get())
    # image_type_list.append(self.var_char2.get())
    # image_type_list.append(self.var_char3.get())
    # image_type_list.append(self.var_char4.get())
    csv_handler = CSVParser(self.csv_input_path.get())
    data_dict = csv_handler.get_dict_from_csv()
    integrate_handler = Preprocessor(self.image_input_path.get(), self.text_output_path.get())
    # group_number = integrate_handler.get_group_number_all()
    self.img_group_number.set(u"Total number: " + str(len(data_dict)) + u" groups")
    # keyword_dict = integrate_handler.get_keyword_dict()
    for k, v in data_dict.iteritems():
        return_message = integrate_handler.integrate_images(v)
        if len(return_message) > 0:
            self.t_show.insert(
                END, u"Ref no. " + return_message + u" has not been finished!\n")
class TestPreprocessor(unittest.TestCase):

    def setUp(self):
        cur_dir = os.path.dirname(os.path.realpath(__file__))
        self.tpl_dir = os.path.join(cur_dir, '..', 'UnitTests', 'test_templates')
        self.preprocessor = Preprocessor(self.tpl_dir)

    def process_content(self, content, dict, expected):
        actual = self.preprocessor.process_content_with_dict(content, dict)
        self.assertEqual(actual, expected)

    def test_ifs(self):
        dict = {'basic_key': '_trivial_', 'yes_key': True, 'no_key': False}
        self.process_content("basic%basic_key%basic", dict, "basic_trivial_basic")
        self.process_content("%if yes_key%%basic_key%%endif%", dict, "_trivial_")
        self.process_content("%if no_key%%basic_key%%endif%", dict, "")
        self.process_content("aa%if yes_key%%if no_key%%basic_key%%endif%%endif%bb", dict, "aabb")
        self.process_content("start %if non_existant_key%non_existant%endif% finish", dict, "start finish")
        self.process_content("start %if non_existant_key%%if yes_key%%basic_key%%endif%%endif% finish", dict, "start finish")
        self.process_content("start %if yes_key%yes%else%no%endif%", dict, "start yes")
        self.process_content("start %if no_key%yes%else%no%endif%", dict, "start no")
        self.process_content("start %if non_existant_key%yes%else%non_exist%endif%", dict, "start non_exist")
        self.process_content("%if yes_key%%if no_key%yes%else%no%endif%%endif%", dict, "no")
        self.process_content("%if no_key% hello1 %else% %if yes_key% hello2 %else% hello3 %endif% %endif%", dict, " hello2 ")

    def test_includes(self):
        content = self.preprocessor.process_tpl_name_with_dict('test_include', {'a': True, 'aa': 'a'})
        self.assertEqual(content, '<html><body> a </body></html>')
class TestPreprocessor(unittest.TestCase):

    def setUp(self):
        self.pp = Preprocessor()

    def testPreprocessNull(self):
        # A null string should return None
        result = self.pp.clean('')
        self.assertEquals(result, None)

    def testPreprocessOneHiragana(self):
        # A single hiragana should return None
        result = self.pp.clean(u'あw(^^)w')
        self.assertEquals(result, None)

    def testPreprocessZenkaku(self):
        # Zenkaku characters should be converted and cleaned
        result = self.pp.clean(u'全角です123ww')
        self.assertEquals(result, u'全角です123')

    def testPreprocessNakano(self):
        # Test that the preprocessing works as intended
        test_string = u'私の名前は中野ですwwww>あふぉ(^^)o'
        result = self.pp.clean(test_string)
        expected = u'私の名前は中野です'
        self.assertEquals(result, expected)
def get_preprocessor(config, features):
    print("Fitting preprocessor...")
    preprocessor = Preprocessor(normalize=config["normalize"],
                                reduce_features=config["reduce_features"],
                                reducer_type=config["reducer_type"],
                                explained_variance=config["explained_variance"])
    preprocessor.train(features)
    return preprocessor
def __init__(self, outputFormat='flat', useJarfileManifest=True, useChromeManifest=False):
    self.outputFormat = outputFormat
    self.useJarfileManifest = useJarfileManifest
    self.useChromeManifest = useChromeManifest
    self.pp = Preprocessor()
def __init__(self):
    self.root = Tk()
    self.root.title('New')
    self.file = None
    self.textChanged = False
    self.idh = IDHolder()
    self.pre = Preprocessor()
    self.lxa = LexAnalyzer(self.idh)
    self.mdc = MidCoder(self.idh)
def evaluate_text(self, text):
    pre = Preprocessor()
    entry = TextEntry()
    entry.body = text
    pre.entries = [entry]
    predict = self.clf.predict(pre.get_clean_data())
    if self.clf2 is not None:
        predict2 = self.clf2.predict(pre.get_clean_data())
        return (predict + predict2) / 2
    return predict
def create_config_file(self, path, extra=None):
    '''Creates the given config file.

    A config file is generated by taking the corresponding source file and
    replacing occurrences of "@VAR@" by the value corresponding to "VAR" in
    the substs dict.

    Additional substs are defined according to the file being treated:
        "srcdir" for the path to its source directory
        "relativesrcdir" for its source directory relative to the top
        "DEPTH" for the path to the top object directory
    '''
    input = self.get_input(path)
    pp = Preprocessor()
    pp.context.update(self.substs)
    pp.context.update(top_srcdir=self.get_top_srcdir(path))
    pp.context.update(srcdir=self.get_file_srcdir(path))
    pp.context.update(relativesrcdir=self.get_relative_srcdir(path))
    pp.context.update(DEPTH=self.get_depth(path))
    if extra:
        pp.context.update(extra)
    pp.do_filter('attemptSubstitution')
    pp.setMarker(None)
    pp.out = FileAvoidWrite(path)
    pp.do_include(input)
    return pp.out.close()
def preprocess(self, path_sample, dbnum, week_num, week_stride):
    logging.info('........................... Preprocessing Data ..................................')
    self.week_num = week_num
    self.week_stride = week_stride
    self.eval_week = range(self.week_num + 1, self.week_num + 1 + self.week_stride)
    logging.info('Fitting evaluator with week splits at %d', week_num)
    self.preprocessor = Preprocessor(path_sample, dbnum, self.weight_post_lookup,
                                     self.weight_user_lookup, self.start_date,
                                     verbose=self.verbose)
    # preprocessor.preview_interaction_distribution()
    # Return every interaction incurred before the cutoff week as a training interaction and
    # every interaction incurred after the cutoff week as a testing interaction.
    # Post interactions are user2post, user interactions are user2user.
    self.train_post_inter, self.test_post_inter, \
        self.train_user_inter, self.test_user_inter = self.preprocessor.partition_data(self.week_num)
    logging.info('At week %d, recommend for upto week %d', self.week_num, self.eval_week)
    # users = [pid for pid in self.train_post_inter.keys()]
    # Note that this gives all users, though some have 0 interactions
    self.users = set(self.train_post_inter.keys()).union(set(self.test_post_inter.keys()))
    logging.info('- Currently, %d users played in this forum', len(self.users))
    logging.info('- in total, %d users are enrolled in this forum', len(self.users))
    logging.info('Calculating the overall interactions...')
    self.all_post_inters = {pid: pd.concat([self.train_post_inter[pid], self.test_post_inter[pid]])
                            for pid in self.users}
    # print([len(x) for x in self.all_post_inters.values()])
    all_post_inters_df = pd.DataFrame()
    for pid, df in self.all_post_inters.items():
        df['PersonID'] = pid
        all_post_inters_df = all_post_inters_df.append(df)
    self.all_post_inters_df = all_post_inters_df
    # logging.debug(self.all_post_inters_df.Weeknum.unique())
    # Essentially a dictionary of dictionaries keyed by [pid],
    # because each [pid] corresponds to a different test set
    for pid in self.users:
        self.eval_nids_per_person[pid] = self.extract_evaluation_inters_for_pid(pid)
    all_notes = set()
    num_active_users = 0
    for pid, inter in self.train_post_inter.items():
        all_notes = all_notes.union(set(inter['NoteID'].unique()))
        if len(inter):
            num_active_users += 1
    logging.info('- the forum currently has %d posts', len(all_notes))
    logging.info('- %d users have made interactions', num_active_users)
    self.unrec_noteids = self.preprocessor.get_unsharable_posts()
    self.hierarchy = self.preprocessor.hierarchy
    self.all_note_contents = self.preprocessor.all_note_contents
def trainer(op, file):
    p = Preprocessor()
    X_train, labels_train, X_test, labels_test = p.load_data()
    Y_train = one_hot_matrix(labels=labels_train, C=10)
    Y_test = one_hot_matrix(labels=labels_test, C=10)
    X_train = X_train.T / 255
    X_test = X_test.T / 255
    model(X_train, Y_train, X_test, Y_test, op, file)
def train():
    fileLoader = FileLoader("data/orginal", "data/result")
    files = fileLoader.getFilePairs()
    samples = []
    print("Sample extracting")
    for file in files:
        sampleExtracter = SampleExtracter(file[0], file[1], 10)
        samples += sampleExtracter.getSamples()
    print("Preprocessing")
    p = Preprocessor(samples)
    samples = p.getTrainingData()
    c = Classifier(samples[:100000], 10)
def __init__(self, isReversed=False, isStop=False, isStem=False, default=None):
    dict.__init__(self)
    self.default = default
    self._isReversed = isReversed
    self._isStop = isStop
    self._isStem = isStem
    self.__pre = Preprocessor(isReversed=self._isReversed, isStop=self._isStop, isStem=self._isStem)
def __init__(self, outputFormat='flat', useJarfileManifest=True, useChromeManifest=False):
    self.outputFormat = outputFormat
    self.useJarfileManifest = useJarfileManifest
    self.useChromeManifest = useChromeManifest
    self.pp = Preprocessor()
    self.topsourcedir = None
    self.sourcedirs = []
    self.localedirs = None
    self.l10nbase = None
    self.l10nmerge = None
    self.relativesrcdir = None
    self.rootManifestAppId = None
def pictures_html_block(self):
    pictures_preprocessor = Preprocessor(tpl_dir)
    dict = {
        "img_size_l": "1836x2448",
        "img_size_m": "1224x1632",
        "img_size_s": "612x816",
        "img_width_t": 150,
        "img_height_t": 150,
    }
    text = ""
    for i in xrange(1, 16):
        text = text + pictures_preprocessor.process_tpl_name_with_dict("asset", dict)
    return text
def album_list_html_block(self):
    albums_preprocessor = Preprocessor(tpl_dir)
    dict = {
        "number_of_pictures": 152,
        "album_name": "My Photos",
        "album_share_href": "/album.html",
        "img_width_t": 150,
        "img_height_t": 150,
        "poster_image_src": "/test_image.jpeg",
    }
    text = ""
    for i in xrange(1, 4):
        text = text + albums_preprocessor.process_tpl_name_with_dict("album_list_item", dict)
    return text
def load_terminal_design_data(raw_dataset_path, grammar_file):
    graphs = rd.load_graphs(grammar_file)
    rules = [rd.create_rule_from_graph(g) for g in graphs]
    all_labels = set()
    for rule in rules:
        for node in rule.lhs.nodes:
            all_labels.add(node.attrs.label)
    all_labels = sorted(list(all_labels))
    preprocessor = Preprocessor(all_labels=all_labels)
    with open(raw_dataset_path, newline='') as log_file:
        reader = csv.DictReader(log_file)
        all_link_features = []
        all_link_adj = []
        all_results = []
        max_nodes = 0
        for row in reader:
            rule_seq = ast.literal_eval(row['rule_seq'])
            result = float(row['result'])
            all_results.append(result)
            # Build a robot from the rule sequence
            robot_graph = make_initial_graph()
            for r in rule_seq:
                matches = rd.find_matches(rules[r].lhs, robot_graph)
                # Always use the first match
                robot_graph = rd.apply_rule(rules[r], robot_graph, matches[0])
            adj_matrix, link_features, _ = preprocessor.preprocess(robot_graph)
            all_link_features.append(link_features)
            all_link_adj.append(adj_matrix)
            max_nodes = max(max_nodes, adj_matrix.shape[0])
    all_adj_matrix_pad, all_link_features_pad, all_masks = [], [], []
    for adj_matrix, link_features in zip(all_link_adj, all_link_features):
        adj_matrix_pad, link_features_pad, masks = preprocessor.pad_graph(
            adj_matrix, link_features, max_nodes=max_nodes)
        all_adj_matrix_pad.append(adj_matrix_pad)
        all_link_features_pad.append(link_features_pad)
        all_masks.append(masks)
    return all_link_features_pad, all_adj_matrix_pad, all_masks, all_results
def get_preprocessor_names():
    result = []
    for clazz in Preprocessor.__subclasses__():
        result.append(clazz.get_name())
    return result
def getEdgeDistByMask(self, mask3D, setID, sigma=4.5):
    result = Preprocessor.loadThresholdMask(setID)
    # result = generic_gradient_magnitude(result, sobel).astype(np.float32)
    # result = nd.filters.gaussian_filter(result, sigma)
    result = morph.distance_transform_cdt(result, metric='taxicab').astype(np.float32)
    return result[mask3D]
def __init__(self, fo_lang_code):
    # Set up the Stanford NLP server:
    # java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer
    #   -preload tokenize,ssplit,pos,lemma,parse,depparse
    #   -status_port 9000 -port 9000 -timeout 15000
    #   -serverProperties StanfordCoreNLP-chinese.properties
    self.parser = CoreNLPParser()
    self.fo_lang_code = fo_lang_code
    self.preprocessor = Preprocessor()
def main(_):
    # Define data pre-processors
    load_shape = [80, 80, 3]
    shape_transfer = [64, 64, 3]
    crop_sz = (64, 64)
    preprocessor = Preprocessor(target_shape=load_shape, src_shape=(96, 96, 3))
    preprocessor_lin = Preprocessor(target_shape=shape_transfer, src_shape=(96, 96, 3))

    # Initialize the data generators
    data_gen_ssl = STL10('train_unlabeled')
    data_gen_ftune = STL10('train')
    data_test = STL10('test')

    # Define the network and SSL training
    model = TRCNet(batch_size=FLAGS.batch_size, im_shape=load_shape, n_tr_classes=6,
                   tag=FLAGS.tag, lci_patch_sz=42, lci_crop_sz=48, n_layers_lci=4,
                   ae_dim=48, enc_params={'padding': 'SAME'})
    trainer = CINTrainer(model=model, data_generator=data_gen_ssl, pre_processor=preprocessor,
                         crop_sz=crop_sz, wd_class=FLAGS.wd, init_lr_class=FLAGS.pre_lr,
                         num_epochs=FLAGS.n_eps_pre, num_gpus=FLAGS.num_gpus,
                         optimizer='adam', init_lr=0.0002, momentum=0.5,  # Parameters for LCI training only
                         train_scopes='features')
    trainer.train_model(None)

    # Get the final checkpoint
    ckpt_dir_model = trainer.get_save_dir()
    ckpt = wait_for_new_checkpoint(ckpt_dir_model, last_checkpoint=None)
    print('Found checkpoint: {}'.format(ckpt))
    ckpt_id = ckpt.split('-')[-1]

    # Train linear classifiers on frozen features
    tag_class = '{}_classifier_ckpt_{}'.format(FLAGS.tag, ckpt_id)
    model = TRCNet(batch_size=FLAGS.batch_size_ftune, im_shape=shape_transfer, tag=tag_class,
                   feats_ids=['conv_1', 'conv_2', 'conv_3', 'conv_4', 'conv_5'],
                   enc_params={'use_fc': False, 'padding': 'SAME'})
    trainer_class = ClassifierTrainer(model=model, data_generator=data_gen_ftune,
                                      pre_processor=preprocessor_lin, optimizer='momentum',
                                      init_lr=FLAGS.ftune_lr, momentum=0.9,
                                      num_epochs=FLAGS.n_eps_ftune, num_gpus=1,
                                      train_scopes='classifier')
    trainer_class.train_model(ckpt)
    ckpt_dir = trainer_class.get_save_dir()

    # Evaluate on the test set
    model.batch_size = 100
    tester = ClassifierTester(model=model, data_generator=data_test, pre_processor=preprocessor_lin)
    acc = tester.test_classifier(ckpt_dir)
    write_experiments_multi(acc, tag_class, FLAGS.tag)
def __init__(self, sess, params, batch_size=256, sample_size=64, epochs=1000,
             image_shape=[256, 256, 3], y_dim=None, z_dim=0, gf_dim=128, df_dim=64,
             gfc_dim=512, dfc_dim=1024, c_dim=3, cg_dim=1, is_train=True, random_seed=4285):
    self.model_name = "DCGAN.model"
    self.sess = sess
    self.batch_size = batch_size
    self.sample_size = sample_size
    self.epochs = epochs
    self.image_shape = image_shape
    self.image_size = image_shape[0]
    self.y_dim = y_dim
    self.z_dim = z_dim
    self.z = None
    self.gf_dim = gf_dim
    """ gf_dim: Dimension of gen (ie decoder of AE) filters in first conv layer. [128] """
    self.df_dim = df_dim
    """ df_dim: Dimension of discrim (ie Dsc + encoder of AE) filters in first conv layer. [64] """
    self.gfc_dim = gfc_dim
    """ as of 28.9: not used """
    self.dfc_dim = dfc_dim
    """ as of 28.9: not used """
    self.c_dim = c_dim
    """ c_dim: Dimension of image color. [3] """
    self.cg_dim = cg_dim
    """ as of 28.9: not used """
    self.params = params
    self.end = False
    self.random_seed = random_seed
    self.isIdeRun = 'lz826' in os.path.realpath(sys.argv[0])
    self.isTraining = True
    target_shape = [self.image_size, self.image_size, 3]
    DCGAN.img_preprocessor = Preprocessor(target_shape=target_shape)
    self.build_model()
def __init__(self, list_of_files, list_of_models, ratios_list, need_to_make_models=True):
    if len(list_of_files) != len(list_of_models):
        raise ValueError("list of files must be same length as list of models as each file needs its own model")
    self.preprocessor_list = []  # ensure the list exists before it is populated below
    for index in range(len(list_of_files)):
        preprocessor_to_add = Preprocessor(list_of_files[index], list_of_models[index],
                                           need_to_create_model=need_to_make_models)
        self.preprocessor_list.append(preprocessor_to_add)
    self.ratios_list = ratios_list
    self.line_number_list = self.preprocessor_list[0].get_random_line_numbers(self.ratios_list)
def get_authors_and_title(text):
    # print text.encode('utf8')
    pattern = u'\x14(.*)\x15'
    m = re.search(pattern, text.split('\n')[0])
    all = m.group(1)
    # print all.encode('utf8')
    authors, title = Preprocessor.extract_authors(all)
    return authors, title
def __init__(self, outputFormat='flat', useJarfileManifest=True, useChromeManifest=False):
    self.outputFormat = outputFormat
    self.useJarfileManifest = useJarfileManifest
    self.useChromeManifest = useChromeManifest
    self.pp = Preprocessor()
    self.topsourcedir = None
    self.sourcedirs = []
    self.localedirs = None
def __init__(self, preprocessor=None, model=None):
    self.preprocessor = preprocessor
    if self.preprocessor is None:
        self.preprocessor = Preprocessor()
        if self.preprocessor.pos_words == []:
            with open('./preprocessor.pkl', 'rb') as file:
                (self.preprocessor.pos_words,
                 self.preprocessor.neg_words,
                 self.preprocessor.ohe_dc,
                 self.preprocessor.ohe_out_columns) = pickle.load(file)
    self.parser = Parser(MONEY)
    if model is None:
        self.model = Model()
        self.model.load('./models')
    else:
        self.model = model
def main(config_filename):
    logger.debug("Starting execution.")
    parameters = Parameters(config_filename, training_mode=True)
    if parameters.preprocessed_data:
        if not isfile(parameters.excel_file) and not isfile(parameters.preprocessed_data_file):
            logger.error("Please, provide a valid Excel file or a valid preprocessed data file.")
            quit()
        if not isfile(parameters.preprocessed_data_file) and isfile(parameters.excel_file):
            logger.info("Loading Excel file.")
            data_frame = read_excel(parameters.excel_file)
            logger.info("Creating documents.")
            docs = data_frame_to_document_list(data_frame)
            logger.info("Storing generated documents.")
            pickle_manager.dump_documents(docs, parameters.preprocessed_data_file)
        logger.info("Preprocessing documents.")
        preprocessor = Preprocessor(stanfordnlp_language_package=parameters.stanfordnlp_language_package,
                                    stanfordnlp_use_gpu=parameters.stanfordnlp_use_gpu,
                                    stanfordnlp_resources_dir=parameters.stanfordnlp_resources_dir,
                                    training_mode=parameters.training_mode)
        preprocessor.preprocess(text_field=parameters.excel_column_with_text_data,
                                preprocessed_data_file=parameters.preprocessed_data_file)
        logger.info("Checking generated data.")
        pickle_manager.check_data(parameters.preprocessed_data_file)
    else:
        if not isfile(parameters.preprocessed_data_file):
            logger.error("The indicated preprocessed data file does not exist.")
            quit()
    logger.info("Extracting features.")
    feature_extractor = FeatureExtractor(nltk_stop_words_package=parameters.nltk_stop_words_package,
                                         vectorizer_name=parameters.vectorizer,
                                         training_mode=parameters.training_mode,
                                         use_lda=parameters.use_lda,
                                         document_adjustment_code=parameters.document_adjustment_code,
                                         remove_adjectives=parameters.remove_adjectives,
                                         synonyms_file=parameters.synonyms_file,
                                         features_file=parameters.features_file)
    X, y, _lemmas = feature_extractor.generate_X_y(class_field=parameters.excel_column_with_classification_data,
                                                   preprocessed_data_file=parameters.preprocessed_data_file)
    logger.info("Splitting dataset into training and test subsets.")
    train_test_split(y, parameters.test_subset_size, parameters.preprocessed_data_file,
                     parameters.force_subsets_regeneration)
    logger.info("Running classifiers.")
    p = classifiers.Pipeline(parameters.classifiers, parameters.cross_validate)
    metadata = pickle_manager.get_docs_metadata(parameters.preprocessed_data_file)
    training_set_indexes = metadata['training_set_indexes'].tolist()
    test_set_indexes = metadata['test_set_indexes'].tolist()
    assert len(training_set_indexes) == len(set(training_set_indexes))
    assert len(test_set_indexes) == len(set(test_set_indexes))
    for elem in feature_extractor.to_remove:
        try:
            training_set_indexes.remove(elem)
        except ValueError:
            test_set_indexes.remove(elem)
    logger.info("Accuracies:")
    p.start(X, y, parameters.number_of_jobs, parameters.set_num_accepted_probs,
            training_set_indexes, test_set_indexes, parameters.resampling)
    logger.debug("Execution completed.")
def _init_test_images(self):
    real_test_image = Preprocessor.preprocess('data/ID_000178e76.dcm')
    test_images = [
        np.zeros((512, 512)),
        np.ones((512, 512)),
        np.random.rand(512, 512),
        real_test_image
    ]
    for i in range(len(test_images)):
        test_images[i] = np.repeat(test_images[i][..., np.newaxis], 3, -1)
    return np.array(test_images)
def main():
    df = load_raw_data(kci_korean_json_filepath)
    preprocessor = Preprocessor()
    df = raw2sentences(preprocessor, df)
    df['flattened_sentences'] = df.apply(lambda x: ' '.join(x['sentences']), axis=1)
    stopwords = preprocessor.stopwords(df['flattened_sentences'], min_df)
    print('Extracting nouns..')
    df['nouns'] = df.apply(lambda x: preprocessor.line2words_nouns(x['flattened_sentences'],
                                                                   stopwords, remove_len=remove_len), axis=1)
    whole_sentences = preprocessor.flatten_whole_sentences(df, 'nouns')
    print('# of documents = %d' % len(whole_sentences))

    # save as .txt
    f = open(whole_units_for_train_txt_filepath, 'w')
    for i in range(len(whole_sentences)):
        data = "%s\n" % whole_sentences[i]
        f.write(data)
    f.close()
    print('Created file:', whole_units_for_train_txt_filepath)

    # process text to tensor
    loader = Quantizer(whole_sentences)
    word_vocab_size = min(n_words, len(loader.idx2word))
    char_vocab_size = min(n_chars, len(loader.idx2char))
    max_word_l = loader.max_word_l
    print('Word vocab size: %d, Char vocab size: %d, Max word length (incl. padding): %d' %
          (word_vocab_size, char_vocab_size, max_word_l))
    log_content = '\n=====\n# of stopwords=%d \n%s\n=====\n# of unique words=%d \n# of unique chars=%d \nmaximum length of a word=%d \n=====\n' % (
        len(stopwords), str(stopwords), word_vocab_size, char_vocab_size, max_word_l)
    write_log(log_dir, 'preprocessing_vocab.log', log_content)

    print('creating an LSTM-CNN with', num_layers, 'layers')
    model = LSTMCNN(char_vocab_size, char_vec_size, feature_maps, kernels, batch_size,
                    seq_length, max_word_l, batch_norm, highway_layers, num_layers,
                    rnn_size, dropout, word_vocab_size, learning_rate, max_grad_norm)
    pickle.dump(parameters, open(model_param_pkl_filepath, "wb"))
    model.save(model_json_filepath)
    Train, Validation, Test = 0, 1, 2
    model.fit_generator(loader.next_batch(Train), loader.split_sizes[Train], max_epochs,
                        loader.next_batch(Validation), loader.split_sizes[Validation],
                        decay_when, learning_rate_decay, save_every, save_epoch_file)
    model.save_weights(model_weights_h5_filepath, overwrite=True)
def get_new_instance(preprocessor_type):
    result = None
    for clazz in Preprocessor.__subclasses__():
        if clazz.get_name() == preprocessor_type:
            result = clazz()
            break
    if result is None:
        raise ValueError("Cannot find preprocessor of type %s" % (preprocessor_type, ))
    return result
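# Usage sketch for the subclass-registry helpers above (get_preprocessor_names /
# get_new_instance). Assumption: every concrete preprocessor subclasses
# Preprocessor and exposes a get_name() classmethod; DummyPreprocessor below is
# illustrative only and does not appear in the original sources.
class DummyPreprocessor(Preprocessor):
    @classmethod
    def get_name(cls):
        return "dummy"

assert "dummy" in get_preprocessor_names()
assert isinstance(get_new_instance("dummy"), DummyPreprocessor)  # looked up by name
# get_new_instance("unknown") would raise ValueError, since no subclass matches.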
class TestLineEndings(unittest.TestCase):
    """
    Unit tests for the Context class
    """

    def setUp(self):
        self.pp = Preprocessor()
        self.pp.out = StringIO()
        self.tempnam = os.tempnam('.')

    def tearDown(self):
        os.remove(self.tempnam)

    def createFile(self, lineendings):
        f = open(self.tempnam, 'wb')
        for line, ending in zip(['a', '#literal b', 'c'], lineendings):
            f.write(line + ending)
        f.close()

    def testMac(self):
        self.createFile(['\x0D'] * 3)
        self.pp.do_include(self.tempnam)
        self.assertEquals(self.pp.out.getvalue(), 'a\nb\nc\n')

    def testUnix(self):
        self.createFile(['\x0A'] * 3)
        self.pp.do_include(self.tempnam)
        self.assertEquals(self.pp.out.getvalue(), 'a\nb\nc\n')

    def testWindows(self):
        self.createFile(['\x0D\x0A'] * 3)
        self.pp.do_include(self.tempnam)
        self.assertEquals(self.pp.out.getvalue(), 'a\nb\nc\n')
def trainClassifier(self, trainLetter, progress, progLab, maxSets):
    # load and preprocess the signal
    signalLoader = SignalLoader(self.chanNum, self.files)
    prpr = Preprocessor(self.chanNum, [])
    signal, stimCode, phaseInSequence = signalLoader.loadSignal()
    self.signal = prpr.preprocess(240, 1E-1, 30E0, self.sf, signal, stimCode, phaseInSequence, 0)
    self.stimulusCode = prpr.stimulusCode
    self.phaseInSequence = prpr.phaseInSequence
    self.targetLetters = sum(trainLetter, [])
    # find the boundaries between characters
    charEnds = self.findCharEnds()
    # split the data into epochs
    em = EpochManager(self.signal, self.stimulusCode, self.phaseInSequence)
    isiList = em.createEpochs()
    # train on the individual characters
    for i in range(len(charEnds)):
        progress["value"] = i
        progLab["text"] = ("Trénujem znak: {}/{}").format(i + 1, len(charEnds))
        print "Averaging character:", i, "\n"
        hi = charEnds[i]
        if i == 0:
            lo = 0
        else:
            lo = charEnds[i - 1]
        rowColBinList = em.getAveragedEpochs(hi, lo, isiList, maxSets)
        finalDataArray = rowColBinList
        classMarks = self.prepairTargetArray(self.getCharIndexes(self.targetLetters[i]))
        if self.firsttrain == 1:
            self.cl.learn(finalDataArray, classMarks, 0)
            self.firsttrain = 0
        else:
            self.cl.learn(finalDataArray, classMarks)
def preprocess(input, parser, defines={}):
    '''
    Preprocess the file-like input with the given defines, and send the
    preprocessed output line by line to the given parser.
    '''
    pp = Preprocessor()
    pp.context.update(defines)
    pp.do_filter('substitution')
    pp.out = PreprocessorOutputWrapper(pp, parser)
    pp.do_include(input)
def make_preprocessor(config_status):
    pp = Preprocessor()
    pp.setLineEndings("lf")
    pp.setMarker("#")
    pp.do_filter("substitution")
    # Might need 'substs' too.
    defines = {}
    for k, v in config_status['defines']:
        if v:
            defines[k] = v
    pp.context.update(defines)
    return pp
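# Hypothetical call sketch for make_preprocessor above. The shape of config_status
# (a mapping with an iterable of (key, value) pairs under 'defines') is inferred
# from the loop in the function; the define names are made up for illustration.
config_status = {'defines': [('MOZ_DEBUG', '1'), ('MOZ_OPTIONAL', '')]}
pp = make_preprocessor(config_status)
# Falsy values are skipped, so only MOZ_DEBUG ends up in pp.context.
assert 'MOZ_DEBUG' in pp.context and 'MOZ_OPTIONAL' not in pp.context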
def getXy(self, path):
    raw = self.getRaw(path)
    docs = self.getDocs(raw['boilerplate'])
    if 'label' in raw:
        y = raw['label']
    docs = self.preprocessDocs(docs)
    # docs = self.expandVocab(docs)
    print "vectorizing..."
    X_text = self.vectorizer.transform(docs)
    X_text = self.tsvd.transform(X_text)
    print 'X Sparse Array Size:'
    print X_text.shape
    self.Pre = Preprocessor()
    X_meta, y, urlid = self.Pre.preprocess(raw)
    # X_meta = np.abs(X_meta)
    # X = hstack([X_meta, X_text])
    X = X_text
    d = {'X': X, 'y': y, 'urlid': urlid}
    return d
def _get_preprocessor(self, path, extra):
    '''Returns a preprocessor for use by create_config_file and create_makefile.
    '''
    pp = Preprocessor()
    pp.context.update(self.substs)
    pp.context.update(top_srcdir=self.get_top_srcdir(path))
    pp.context.update(srcdir=self.get_file_srcdir(path))
    pp.context.update(relativesrcdir=self.get_relative_srcdir(path))
    pp.context.update(DEPTH=self.get_depth(path))
    if extra:
        pp.context.update(extra)
    pp.do_filter('attemptSubstitution')
    pp.setMarker(None)
    pp.out = FileAvoidWrite(path)
    return pp
class JarMaker(object): '''JarMaker reads jar.mn files and process those into jar files or flat directories, along with chrome.manifest files. ''' ignore = re.compile('\s*(\#.*)?$') jarline = re.compile('(?:(?P<jarfile>[\w\d.\-\_\\\/]+).jar\:)|(?:\s*(\#.*)?)\s*$') regline = re.compile('\%\s+(.*)$') entryre = '(?P<optPreprocess>\*)?(?P<optOverwrite>\+?)\s+' entryline = re.compile(entryre + '(?P<output>[\w\d.\-\_\\\/\+]+)\s*(\((?P<locale>\%?)(?P<source>[\w\d.\-\_\\\/]+)\))?\s*$') def __init__(self, outputFormat = 'flat', useJarfileManifest = True, useChromeManifest = False): self.outputFormat = outputFormat self.useJarfileManifest = useJarfileManifest self.useChromeManifest = useChromeManifest self.pp = Preprocessor() def getCommandLineParser(self): '''Get a optparse.OptionParser for jarmaker. This OptionParser has the options for jarmaker as well as the options for the inner PreProcessor. ''' # HACK, we need to unescape the string variables we get, # the perl versions didn't grok strings right p = self.pp.getCommandLineParser(unescapeDefines = True) p.add_option('-f', type="choice", default="jar", choices=('jar', 'flat', 'symlink'), help="fileformat used for output", metavar="[jar, flat, symlink]") p.add_option('-v', action="store_true", dest="verbose", help="verbose output") p.add_option('-q', action="store_false", dest="verbose", help="verbose output") p.add_option('-e', action="store_true", help="create chrome.manifest instead of jarfile.manifest") p.add_option('--both-manifests', action="store_true", dest="bothManifests", help="create chrome.manifest and jarfile.manifest") p.add_option('-s', type="string", action="append", default=[], help="source directory") p.add_option('-t', type="string", help="top source directory") p.add_option('-c', '--l10n-src', type="string", action="append", help="localization directory") p.add_option('--l10n-base', type="string", action="append", default=[], help="base directory to be used for localization (multiple)") p.add_option('-j', type="string", help="jarfile directory") # backwards compat, not needed p.add_option('-a', action="store_false", default=True, help="NOT SUPPORTED, turn auto-registration of chrome off (installed-chrome.txt)") p.add_option('-d', type="string", help="UNUSED, chrome directory") p.add_option('-o', help="cross compile for auto-registration, ignored") p.add_option('-l', action="store_true", help="ignored (used to switch off locks)") p.add_option('-x', action="store_true", help="force Unix") p.add_option('-z', help="backwards compat, ignored") p.add_option('-p', help="backwards compat, ignored") return p def processIncludes(self, includes): '''Process given includes with the inner PreProcessor. Only use this for #defines, the includes shouldn't generate content. ''' self.pp.out = StringIO() for inc in includes: self.pp.do_include(inc) includesvalue = self.pp.out.getvalue() if includesvalue: logging.info("WARNING: Includes produce non-empty output") self.pp.out = None pass def finalizeJar(self, jarPath, chromebasepath, register, doZip=True): '''Helper method to write out the chrome registration entries to jarfile.manifest or chrome.manifest, or both. The actual file processing is done in updateManifest. 
''' # rewrite the manifest, if entries given if not register: return chromeManifest = os.path.join(os.path.dirname(jarPath), '..', 'chrome.manifest') if self.useJarfileManifest: self.updateManifest(jarPath + '.manifest', chromebasepath % '', register) addEntriesToListFile(chromeManifest, ['manifest chrome/%s.manifest' % (os.path.basename(jarPath),)]) if self.useChromeManifest: self.updateManifest(chromeManifest, chromebasepath % 'chrome/', register) def updateManifest(self, manifestPath, chromebasepath, register): '''updateManifest replaces the % in the chrome registration entries with the given chrome base path, and updates the given manifest file. ''' lock = lockFile(manifestPath + '.lck') try: myregister = dict.fromkeys(map(lambda s: s.replace('%', chromebasepath), register.iterkeys())) manifestExists = os.path.isfile(manifestPath) mode = (manifestExists and 'r+b') or 'wb' mf = open(manifestPath, mode) if manifestExists: # import previous content into hash, ignoring empty ones and comments imf = re.compile('(#.*)?$') for l in re.split('[\r\n]+', mf.read()): if imf.match(l): continue myregister[l] = None mf.seek(0) for k in myregister.iterkeys(): mf.write(k + os.linesep) mf.close() finally: lock = None def makeJar(self, infile=None, jardir='', sourcedirs=[], topsourcedir='', localedirs=None): '''makeJar is the main entry point to JarMaker. It takes the input file, the output directory, the source dirs and the top source dir as argument, and optionally the l10n dirs. ''' if isinstance(infile, basestring): logging.info("processing " + infile) pp = self.pp.clone() pp.out = StringIO() pp.do_include(infile) lines = pushback_iter(pp.out.getvalue().splitlines()) try: while True: l = lines.next() m = self.jarline.match(l) if not m: raise RuntimeError(l) if m.group('jarfile') is None: # comment continue self.processJarSection(m.group('jarfile'), lines, jardir, sourcedirs, topsourcedir, localedirs) except StopIteration: # we read the file pass return def makeJars(self, infiles, l10nbases, jardir='', sourcedirs=[], topsourcedir='', localedirs=None): '''makeJars is the second main entry point to JarMaker. It takes an iterable sequence of input file names, the l10nbases, the output directory, the source dirs and the top source dir as argument, and optionally the l10n dirs. It iterates over all inputs, guesses srcdir and l10ndir from the path and topsourcedir and calls into makeJar. The l10ndirs are created by guessing the relativesrcdir, and resolving that against the l10nbases. l10nbases can either be path strings, or callables. In the latter case, that will be called with the relativesrcdir as argument, and is expected to return a path string. This logic is disabled if the jar.mn path is not inside the topsrcdir. 
''' topsourcedir = os.path.normpath(os.path.abspath(topsourcedir)) def resolveL10nBase(relpath): def _resolve(base): if isinstance(base, basestring): return os.path.join(base, relpath) if callable(base): return base(relpath) return base return _resolve for infile in infiles: srcdir = os.path.normpath(os.path.abspath(os.path.dirname(infile))) l10ndir = srcdir if os.path.basename(srcdir) == 'locales': l10ndir = os.path.dirname(l10ndir) l10ndirs = None # srcdir may not be a child of topsourcedir, in which case # we assume that the caller passed in suitable sourcedirs, # and just skip passing in localedirs if srcdir.startswith(topsourcedir): rell10ndir = l10ndir[len(topsourcedir):].lstrip(os.sep) l10ndirs = map(resolveL10nBase(rell10ndir), l10nbases) if localedirs is not None: l10ndirs += [os.path.normpath(os.path.abspath(s)) for s in localedirs] srcdirs = [os.path.normpath(os.path.abspath(s)) for s in sourcedirs] + [srcdir] self.makeJar(infile=infile, sourcedirs=srcdirs, topsourcedir=topsourcedir, localedirs=l10ndirs, jardir=jardir) def processJarSection(self, jarfile, lines, jardir, sourcedirs, topsourcedir, localedirs): '''Internal method called by makeJar to actually process a section of a jar.mn file. jarfile is the basename of the jarfile or the directory name for flat output, lines is a pushback_iterator of the lines of jar.mn, the remaining options are carried over from makeJar. ''' # chromebasepath is used for chrome registration manifests # %s is getting replaced with chrome/ for chrome.manifest, and with # an empty string for jarfile.manifest chromebasepath = '%s' + os.path.basename(jarfile) if self.outputFormat == 'jar': chromebasepath = 'jar:' + chromebasepath + '.jar!' chromebasepath += '/' jarfile = os.path.join(jardir, jarfile) jf = None if self.outputFormat == 'jar': #jar jarfilepath = jarfile + '.jar' try: os.makedirs(os.path.dirname(jarfilepath)) except OSError, error: if error.errno != errno.EEXIST: raise jf = ZipFile(jarfilepath, 'a', lock = True) outHelper = self.OutputHelper_jar(jf) else:
def setUp(self):
    cur_dir = os.path.dirname(os.path.realpath(__file__))
    self.tpl_dir = os.path.join(cur_dir, '..', 'UnitTests', 'test_templates')
    self.preprocessor = Preprocessor(self.tpl_dir)
def guessChars(self, subset, files, targetLetter, testProgress, progTestLabel, guessView, guessLab, maxSets):
    aktCharNum = 0
    totalChars = len(sum(targetLetter, []))
    if self.chanNum != 64:
        files.sort()
        files = self.createTriplets(files)
    for m in range(len(files)):
        # load and preprocess the signal
        signalLoader = SignalLoader(self.chanNum, files[m])
        prpr = Preprocessor(self.chanNum, subset)
        signal, stimCode, phaseInSequence = signalLoader.loadSignal()
        self.signal = prpr.preprocess(240, 1E-1, 30E0, self.sf, signal, stimCode, phaseInSequence, 1)
        self.stimulusCode = prpr.stimulusCode
        self.phaseInSequence = prpr.phaseInSequence
        if (len(targetLetter) > m):
            self.targetLetters = targetLetter[m]
        else:
            self.targetLetters = []
        print "Processing file:", m, "\n"
        # find the boundaries between characters
        charEnds = self.findCharEnds()
        # split the data into epochs
        em = EpochManager(self.signal, self.stimulusCode, self.phaseInSequence)
        isiList = em.createEpochs()
        hit = 0
        # guess the individual characters
        for i in range(len(charEnds)):
            testProgress["value"] = aktCharNum
            progTestLabel["text"] = ("Hádam znak: {}/{}").format(aktCharNum + 1, totalChars)
            aktCharNum += 1
            hi = charEnds[i]
            if i == 0:
                lo = 0
            else:
                lo = charEnds[i - 1]
            rowColBinList = em.getAveragedEpochs(hi, lo, isiList, maxSets)
            finalDataArray = self.prepairSignalArray(self.sf.grandAveragingFilter(rowColBinList, subset, 1))
            # classify with the trained classifier
            char = self.cl.predictTarget(finalDataArray, self.cl.reduce(self.sf, self, subset))
            if len(self.targetLetters) > i:
                if char == self.targetLetters[i]:
                    hit += 1
                    print "Successfully guessed char:", char, "\n"
                else:
                    print "Guessed char:", char, "\n"
            if i == 0:
                text = "(" + char + ","
            elif i == len(charEnds) - 1:
                text = char + ")"
            else:
                text = char + ","
            guessView.configure(state='normal')
            guessView.insert(INSERT, text)
            guessView.configure(state='disabled')
        self.rate += (hit) * 100 / float(totalChars)
        print "\n Success rate= ", self.rate, "\n"
    guessLab["text"] = ("Presnosť: {}").format(self.rate)
    return self.rate
class JarMaker(object): '''JarMaker reads jar.mn files and process those into jar files or flat directories, along with chrome.manifest files. ''' ignore = re.compile('\s*(\#.*)?$') jarline = re.compile('(?:(?P<jarfile>[\w\d.\-\_\\\/]+).jar\:)|(?:\s*(\#.*)?)\s*$') relsrcline = re.compile('relativesrcdir\s+(?P<relativesrcdir>.+?):') regline = re.compile('\%\s+(.*)$') entryre = '(?P<optPreprocess>\*)?(?P<optOverwrite>\+?)\s+' entryline = re.compile(entryre + '(?P<output>[\w\d.\-\_\\\/\+\@]+)\s*(\((?P<locale>\%?)(?P<source>[\w\d.\-\_\\\/\@]+)\))?\s*$') def __init__(self, outputFormat = 'flat', useJarfileManifest = True, useChromeManifest = False): self.outputFormat = outputFormat self.useJarfileManifest = useJarfileManifest self.useChromeManifest = useChromeManifest self.pp = Preprocessor() self.topsourcedir = None self.sourcedirs = [] self.localedirs = None self.l10nbase = None self.l10nmerge = None self.relativesrcdir = None self.rootManifestAppId = None def getCommandLineParser(self): '''Get a optparse.OptionParser for jarmaker. This OptionParser has the options for jarmaker as well as the options for the inner PreProcessor. ''' # HACK, we need to unescape the string variables we get, # the perl versions didn't grok strings right p = self.pp.getCommandLineParser(unescapeDefines = True) p.add_option('-f', type="choice", default="jar", choices=('jar', 'flat', 'symlink'), help="fileformat used for output", metavar="[jar, flat, symlink]") p.add_option('-v', action="store_true", dest="verbose", help="verbose output") p.add_option('-q', action="store_false", dest="verbose", help="verbose output") p.add_option('-e', action="store_true", help="create chrome.manifest instead of jarfile.manifest") p.add_option('--both-manifests', action="store_true", dest="bothManifests", help="create chrome.manifest and jarfile.manifest") p.add_option('-s', type="string", action="append", default=[], help="source directory") p.add_option('-t', type="string", help="top source directory") p.add_option('-c', '--l10n-src', type="string", action="append", help="localization directory") p.add_option('--l10n-base', type="string", action="store", help="base directory to be used for localization (requires relativesrcdir)") p.add_option('--locale-mergedir', type="string", action="store", help="base directory to be used for l10n-merge (requires l10n-base and relativesrcdir)") p.add_option('--relativesrcdir', type="string", help="relativesrcdir to be used for localization") p.add_option('-j', type="string", help="jarfile directory") p.add_option('--root-manifest-entry-appid', type="string", help="add an app id specific root chrome manifest entry.") return p def processIncludes(self, includes): '''Process given includes with the inner PreProcessor. Only use this for #defines, the includes shouldn't generate content. ''' self.pp.out = StringIO() for inc in includes: self.pp.do_include(inc) includesvalue = self.pp.out.getvalue() if includesvalue: logging.info("WARNING: Includes produce non-empty output") self.pp.out = None pass def finalizeJar(self, jarPath, chromebasepath, register, doZip=True): '''Helper method to write out the chrome registration entries to jarfile.manifest or chrome.manifest, or both. The actual file processing is done in updateManifest. 
''' # rewrite the manifest, if entries given if not register: return chromeManifest = os.path.join(os.path.dirname(jarPath), '..', 'chrome.manifest') if self.useJarfileManifest: self.updateManifest(jarPath + '.manifest', chromebasepath.format(''), register) addEntriesToListFile(chromeManifest, ['manifest chrome/{0}.manifest' .format(os.path.basename(jarPath))]) if self.useChromeManifest: self.updateManifest(chromeManifest, chromebasepath.format('chrome/'), register) # If requested, add a root chrome manifest entry (assumed to be in the parent directory # of chromeManifest) with the application specific id. In cases where we're building # lang packs, the root manifest must know about application sub directories. if self.rootManifestAppId: rootChromeManifest = os.path.join(os.path.normpath(os.path.dirname(chromeManifest)), '..', 'chrome.manifest') rootChromeManifest = os.path.normpath(rootChromeManifest) chromeDir = os.path.basename(os.path.dirname(os.path.normpath(chromeManifest))) logging.info("adding '%s' entry to root chrome manifest appid=%s" % (chromeDir, self.rootManifestAppId)) addEntriesToListFile(rootChromeManifest, ['manifest %s/chrome.manifest application=%s' % (chromeDir, self.rootManifestAppId)]) def updateManifest(self, manifestPath, chromebasepath, register): '''updateManifest replaces the % in the chrome registration entries with the given chrome base path, and updates the given manifest file. ''' lock = lockFile(manifestPath + '.lck') try: myregister = dict.fromkeys(map(lambda s: s.replace('%', chromebasepath), register.iterkeys())) manifestExists = os.path.isfile(manifestPath) mode = (manifestExists and 'r+b') or 'wb' mf = open(manifestPath, mode) if manifestExists: # import previous content into hash, ignoring empty ones and comments imf = re.compile('(#.*)?$') for l in re.split('[\r\n]+', mf.read()): if imf.match(l): continue myregister[l] = None mf.seek(0) for k in myregister.iterkeys(): mf.write(k + os.linesep) mf.close() finally: lock = None def makeJar(self, infile, jardir): '''makeJar is the main entry point to JarMaker. It takes the input file, the output directory, the source dirs and the top source dir as argument, and optionally the l10n dirs. 
''' # making paths absolute, guess srcdir if file and add to sourcedirs _normpath = lambda p: os.path.normpath(os.path.abspath(p)) self.topsourcedir = _normpath(self.topsourcedir) self.sourcedirs = [_normpath(p) for p in self.sourcedirs] if self.localedirs: self.localedirs = [_normpath(p) for p in self.localedirs] elif self.relativesrcdir: self.localedirs = self.generateLocaleDirs(self.relativesrcdir) if isinstance(infile, basestring): logging.info("processing " + infile) self.sourcedirs.append(_normpath(os.path.dirname(infile))) pp = self.pp.clone() pp.out = StringIO() pp.do_include(infile) lines = pushback_iter(pp.out.getvalue().splitlines()) try: while True: l = lines.next() m = self.jarline.match(l) if not m: raise RuntimeError(l) if m.group('jarfile') is None: # comment continue self.processJarSection(m.group('jarfile'), lines, jardir) except StopIteration: # we read the file pass return def generateLocaleDirs(self, relativesrcdir): if os.path.basename(relativesrcdir) == 'locales': # strip locales l10nrelsrcdir = os.path.dirname(relativesrcdir) else: l10nrelsrcdir = relativesrcdir locdirs = [] # generate locales dirs, merge, l10nbase, en-US if self.l10nmerge: locdirs.append(os.path.join(self.l10nmerge, l10nrelsrcdir)) if self.l10nbase: locdirs.append(os.path.join(self.l10nbase, l10nrelsrcdir)) if self.l10nmerge or not self.l10nbase: # add en-US if we merge, or if it's not l10n locdirs.append(os.path.join(self.topsourcedir, relativesrcdir, 'en-US')) return locdirs def processJarSection(self, jarfile, lines, jardir): '''Internal method called by makeJar to actually process a section of a jar.mn file. jarfile is the basename of the jarfile or the directory name for flat output, lines is a pushback_iterator of the lines of jar.mn, the remaining options are carried over from makeJar. ''' # chromebasepath is used for chrome registration manifests # {0} is getting replaced with chrome/ for chrome.manifest, and with # an empty string for jarfile.manifest chromebasepath = '{0}' + os.path.basename(jarfile) if self.outputFormat == 'jar': chromebasepath = 'jar:' + chromebasepath + '.jar!' 
chromebasepath += '/' jarfile = os.path.join(jardir, jarfile) jf = None if self.outputFormat == 'jar': #jar jarfilepath = jarfile + '.jar' try: os.makedirs(os.path.dirname(jarfilepath)) except OSError as error: if error.errno != errno.EEXIST: raise jf = ZipFile(jarfilepath, 'a', lock = True) outHelper = self.OutputHelper_jar(jf) else: outHelper = getattr(self, 'OutputHelper_' + self.outputFormat)(jarfile) register = {} # This loop exits on either # - the end of the jar.mn file # - an line in the jar.mn file that's not part of a jar section # - on an exception raised, close the jf in that case in a finally try: while True: try: l = lines.next() except StopIteration: # we're done with this jar.mn, and this jar section self.finalizeJar(jarfile, chromebasepath, register) if jf is not None: jf.close() # reraise the StopIteration for makeJar raise if self.ignore.match(l): continue m = self.relsrcline.match(l) if m: relativesrcdir = m.group('relativesrcdir') self.localedirs = self.generateLocaleDirs(relativesrcdir) continue m = self.regline.match(l) if m: rline = m.group(1) register[rline] = 1 continue m = self.entryline.match(l) if not m: # neither an entry line nor chrome reg, this jar section is done self.finalizeJar(jarfile, chromebasepath, register) if jf is not None: jf.close() lines.pushback(l) return self._processEntryLine(m, outHelper, jf) finally: if jf is not None: jf.close() return def _processEntryLine(self, m, outHelper, jf): out = m.group('output') src = m.group('source') or os.path.basename(out) # pick the right sourcedir -- l10n, topsrc or src if m.group('locale'): src_base = self.localedirs elif src.startswith('/'): # path/in/jar/file_name.xul (/path/in/sourcetree/file_name.xul) # refers to a path relative to topsourcedir, use that as base # and strip the leading '/' src_base = [self.topsourcedir] src = src[1:] else: # use srcdirs and the objdir (current working dir) for relative paths src_base = self.sourcedirs + [os.getcwd()] # check if the source file exists realsrc = None for _srcdir in src_base: if os.path.isfile(os.path.join(_srcdir, src)): realsrc = os.path.join(_srcdir, src) break if realsrc is None: if jf is not None: jf.close() raise RuntimeError('File "{0}" not found in {1}' .format(src, ', '.join(src_base))) if m.group('optPreprocess'): outf = outHelper.getOutput(out) inf = open(realsrc) pp = self.pp.clone() if src[-4:] == '.css': pp.setMarker('%') pp.out = outf pp.do_include(inf) pp.warnUnused(realsrc) outf.close() inf.close() return # copy or symlink if newer or overwrite if (m.group('optOverwrite') or (getModTime(realsrc) > outHelper.getDestModTime(m.group('output')))): if self.outputFormat == 'symlink': outHelper.symlink(realsrc, out) return outf = outHelper.getOutput(out) # open in binary mode, this can be images etc inf = open(realsrc, 'rb') outf.write(inf.read()) outf.close() inf.close() class OutputHelper_jar(object): '''Provide getDestModTime and getOutput for a given jarfile. ''' def __init__(self, jarfile): self.jarfile = jarfile def getDestModTime(self, aPath): try : info = self.jarfile.getinfo(aPath) return info.date_time except: return 0 def getOutput(self, name): return ZipEntry(name, self.jarfile) class OutputHelper_flat(object): '''Provide getDestModTime and getOutput for a given flat output directory. The helper method ensureDirFor is used by the symlink subclass. 
''' def __init__(self, basepath): self.basepath = basepath def getDestModTime(self, aPath): return getModTime(os.path.join(self.basepath, aPath)) def getOutput(self, name): out = self.ensureDirFor(name) # remove previous link or file try: os.remove(out) except OSError as e: if e.errno != errno.ENOENT: raise return open(out, 'wb') def ensureDirFor(self, name): out = os.path.join(self.basepath, name) outdir = os.path.dirname(out) if not os.path.isdir(outdir): try: os.makedirs(outdir) except OSError as error: if error.errno != errno.EEXIST: raise return out class OutputHelper_symlink(OutputHelper_flat): '''Subclass of OutputHelper_flat that provides a helper for creating a symlink including creating the parent directories. ''' def symlink(self, src, dest): out = self.ensureDirFor(dest) # remove previous link or file try: os.remove(out) except OSError as e: if e.errno != errno.ENOENT: raise if sys.platform != "win32": os.symlink(src, out) else: # On Win32, use ctypes to create a hardlink rv = CreateHardLink(out, src, None) if rv == 0: raise WinError()
class TopicModelHarness: def __init__(self, getTitle, getBody, getUrl): self.getTitle = getTitle self.getBody = getBody self.getUrl = getUrl def getRaw(self, path): raw = read_csv(path, sep='\t', na_values=['?']).fillna(-5) return raw def getColumns(self, raw): boilerplate = raw['boilerplate'] urlid = raw['urlid'] if 'label' in raw: labels = raw['label'] return boilerplate, labels, urlid return boilerplate, urlid def getDocs(self, boilerplate): docs = [] for row in boilerplate: rowObject = json.loads(row) doc = '' if 'title' in rowObject and rowObject['title'] and self.getTitle: doc += rowObject['title'] if 'body' in rowObject and rowObject['body'] and self.getBody: doc += ' ' + rowObject['body'] if 'url' in rowObject and rowObject['url'] and self.getUrl: doc += ' ' + rowObject['url'] docs.append(doc) return docs def tag(self, str, tag): strList = str.split(' ') newstr = '' for s in strList: if s.lower() not in ENGLISH_STOP_WORDS: newstr += tag + '__' + s + ' ' + s + ' ' return newstr def preprocessDocs(self, docs): preprocessed_docs = [] for doc in docs: punctuation = [',','.', ';', '!', '?', ':'] for p in punctuation: doc = doc.replace(p, ' ' + p + ' ') doc = doc.lower() preprocessed_docs.append(doc) return preprocessed_docs def expandVocab(self, docs): print 'expanding vocabulary...' freqCounts = self.countTokens(docs) tokenList = [] freqCountList = [] for token in freqCounts: tokenList.append(token) freqCountList.append(freqCounts[token]) expTokenDf = DataFrame({'tokens': tokenList, 'freqCounts': freqCountList}) expTokenDf = expTokenDf.sort('freqCounts', ascending=False) expandableTokensFiltered = set(expTokenDf['tokens'][2000:3000]).difference(ENGLISH_STOP_WORDS) batchSize = 10000 print "%d filtered tokens chosen" % len(expandableTokensFiltered) print "Expandable tokens: " print expandableTokensFiltered newDocs = [] for i in xrange(0,len(docs)): doc = docs[i] newDocSplit = doc.split() tokenList = doc.split(' ') start = 0 newTokens = set() while start < len(tokenList): stop = start + batchSize tokens = set(tokenList[start:stop]) start = start + batchSize/2 tokensToExpand = tokens.intersection(expandableTokensFiltered) newTokens = newTokens.union(self.expandVocabFromSet(tokensToExpand)) newDocSplit.extend(list(newTokens)) newDoc = '' for token in newDocSplit: newDoc += ' ' + token + ' ' newDocs.append(newDoc) if i % 500 == 0: print '\nprocessed %d docs' % i print '%d new tokens added to document' % len(newTokens) print 'new tokens:' print newTokens print len(tokens) return newDocs def expandVocabFromSet(self, tokensToExpand): expanded = set() for token1 in tokensToExpand: for token2 in tokensToExpand: if token1 != token2: hash = self.getTwoTokenHash(token1, token2) if hash not in expanded: expanded.add(hash) return expanded def getTwoTokenHash(self, token1, token2): l = [token1, token2] l.sort() hash = l[0] + '___' + l[1] return hash def replaceRareWords(self, docs, rareWords): processed_docs = [] for doc in docs: punctuation = [',','.', ';', '!', '?', ':'] for p in punctuation: doc = doc.replace(p, ' ' + p + ' ') docList = [(self.classifyRareWord(d) if (d in rareWords) else d) for d in doc.split(' ')] doc = '' for d in docList: doc += ' %s ' % d processed_docs.append(doc) return processed_docs def removeStopWords(self, docs, stopWords): processed_docs = [] for doc in docs: punctuation = [',','.', ';', '!', '?', ':'] for p in punctuation: doc = doc.replace(p, ' ' + p + ' ') docList = [('' if (d in stopWords) else d) for d in doc.split(' ')] doc = '' for d in docList: doc += ' %s ' % d 
processed_docs.append(doc) return processed_docs def getStopWords(self, freqs, threshold): stopWords = set() for token in freqs: if freqs[token] > threshold: stopWords.add(token) return stopWords def classifyRareWord(self, word): if word.find('-') >= 0: words = word.split('-') out = '' for w in words: out += w + ' ' return out if word.isdigit(): return '__ISDIGIT__' return '__RARE__' def countTokens(self, docs): freqCounts = {} #for doc in docs, yi in y: for i in xrange(0,len(docs)): doc = docs[i] tokenList = doc.split(' ') for token in tokenList: if token in freqCounts: freqCounts[token] += 1 else: freqCounts[token] = 1 return freqCounts def getAmbiguousTokens(self, freqsByClass): ambiguousTokens = set() for token in freqsByClass: if np.abs(freqsByClass[token]) < 0.1: ambiguousTokens.add(token) return ambiguousTokens def getFreqs(self, freqCounts): # get total token count totalTokenCount = 0 for token in freqCounts: totalTokenCount += freqCounts[token] freqs = {} for token in freqCounts: freqs[token] = float(freqCounts[token]) / float(totalTokenCount) return freqs def getRareWords(self, freqCounts): #freqCounts = self.countTokens(docs) rareWords = set() for token in freqCounts: if freqCounts[token] <= 1: rareWords.add(token) return rareWords def vectorize(self, docs, stopWords, fit=False): print "vectorizing..." #vectorizer = HashingVectorizer(stop_words='english', non_negative=True) #vectorizer = HashingVectorizer(stop_words=stopWords, non_negative=True, norm='l2') vectorizedDocs = self.vectorizer.transform(docs) #print vectorizedDocs return vectorizedDocs def standardizeVecs(self, vectorizedDocs): print "standardizing vectors..." s = vectorizedDocs #s_lil = vectorizedDocs.tolil() """ col_sum = s.sum(axis=0) (rows, cols) = s.nonzero() s_normalized = lil_matrix(s.shape, dtype='float64') for i in xrange(0,len(rows)): s_normalized[rows[i], cols[i]] = s[rows[i], cols[i]] / col_sum[0, cols[i]] if i%50000==0: print i return s_normalized.tocsr() """ # standardize #means = s.mean(axis=0) # initialize mean matrix #mean_lil = lil_matrix(s.shape, dtype='float64') (rows, cols) = s.nonzero() #for i in xrange(0, len(rows)): # mean_lil[rows[i], cols[i]] = means[0, cols[i]] #mean_csr = mean_lil.tocsr() #s_zeroMean = (s - mean_csr) #s_stdDev = (s_zeroMean.multiply(s_zeroMean)).mean(axis=0) norm = (s.multiply(s)).sum(axis=0) s_standardized = lil_matrix(s.shape, dtype='float64') #print s_stdDev.shape print s_standardized.shape #print s_zeroMean.shape for i in xrange(0,len(rows)): #s_standardized[rows[i], cols[i]] = s_zeroMean[rows[i], cols[i]] / s_stdDev[0, cols[i]] s_standardized[rows[i], cols[i]] = float(np.abs(s[rows[i], cols[i]])) / np.sqrt(float(norm[0, cols[i]])) if i%50000==0: print i for i in xrange(0,50): print s_standardized[i, 0] return s_standardized.tocsr() def trainFromDataFrame(self, df): print "training from data frame..." boilerplate, y = self.getColumns(df) docs = self.getDocs(boilerplate) #rareWords = self.getRareWords(docs) docs = self.preprocessDocs(docs) X = self.vectorize(docs) self.fit(X, y) def fit(self, X, y): print 'training topic model...' 
#self.model = TopicModel() #self.model = LogisticRegression(penalty='l2', dual=True, C=.8) #self.model.fit(X,y) #params = {'C': linspace(.3, .8, 1), 'numFeatures': linspace(1000, 15000, 5)} #params = {'C': linspace(.5, 1, 2), 'numFeatures': linspace(25000, 75000, 2)} #print params #params = {'C': logspace(-1,4,10), 'gamma':logspace(0,0,1)} params = {'C': linspace(.2,2,10)} clf = TopicModel() self.model = GridSearchCV(clf, param_grid=params, scoring='roc_auc', cv=5, verbose=2, n_jobs=4) self.model.fit(X, y) try: print 'Best Params:' print self.model.best_params_ print 'Best Score: ' print self.model.best_score_ print self.model.grid_scores_ except: pass self.model = self.model.best_estimator_ def addTopicModel(self, boilerplate): docs = self.getDocs(boilerplate) X_extracted = self.vectorize(docs) #y_predicted = self.predict(X_extracted) y_predicted = self.binResults(self.predict(X_extracted), .05) return y_predicted def addTotalWordCounts(self, boilerplate): docs = self.getDocs(boilerplate) wordCounts = [(len(doc.split(' '))) for doc in docs] return wordCounts def predict(self, X): return self.model.predict(X) #return self.model.predict_proba(X)[:,1] def addAlchemyCategories(self, docs, alchemyCategory): for i in xrange(0, len(docs)): docs[i] = docs[i] + ' __' + str(alchemyCategory[i]) + ' ' return docs def trainVectorizer(self): print "Training vectorizer..." raw = self.getRaw('train.tsv') rawTest = self.getRaw('test.tsv') boilerplate = list(raw['boilerplate']) boilerplate.extend(list(rawTest['boilerplate'])) docs = self.getDocs(boilerplate) docs = self.preprocessDocs(docs) #docs = self.expandVocab(docs) print 'all docs length: %d' % len(docs) self.vectorizer =TfidfVectorizer(min_df=3, max_features=None, strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',ngram_range=(1, 2), use_idf=1,smooth_idf=1, sublinear_tf=1) self.vectorizer.fit(docs) print "vectorizing..." X = self.vectorizer.transform(docs) print "finding principal components..." self.tsvd = TruncatedSVD(n_components = 500) self.tsvd.fit(X) def getXy(self, path): raw = self.getRaw(path) docs = self.getDocs(raw['boilerplate']) if 'label' in raw: y = raw['label'] docs = self.preprocessDocs(docs) #docs = self.expandVocab(docs) print "vectorizing..." X_text = self.vectorizer.transform(docs) X_text = self.tsvd.transform(X_text) print 'X Sparse Array Size:' print X_text.shape self.Pre = Preprocessor() X_meta, y, urlid = self.Pre.preprocess(raw) #X_meta = np.abs(X_meta) #X = hstack([X_meta,X_text]) X = X_text d = {'X': X, 'y':y, 'urlid': urlid} return d def runModel(self, testSize, debug): self.trainVectorizer() d = self.getXy('train.tsv') if debug: X_train, X_test, y_train, y_test = train_test_split(d['X'], d['y'], test_size=testSize, random_state=5) else: X_train = d['X'] y_train = d['y'] d_test = self.getXy('test.tsv') X_test = d_test['X'] urlid = d_test['urlid'] self.fit(X_train, y_train) print "20 Fold CV Score: ", np.mean(cross_val_score(self.model, d['X'], d['y'], cv=10, scoring='roc_auc')) y_predicted = self.predict(X_test) if debug: print 'Topic Model AUC Score: %f' % roc_auc_score(y_test, y_predicted) else: Pre = Preprocessor() Pre.generateSubmission('submission_12.csv', urlid, y_predicted) P.figure() P.hist(y_predicted, bins=100) P.show()
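standardizeVecs above normalizes each column of the sparse document-term matrix by its L2 norm with an element-by-element loop; a vectorized sketch of the same computation (assuming SciPy/NumPy, not part of the original script) is shown below. For non-negative TF-IDF input, sklearn.preprocessing.normalize(X, axis=0) should give the same result.

import numpy as np
from scipy.sparse import diags

def l2_normalize_columns(s):
    # |s[i, j]| / sqrt(sum_i s[i, j]**2), done column-wise without the Python loop
    s = abs(s.tocsr().astype(np.float64))
    norms = np.sqrt(np.asarray(s.multiply(s).sum(axis=0)).ravel())
    norms[norms == 0] = 1.0  # leave all-zero columns untouched
    # right-multiplying by a diagonal matrix scales each column
    return s.dot(diags(1.0 / norms))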
def setUp(self): self.pp = Preprocessor() self.pp.out = StringIO()
class TestPreprocessor(unittest.TestCase): """ Unit tests for the Context class """ def setUp(self): self.pp = Preprocessor() self.pp.out = StringIO() def test_conditional_if_0(self): f = NamedIO("conditional_if_0.in", """#if 0 FAIL #else PASS #endif """) self.pp.do_include(f) self.assertEqual(self.pp.out.getvalue(), "PASS\n") def test_string_value(self): f = NamedIO("string_value.in", """#define FOO STRING #if FOO string value is true #else string value is false #endif """) self.pp.do_include(f) self.assertEqual(self.pp.out.getvalue(), "string value is false\n") def test_number_value(self): f = NamedIO("string_value.in", """#define FOO 1 #if FOO number value is true #else number value is false #endif """) self.pp.do_include(f) self.assertEqual(self.pp.out.getvalue(), "number value is true\n") def test_conditional_if_0_elif_1(self): f = NamedIO('conditional_if_0_elif_1.in', '''#if 0 #elif 1 PASS #else FAIL #endif ''') self.pp.do_include(f) self.assertEqual(self.pp.out.getvalue(), "PASS\n") def test_conditional_if_1(self): f = NamedIO('conditional_if_1.in', '''#if 1 PASS #else FAILE #endif ''') self.pp.do_include(f) self.assertEqual(self.pp.out.getvalue(), "PASS\n") def test_conditional_if_1_elif_1_else(self): f = NamedIO('conditional_if_1_elif_1_else.in', '''#if 1 PASS #elif 1 FAIL #else FAIL #endif ''') self.pp.do_include(f) self.assertEqual(self.pp.out.getvalue(), "PASS\n") def test_conditional_if_1_if_1(self): f = NamedIO('conditional_if_1_if_1.in', '''#if 1 #if 1 PASS #else FAIL #endif #else FAIL #endif ''') self.pp.do_include(f) self.assertEqual(self.pp.out.getvalue(), "PASS\n") def test_conditional_not_0(self): f = NamedIO('conditional_not_0.in', '''#if !0 PASS #else FAIL #endif ''') self.pp.do_include(f) self.assertEqual(self.pp.out.getvalue(), "PASS\n") def test_conditional_not_1(self): f = NamedIO('conditional_not_1.in', '''#if !1 FAIL #else PASS #endif ''') self.pp.do_include(f) self.assertEqual(self.pp.out.getvalue(), "PASS\n") def test_conditional_not_emptyval(self): f = NamedIO('conditional_not_emptyval.in', '''#define EMPTYVAL #if !EMPTYVAL FAIL #else PASS #endif #if EMPTYVAL PASS #else FAIL #endif ''') self.pp.do_include(f) self.assertEqual(self.pp.out.getvalue(), "PASS\nPASS\n") def test_conditional_not_nullval(self): f = NamedIO('conditional_not_nullval.in', '''#define NULLVAL 0 #if !NULLVAL PASS #else FAIL #endif ''') self.pp.do_include(f) self.assertEqual(self.pp.out.getvalue(), "PASS\n") def test_expand(self): f = NamedIO('expand.in', '''#define ASVAR AS #expand P__ASVAR__S ''') self.pp.do_include(f) self.assertEqual(self.pp.out.getvalue(), "PASS\n") def test_undef_defined(self): f = NamedIO('undef_defined.in', '''#define BAR #undef BAR BAR ''') self.pp.do_include(f) self.assertEqual(self.pp.out.getvalue(), "BAR\n") def test_undef_undefined(self): f = NamedIO('undef_undefined.in', '''#undef VAR ''') self.pp.do_include(f) self.assertEqual(self.pp.out.getvalue(), "") def test_filter_attemptSubstitution(self): f = NamedIO('filter_attemptSubstitution.in', '''#filter attemptSubstitution P@VAR@ASS #unfilter attemptSubstitution ''') self.pp.do_include(f) self.assertEqual(self.pp.out.getvalue(), "PASS\n") def test_filter_emptyLines(self): f = NamedIO('filter_emptyLines.in', '''lines with a blank line #filter emptyLines lines with no blank lines #unfilter emptyLines yet more lines with blank lines ''') self.pp.do_include(f) self.assertEqual(self.pp.out.getvalue(), '''lines with a blank line lines with no blank lines yet more lines with blank lines ''') def 
test_filter_slashslash(self): f = NamedIO('filter_slashslash.in', '''#filter slashslash PASS//FAIL // FAIL #unfilter slashslash PASS // PASS ''') self.pp.do_include(f) self.assertEqual(self.pp.out.getvalue(), "PASS\nPASS // PASS\n") def test_filter_spaces(self): f = NamedIO('filter_spaces.in', '''#filter spaces You should see two nice ascii tables +-+-+-+ | | | | +-+-+-+ #unfilter spaces +-+---+ | | | +-+---+ ''') self.pp.do_include(f) self.assertEqual(self.pp.out.getvalue(), """You should see two nice ascii tables +-+-+-+ | | | | +-+-+-+ +-+---+ | | | +-+---+ """) def test_filter_substitution(self): f = NamedIO('filter_substitution.in', '''#define VAR ASS #filter substitution P@VAR@ #unfilter substitution ''') self.pp.do_include(f) self.assertEqual(self.pp.out.getvalue(), "PASS\n") def test_error(self): f = NamedIO('error.in', '''#error spit this message out ''') caught_msg = None try: self.pp.do_include(f) except Preprocessor.Error, e: caught_msg = e.args[0][-1] self.assertEqual(caught_msg, 'spit this message out')
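NamedIO is used throughout these tests but not defined in this excerpt; a plausible minimal stand-in, assuming Preprocessor.do_include only needs a readable stream that carries a .name for error reporting:

from StringIO import StringIO

class NamedIO(StringIO):
    '''A StringIO with a name, so do_include can treat it like an open file.'''
    def __init__(self, name, content):
        StringIO.__init__(self, content)
        self.name = name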
def main(): pp = Preprocessor() tdpath = 'dataset/test/test-data-1' pp.process_test_data(tdpath)
def setUp(self): self.pp = Preprocessor() self.pp.out = StringIO() self.tempnam = tempfile.mktemp(dir='.') # os.tempnam was removed in Python 3; tempfile.mktemp gives the same "unused name in this directory" behaviour
class JarMaker(object): '''JarMaker reads jar.mn files and process those into jar files or flat directories, along with chrome.manifest files. ''' ignore = re.compile('\s*(\#.*)?$') jarline = re.compile('(?:(?P<jarfile>[\w\d.\-\_\\\/]+).jar\:)|(?:\s*(\#.*)?)\s*$') relsrcline = re.compile('relativesrcdir\s+(?P<relativesrcdir>.+?):') regline = re.compile('\%\s+(.*)$') entryre = '(?P<optPreprocess>\*)?(?P<optOverwrite>\+?)\s+' entryline = re.compile(entryre + '(?P<output>[\w\d.\-\_\\\/\+\@]+)\s*(\((?P<locale>\%?)(?P<source>[\w\d.\-\_\\\/\@]+)\))?\s*$') def __init__(self, outputFormat = 'flat', useJarfileManifest = True, useChromeManifest = False): self.outputFormat = outputFormat self.useJarfileManifest = useJarfileManifest self.useChromeManifest = useChromeManifest self.pp = Preprocessor() self.topsourcedir = None self.sourcedirs = [] self.localedirs = None self.l10nbase = None self.l10nmerge = None self.relativesrcdir = None self.rootManifestAppId = None def getCommandLineParser(self): '''Get a optparse.OptionParser for jarmaker. This OptionParser has the options for jarmaker as well as the options for the inner PreProcessor. ''' # HACK, we need to unescape the string variables we get, # the perl versions didn't grok strings right p = self.pp.getCommandLineParser(unescapeDefines = True) p.add_option('-f', type="choice", default="jar", choices=('jar', 'flat', 'symlink'), help="fileformat used for output", metavar="[jar, flat, symlink]") p.add_option('-v', action="store_true", dest="verbose", help="verbose output") p.add_option('-q', action="store_false", dest="verbose", help="verbose output") p.add_option('-e', action="store_true", help="create chrome.manifest instead of jarfile.manifest") p.add_option('--both-manifests', action="store_true", dest="bothManifests", help="create chrome.manifest and jarfile.manifest") p.add_option('-s', type="string", action="append", default=[], help="source directory") p.add_option('-t', type="string", help="top source directory") p.add_option('-c', '--l10n-src', type="string", action="append", help="localization directory") p.add_option('--l10n-base', type="string", action="store", help="base directory to be used for localization (requires relativesrcdir)") p.add_option('--locale-mergedir', type="string", action="store", help="base directory to be used for l10n-merge (requires l10n-base and relativesrcdir)") p.add_option('--relativesrcdir', type="string", help="relativesrcdir to be used for localization") p.add_option('-j', type="string", help="jarfile directory") p.add_option('--root-manifest-entry-appid', type="string", help="add an app id specific root chrome manifest entry.") return p def processIncludes(self, includes): '''Process given includes with the inner PreProcessor. Only use this for #defines, the includes shouldn't generate content. ''' self.pp.out = StringIO() for inc in includes: self.pp.do_include(inc) includesvalue = self.pp.out.getvalue() if includesvalue: logging.info("WARNING: Includes produce non-empty output") self.pp.out = None pass def finalizeJar(self, jarPath, chromebasepath, register, doZip=True): '''Helper method to write out the chrome registration entries to jarfile.manifest or chrome.manifest, or both. The actual file processing is done in updateManifest. 
''' # rewrite the manifest, if entries given if not register: return chromeManifest = os.path.join(os.path.dirname(jarPath), '..', 'chrome.manifest') if self.useJarfileManifest: self.updateManifest(jarPath + '.manifest', chromebasepath % '', register) addEntriesToListFile(chromeManifest, ['manifest chrome/%s.manifest' % (os.path.basename(jarPath),)]) if self.useChromeManifest: self.updateManifest(chromeManifest, chromebasepath % 'chrome/', register) # If requested, add a root chrome manifest entry (assumed to be in the parent directory # of chromeManifest) with the application specific id. In cases where we're building # lang packs, the root manifest must know about application sub directories. if self.rootManifestAppId: rootChromeManifest = os.path.join(os.path.normpath(os.path.dirname(chromeManifest)), '..', 'chrome.manifest') rootChromeManifest = os.path.normpath(rootChromeManifest) chromeDir = os.path.basename(os.path.dirname(os.path.normpath(chromeManifest))) logging.info("adding '%s' entry to root chrome manifest appid=%s" % (chromeDir, self.rootManifestAppId)) addEntriesToListFile(rootChromeManifest, ['manifest %s/chrome.manifest application=%s' % (chromeDir, self.rootManifestAppId)]) def updateManifest(self, manifestPath, chromebasepath, register): '''updateManifest replaces the % in the chrome registration entries with the given chrome base path, and updates the given manifest file. ''' lock = lockFile(manifestPath + '.lck') try: myregister = dict.fromkeys(map(lambda s: s.replace('%', chromebasepath), register.iterkeys())) manifestExists = os.path.isfile(manifestPath) mode = (manifestExists and 'r+b') or 'wb' mf = open(manifestPath, mode) if manifestExists: # import previous content into hash, ignoring empty ones and comments imf = re.compile('(#.*)?$') for l in re.split('[\r\n]+', mf.read()): if imf.match(l): continue myregister[l] = None mf.seek(0) for k in myregister.iterkeys(): mf.write(k + os.linesep) mf.close() finally: lock = None def makeJar(self, infile, jardir): '''makeJar is the main entry point to JarMaker. It takes the input file, the output directory, the source dirs and the top source dir as argument, and optionally the l10n dirs. 
''' # making paths absolute, guess srcdir if file and add to sourcedirs _normpath = lambda p: os.path.normpath(os.path.abspath(p)) self.topsourcedir = _normpath(self.topsourcedir) self.sourcedirs = [_normpath(p) for p in self.sourcedirs] if self.localedirs: self.localedirs = [_normpath(p) for p in self.localedirs] elif self.relativesrcdir: self.localedirs = self.generateLocaleDirs(self.relativesrcdir) if isinstance(infile, basestring): logging.info("processing " + infile) self.sourcedirs.append(_normpath(os.path.dirname(infile))) pp = self.pp.clone() pp.out = StringIO() pp.do_include(infile) lines = pushback_iter(pp.out.getvalue().splitlines()) try: while True: l = lines.next() m = self.jarline.match(l) if not m: raise RuntimeError(l) if m.group('jarfile') is None: # comment continue self.processJarSection(m.group('jarfile'), lines, jardir) except StopIteration: # we read the file pass return def generateLocaleDirs(self, relativesrcdir): if os.path.basename(relativesrcdir) == 'locales': # strip locales l10nrelsrcdir = os.path.dirname(relativesrcdir) else: l10nrelsrcdir = relativesrcdir locdirs = [] # generate locales dirs, merge, l10nbase, en-US if self.l10nmerge: locdirs.append(os.path.join(self.l10nmerge, l10nrelsrcdir)) if self.l10nbase: locdirs.append(os.path.join(self.l10nbase, l10nrelsrcdir)) if self.l10nmerge or not self.l10nbase: # add en-US if we merge, or if it's not l10n locdirs.append(os.path.join(self.topsourcedir, relativesrcdir, 'en-US')) return locdirs def processJarSection(self, jarfile, lines, jardir): '''Internal method called by makeJar to actually process a section of a jar.mn file. jarfile is the basename of the jarfile or the directory name for flat output, lines is a pushback_iterator of the lines of jar.mn, the remaining options are carried over from makeJar. ''' # chromebasepath is used for chrome registration manifests # %s is getting replaced with chrome/ for chrome.manifest, and with # an empty string for jarfile.manifest chromebasepath = '%s' + os.path.basename(jarfile) if self.outputFormat == 'jar': chromebasepath = 'jar:' + chromebasepath + '.jar!' chromebasepath += '/' jarfile = os.path.join(jardir, jarfile) jf = None if self.outputFormat == 'jar': #jar jarfilepath = jarfile + '.jar' try: os.makedirs(os.path.dirname(jarfilepath)) except OSError, error: if error.errno != errno.EEXIST: raise jf = ZipFile(jarfilepath, 'a', lock = True) outHelper = self.OutputHelper_jar(jf) else:
def run(self): preprocessor = Preprocessor() preprocessor.load_encoder() classifier = Classifier(preprocessor) classifier.run()
class JarMaker(object): '''JarMaker reads jar.mn files and process those into jar files or flat directories, along with chrome.manifest files. ''' ignore = re.compile('\s*(\#.*)?$') jarline = re.compile( '(?:(?P<jarfile>[\w\d.\-\_\\\/]+).jar\:)|(?:\s*(\#.*)?)\s*$') regline = re.compile('\%\s+(.*)$') entryre = '(?P<optPreprocess>\*)?(?P<optOverwrite>\+?)\s+' entryline = re.compile( entryre + '(?P<output>[\w\d.\-\_\\\/\+]+)\s*(\((?P<locale>\%?)(?P<source>[\w\d.\-\_\\\/]+)\))?\s*$' ) def __init__(self, outputFormat='flat', useJarfileManifest=True, useChromeManifest=False): self.outputFormat = outputFormat self.useJarfileManifest = useJarfileManifest self.useChromeManifest = useChromeManifest self.pp = Preprocessor() def getCommandLineParser(self): '''Get a optparse.OptionParser for jarmaker. This OptionParser has the options for jarmaker as well as the options for the inner PreProcessor. ''' # HACK, we need to unescape the string variables we get, # the perl versions didn't grok strings right p = self.pp.getCommandLineParser(unescapeDefines=True) p.add_option( '-f', type="choice", default="jar", choices=('jar', 'flat', 'symlink'), help="fileformat used for output", metavar="[jar, flat, symlink]") p.add_option( '-v', action="store_true", dest="verbose", help="verbose output") p.add_option( '-q', action="store_false", dest="verbose", help="verbose output") p.add_option( '-e', action="store_true", help="create chrome.manifest instead of jarfile.manifest") p.add_option( '--both-manifests', action="store_true", dest="bothManifests", help="create chrome.manifest and jarfile.manifest") p.add_option( '-s', type="string", action="append", default=[], help="source directory") p.add_option('-t', type="string", help="top source directory") p.add_option( '-c', '--l10n-src', type="string", action="append", help="localization directory") p.add_option( '--l10n-base', type="string", action="append", default=[], help="base directory to be used for localization (multiple)") p.add_option('-j', type="string", help="jarfile directory") # backwards compat, not needed p.add_option( '-a', action="store_false", default=True, help= "NOT SUPPORTED, turn auto-registration of chrome off (installed-chrome.txt)" ) p.add_option('-d', type="string", help="UNUSED, chrome directory") p.add_option('-o', help="cross compile for auto-registration, ignored") p.add_option( '-l', action="store_true", help="ignored (used to switch off locks)") p.add_option('-x', action="store_true", help="force Unix") p.add_option('-z', help="backwards compat, ignored") p.add_option('-p', help="backwards compat, ignored") return p def processIncludes(self, includes): '''Process given includes with the inner PreProcessor. Only use this for #defines, the includes shouldn't generate content. ''' self.pp.out = StringIO() for inc in includes: self.pp.do_include(inc) includesvalue = self.pp.out.getvalue() if includesvalue: logging.info("WARNING: Includes produce non-empty output") self.pp.out = None pass def finalizeJar(self, jarPath, chromebasepath, register, doZip=True): '''Helper method to write out the chrome registration entries to jarfile.manifest or chrome.manifest, or both. The actual file processing is done in updateManifest. 
''' # rewrite the manifest, if entries given if not register: return if self.useJarfileManifest: self.updateManifest(jarPath + '.manifest', chromebasepath % '', register) if self.useChromeManifest: manifestPath = os.path.join( os.path.dirname(jarPath), '..', 'chrome.manifest') self.updateManifest(manifestPath, chromebasepath % 'chrome/', register) def updateManifest(self, manifestPath, chromebasepath, register): '''updateManifest replaces the % in the chrome registration entries with the given chrome base path, and updates the given manifest file. ''' myregister = dict.fromkeys( map(lambda s: s.replace('%', chromebasepath), register.iterkeys())) manifestExists = os.path.isfile(manifestPath) mode = (manifestExists and 'r+b') or 'wb' mf = open(manifestPath, mode) if manifestExists: # import previous content into hash, ignoring empty ones and comments imf = re.compile('(#.*)?$') for l in re.split('[\r\n]+', mf.read()): if imf.match(l): continue myregister[l] = None mf.seek(0) for k in myregister.iterkeys(): mf.write(k + os.linesep) mf.close() def makeJar(self, infile=None, jardir='', sourcedirs=[], topsourcedir='', localedirs=None): '''makeJar is the main entry point to JarMaker. It takes the input file, the output directory, the source dirs and the top source dir as argument, and optionally the l10n dirs. ''' if isinstance(infile, basestring): logging.info("processing " + infile) pp = self.pp.clone() pp.out = StringIO() pp.do_include(infile) lines = pushback_iter(pp.out.getvalue().splitlines()) try: while True: l = lines.next() m = self.jarline.match(l) if not m: raise RuntimeError(l) if m.group('jarfile') is None: # comment continue self.processJarSection( m.group('jarfile'), lines, jardir, sourcedirs, topsourcedir, localedirs) except StopIteration: # we read the file pass return def makeJars(self, infiles, l10nbases, jardir='', sourcedirs=[], topsourcedir='', localedirs=None): '''makeJars is the second main entry point to JarMaker. It takes an iterable sequence of input file names, the l10nbases, the output directory, the source dirs and the top source dir as argument, and optionally the l10n dirs. It iterates over all inputs, guesses srcdir and l10ndir from the path and topsourcedir and calls into makeJar. The l10ndirs are created by guessing the relativesrcdir, and resolving that against the l10nbases. l10nbases can either be path strings, or callables. In the latter case, that will be called with the relativesrcdir as argument, and is expected to return a path string. This logic is disabled if the jar.mn path is not inside the topsrcdir. 
''' topsourcedir = os.path.normpath(os.path.abspath(topsourcedir)) def resolveL10nBase(relpath): def _resolve(base): if isinstance(base, basestring): return os.path.join(base, relpath) if callable(base): return base(relpath) return base return _resolve for infile in infiles: srcdir = os.path.normpath(os.path.abspath(os.path.dirname(infile))) l10ndir = srcdir if os.path.basename(srcdir) == 'locales': l10ndir = os.path.dirname(l10ndir) l10ndirs = None # srcdir may not be a child of topsourcedir, in which case # we assume that the caller passed in suitable sourcedirs, # and just skip passing in localedirs if srcdir.startswith(topsourcedir): rell10ndir = l10ndir[len(topsourcedir):].lstrip(os.sep) l10ndirs = map(resolveL10nBase(rell10ndir), l10nbases) if localedirs is not None: l10ndirs += [ os.path.normpath(os.path.abspath(s)) for s in localedirs ] srcdirs = [ os.path.normpath(os.path.abspath(s)) for s in sourcedirs ] + [srcdir] self.makeJar( infile=infile, sourcedirs=srcdirs, topsourcedir=topsourcedir, localedirs=l10ndirs, jardir=jardir) def processJarSection(self, jarfile, lines, jardir, sourcedirs, topsourcedir, localedirs): '''Internal method called by makeJar to actually process a section of a jar.mn file. jarfile is the basename of the jarfile or the directory name for flat output, lines is a pushback_iterator of the lines of jar.mn, the remaining options are carried over from makeJar. ''' # chromebasepath is used for chrome registration manifests # %s is getting replaced with chrome/ for chrome.manifest, and with # an empty string for jarfile.manifest chromebasepath = '%s' + jarfile if self.outputFormat == 'jar': chromebasepath = 'jar:' + chromebasepath + '.jar!' chromebasepath += '/' jarfile = os.path.join(jardir, jarfile) jf = None if self.outputFormat == 'jar': #jar jarfilepath = jarfile + '.jar' try: os.makedirs(os.path.dirname(jarfilepath)) except OSError: pass jf = ZipFile(jarfilepath, 'a', lock=True) outHelper = self.OutputHelper_jar(jf) else: outHelper = getattr(self, 'OutputHelper_' + self.outputFormat)(jarfile) register = {} # This loop exits on either # - the end of the jar.mn file # - an line in the jar.mn file that's not part of a jar section # - on an exception raised, close the jf in that case in a finally try: while True: try: l = lines.next() except StopIteration: # we're done with this jar.mn, and this jar section self.finalizeJar(jarfile, chromebasepath, register) if jf is not None: jf.close() # reraise the StopIteration for makeJar raise if self.ignore.match(l): continue m = self.regline.match(l) if m: rline = m.group(1) register[rline] = 1 continue m = self.entryline.match(l) if not m: # neither an entry line nor chrome reg, this jar section is done self.finalizeJar(jarfile, chromebasepath, register) if jf is not None: jf.close() lines.pushback(l) return self._processEntryLine(m, sourcedirs, topsourcedir, localedirs, outHelper, jf) finally: if jf is not None: jf.close() return def _processEntryLine(self, m, sourcedirs, topsourcedir, localedirs, outHelper, jf): out = m.group('output') src = m.group('source') or os.path.basename(out) # pick the right sourcedir -- l10n, topsrc or src if m.group('locale'): src_base = localedirs elif src.startswith('/'): # path/in/jar/file_name.xul (/path/in/sourcetree/file_name.xul) # refers to a path relative to topsourcedir, use that as base # and strip the leading '/' src_base = [topsourcedir] src = src[1:] else: # use srcdirs and the objdir (current working dir) for relative paths src_base = sourcedirs + ['.'] # check if the source 
file exists realsrc = None for _srcdir in src_base: if os.path.isfile(os.path.join(_srcdir, src)): realsrc = os.path.join(_srcdir, src) break if realsrc is None: if jf is not None: jf.close() raise RuntimeError( 'File "%s" not found in %s' % (src, ', '.join(src_base))) if m.group('optPreprocess'): outf = outHelper.getOutput(out) inf = open(realsrc) pp = self.pp.clone() if src[-4:] == '.css': pp.setMarker('%') pp.out = outf pp.do_include(inf) outf.close() inf.close() return # copy or symlink if newer or overwrite if (m.group('optOverwrite') or (getModTime(realsrc) > outHelper.getDestModTime( m.group('output')))): if self.outputFormat == 'symlink' and hasattr(os, 'symlink'): outHelper.symlink(realsrc, out) return outf = outHelper.getOutput(out) # open in binary mode, this can be images etc inf = open(realsrc, 'rb') outf.write(inf.read()) outf.close() inf.close() class OutputHelper_jar(object): '''Provide getDestModTime and getOutput for a given jarfile. ''' def __init__(self, jarfile): self.jarfile = jarfile def getDestModTime(self, aPath): try: info = self.jarfile.getinfo(aPath) return info.date_time except: return 0 def getOutput(self, name): return ZipEntry(name, self.jarfile) class OutputHelper_flat(object): '''Provide getDestModTime and getOutput for a given flat output directory. The helper method ensureDirFor is used by the symlink subclass. ''' def __init__(self, basepath): self.basepath = basepath def getDestModTime(self, aPath): return getModTime(os.path.join(self.basepath, aPath)) def getOutput(self, name): out = self.ensureDirFor(name) # remove previous link or file try: os.remove(out) except OSError, e: if e.errno != 2: raise return open(out, 'wb') def ensureDirFor(self, name): out = os.path.join(self.basepath, name) outdir = os.path.dirname(out) if not os.path.isdir(outdir): os.makedirs(outdir) return out
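A hedged usage sketch for this older makeJar signature (the paths below are placeholders, not taken from an actual build), followed by a worked note on the '%s' template that finalizeJar and updateManifest substitute:

jm = JarMaker(outputFormat='flat')
jm.makeJar(infile='jar.mn', jardir='dist/bin/chrome',
           sourcedirs=['.'], topsourcedir='/builds/mozilla-central',
           localedirs=None)

# chromebasepath is built as '%s' + jarfile in processJarSection; for a
# 'browser.jar:' section in flat mode that is '%sbrowser/', and updateManifest
# receives it with the placeholder already filled in:
#   '%sbrowser/' % 'chrome/' -> 'chrome/browser/'   (chrome.manifest entries)
#   '%sbrowser/' % ''        -> 'browser/'          (jarfile.manifest entries)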