def __init__(self):
    self.epoch_duration = 60
    self.vk_client = vk_autorization(LOGIN, PASSWORD)
    self.data = DataHandler()
    self.view = View(self, self.data.get_players(), self.data.get_settings())
    self.view.show()
class Main():
    # DataHandler object
    data_handler = None
    # DDoSDetector object
    DDoS_detector = None

    def __init__(self):
        self.data_handler = DataHandler()
        self.DDoS_detector = DDoSDetector()

    # gives the user a menu
    def menu(self):
        print("1) Train Model")
        print("2) Test model on dataset")
        print("3) Run model on live packets")
        print("0) Quit")
        choice = int(input("Choice: "))
        # train model
        if choice == 1:
            print("Chose to train a model")
        # test model on dataset
        elif choice == 2:
            self.test_model()
        # run model on live packets
        elif choice == 3:
            print("Chose to run the model on live packets")
        # quit the program
        elif choice == 0:
            return False
        return True

    # menu choice to train a model
    def train_model(self):
        pass

    # menu choice to test a model
    def test_model(self):
        self.data_handler.print_dataset_list()
        choice = int(input("Dataset choice: "))
        print("Choice: " + str(self.data_handler.get_dataset_path(choice)))
        self.DDoS_detector.test(choice)

    # menu choice to run the model on live packets
    def run_model(self):
        pass
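# Hypothetical entry point (not part of the original snippet): menu() returns
# False only when the user picks 0, so looping on it re-shows the menu until quit.
if __name__ == "__main__":
    main = Main()
    while main.menu():
        pass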
def find_distance(i_):
    path_to_covers = '/home/anosov/data/hard_base/covers/case_%i.dump' % (i_, )
    base, ext = os.path.splitext(path_to_covers)
    path_to_data = base + '__computing' + ext
    h = DataHandler(path_to_covers, path_to_data)
    tmp = h.distance_to_dead_ends
    h.dump()
def test(path, run_id, runs):
    # load parser and data handler
    parser = CYKParser.load(path)
    data_handler = DataHandler(config.test_set, run_id, runs)
    # parse sentences in parallel
    executor = ProcessPoolExecutor(config.processes)
    futures = [
        executor.submit(parse_tree, parser, sent, run_id)
        for sent in data_handler.generator()
    ]
    # the following code tracks progress
    kwargs = {
        'total': len(futures),
        'unit': 'nap',
        'unit_scale': True,
        'leave': True
    }
    for _ in tqdm(as_completed(futures), **kwargs):
        pass
    for future in futures:
        if future.exception() is not None:
            print(future.exception())
    # stitch files if number of runs is 1
    if runs == 1:
        stitch_files()
    print("Done parsing")
def __init__(self):
    # Initialize data loader
    self.data = DataHandler()
    # Initialize model
    self.ada_network = ADDANet()
def __init__(self, master):
    super().__init__(master)
    self.data_handler = DataHandler("data.json")
    self.category_lists = self.data_handler.get_categories_list()
    self.category_data_labels = []
    self.element = 'Sports'
    # delete old labels (use a local loop variable so self.element,
    # the current category name, is not overwritten)
    for label in self.category_data_labels:
        label.configure(text="")
    self.category_data_labels = []
    # add new labels for each news item in the category
    self.list_data_for_category = self.data_handler.get_news_for_category(self.element)
    for i, element in enumerate(self.list_data_for_category):
        self.labelfont = ('times', 15, 'bold')
        self.labelcontent_font = ('times', 10, 'bold', 'italic')
        self.label_heading = Label(self, text=element['headline'])
        self.label_heading.config(font=self.labelfont)
        self.label_heading.grid(column=0, row=3 * i)
        self.label_content = Label(self, text=element['content'])
        self.label_content.config(font=self.labelcontent_font)
        self.label_content.grid(column=0, row=3 * i + 1)
        self.label_content_separator = Label(self, text='---------------------------')
        self.label_content_separator.grid(column=0, row=3 * i + 2)
        self.category_data_labels.append(self.label_heading)
        self.category_data_labels.append(self.label_content)
        self.category_data_labels.append(self.label_content_separator)
    self.pack()
def __init__(self, number_of_epochs=10):
    self.verbose = True
    self.number_of_channels = 2
    self.data_handler = DataHandler(number_of_channels=self.number_of_channels,
                                    number_of_negative_sets=50,
                                    number_of_positive_sets=50,
                                    number_of_test_sets=50,
                                    verbose=self.verbose)
    self.data_handler.load_training_data()
    self.data_handler.load_test_data()
    self.data_handler.preprocess_data()
    self.mini_batch_size = 1
    self.model = CNN(number_of_channels=self.number_of_channels,
                     number_of_filters=12,
                     regularization_coefficient=1e0,
                     learning_rate=0.001,
                     filter_length=12,
                     pool_size=512,
                     fully_connected_layer_neurons=8,
                     momentum=0.9,
                     perform_normalization="no",
                     update_type="adam",
                     pool_mode="average_exc_pad")
    self.number_of_epochs = number_of_epochs
    self.training_errors = []
    self.test_errors = []
    self.classifier = SVC(C=11., kernel="rbf", gamma=1. / (2 * 2.85))
def perform_exp(self):
    list_of_scores = []
    data_handler = DataHandler(self.data, self.var_dict)
    raw_X = data_handler.get_dummy_coded_data('dummy_only')
    n_cat_dummy_var = raw_X.shape[1] - len(self.var_dict['numerical_vars'])
    raw_clf_scores = self._get_classification_score(raw_X)
    list_of_scores.append(('raw', raw_clf_scores, raw_X.shape[1] - n_cat_dummy_var))
    for n_init_bins in self.n_init_bins_list:
        sb_X = self.semantic_binning.fit_transform(self.data, n_init_bins)
        sb_clf_scores = self._get_classification_score(sb_X)
        list_of_scores.append(('sb_{}'.format(n_init_bins), sb_clf_scores,
                               sb_X.shape[1] - n_cat_dummy_var))
    for n_bins in self.n_bins_range:
        ew_X = data_handler.get_dummy_coded_data('equal_width', n_bins)
        ew_clf_scores = self._get_classification_score(ew_X)
        list_of_scores.append(('ew_{}'.format(n_bins), ew_clf_scores,
                               ew_X.shape[1] - n_cat_dummy_var))
        ef_X = data_handler.get_dummy_coded_data('equal_freq', n_bins)
        ef_clf_scores = self._get_classification_score(ef_X)
        list_of_scores.append(('ef_{}'.format(n_bins), ef_clf_scores,
                               ef_X.shape[1] - n_cat_dummy_var))
    self.list_of_scores = list_of_scores
    print('Experiment finished! Results saved in the Exp instance.')
def download_and_classify_in_batches(complete_links_list, classifier):
    print("Total amount of images to be downloaded and classified: %d" % len(complete_links_list))
    for index in range(0, len(complete_links_list), BATCH_SIZE):
        time_start = time.time()
        print("Downloading and classifying batch: %d -> %d" % (index, index + BATCH_SIZE))
        links_batch = complete_links_list[index:index + BATCH_SIZE]
        tensor_images = ImageDownloader.download_images(links_batch, NUM_DOWNLOAD_THREADS)
        if len(tensor_images) == 0:
            print("Skipping classification of empty list")
            continue
        results = classifier.classify_image_tensors(tensor_images)
        results_df = DataHandler.convert_classification_result_to_dataframe(results)
        DataHandler.write_classification_result(results_df, PARQUET_FILE_OUTPUT_LOCATION)
        duration = time.time() - time_start
        print("Duration of downloading and classification for batch: %.2f" % duration)
def runServer(self):
    Logger.writeInfo("Opening socket...")
    sock = socket(AF_INET, SOCK_STREAM)
    sock.bind((self.host, self.port))
    sock.listen(10)
    handler = DataHandler()
    try:
        while True:
            conn, addr = sock.accept()
            Logger.writeInfo("Connected by {}".format(addr))
            try:
                data = Server.receiveAll(conn)
                if data is not None:
                    response = handler.process(data)
                    conn.sendall(response)
                conn.close()
            except Exception as e:
                Logger.writeError(str(e))
            Logger.writeInfo("Disconnected {}".format(addr))
    finally:
        Logger.writeInfo("Socket closed")
        sock.close()
        handler.close()
def get_by_distance():
    distance = request.args.get("dist")
    latitudeN = request.args.get("latN")
    longitudeE = request.args.get("lonE")
    dh = DataHandler()
    return dh.select_schools_by_distance(float(latitudeN), float(longitudeE), float(distance))
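# Hypothetical registration sketch (not part of the original snippet): the
# route path and app object are assumptions. The handler reads dist/latN/lonE
# from the query string, e.g. GET /schools?dist=5.0&latN=60.17&lonE=24.94
app.add_url_rule('/schools', 'get_by_distance', get_by_distance)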
def __init__(self, server_socket, buffer, socket_ip):
    self.names = []
    self.record = {}
    self.buffer = buffer
    self.connected_list = []
    self.socket_ip = socket_ip
    self.data_handler = DataHandler(socket_ip)
    self.server_socket = server_socket
    self.add_connection(server_socket)
def dataLoader(self):
    data_handler = DataHandler()
    npz = data_handler.npzLoader(self.target_file)
    data, label = npz[0], npz[1]
    # scale pixel values to [0, 1]
    data /= 255.0
    return data, label
def __init__(self, file_sys_broadcast_addr, task_ping_addr, current_worker_addr,
             job_url, job_id, function_url, block_urls, task, answer_addr,
             load_byte, status_db_url, block_id):
    self.status_handler = StatusHandler(status_db_url,
                                        self._verify_if_errors_in_fs,
                                        self._reset_method_if_no_answer_from_fs)
    self.data_handler = DataHandler(job_url, job_id,
                                    self._verify_if_errors_in_fs,
                                    self._reset_method_if_no_answer_from_fs)
    self.task_ping_addr = task_ping_addr
    self.file_sys_broadcast_addr = file_sys_broadcast_addr
    self.file_sys_addrs = []
    self.answer_addr = answer_addr
    self._update_filesystem_nodes()
    self.file_sys_addr = self._get_new_filesystem_node()
    self.job_url = job_url
    self.job_id = job_id
    self.function_url = function_url
    self.block_urls = block_urls
    # print("Task_Exc: this is block_urls:", self.block_urls)
    self.task = task
    self.load_byte = load_byte
    self.current_worker_addr = current_worker_addr
    self.status_db_url = status_db_url
    self.block_id = block_id
    self.map_fun, self.red_fun, self.comb = self.get_func()
    self.record_readers = {
        True: self.record_reader_byte,
        False: self.record_reader_str
    }
    self.execute_methods = {
        'map': self.execute_map_task,
        'reduce': self.execute_reduce_task
    }
    self.start_listen_pings()
    # make sure that, if the master went down right after sending me the task
    # message, I mark the block as SUBMITTED myself
    to_update = [('state', mt.slices_states[1]),
                 ('worker_ip', self.task_ping_addr[0]),
                 ('worker_port', self.task_ping_addr[1])]
    print("Task_Exc: saving block {} state as SUBMITTED in the filesys: {}".format(
        block_id, self.file_sys_addr))
    self.status_handler.update_status_row(self.file_sys_addr, 'block',
                                          ('block_id', block_id), to_update)
    self.execute_task = self.execute_methods[self.task]
    self.record_reader = self.record_readers[load_byte]
def load_data(db_name, label="training_0000", n_imgs=None, thresh=1e5,
              step_size=1, db_dir=None):
    """
    Loads rgb images and targets from an hdf5 database and returns them as
    np arrays.

    Expects data to be saved in the following group structure:

        Training Data
        training_0000/data/0000 using %04d to increment data name

        Validation Data
        validation_0000/data/0000 using %04d to increment data name

    Both return an array with the rgb image saved under the 'rgb' key
    and the target saved under the 'target' key.

    Parameters
    ----------
    db_name: string
        name of database to load from
    label: string, Optional (Default: 'training_0000')
        location in database to load from
    n_imgs: int
        how many images to load
    thresh: float, Optional (Default: 1e5)
        only keep entries whose target norm is below this value
    step_size: int, Optional (Default: 1)
        stride between consecutive entries to load
    db_dir: string, Optional (Default: None)
        directory of the database
    """
    # TODO: specify the data format expected in the comment above
    dat = DataHandler(db_dir=db_dir, db_name=db_name)

    # load training images
    images = []
    targets = []
    skip_list = ["datestamp", "timestamp"]
    keys = np.array([
        int(val) for val in dat.get_keys("%s" % label) if val not in skip_list
    ])
    n_imgs = max(keys) if n_imgs is None else n_imgs
    print("Total number of images in dataset: ", max(keys))

    for nn in range(0, n_imgs, step_size):
        data = dat.load(parameters=["rgb", "target"],
                        save_location="%s/%04d" % (label, nn))
        if np.linalg.norm(data["target"]) < thresh:
            images.append(data["rgb"])
            targets.append(data["target"])

    images = np.asarray(images)
    targets = np.asarray(targets)
    print("Total number of images within threshold: ", images.shape[0])
    return images, targets
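# Hypothetical usage sketch (not part of the original snippet): the database
# name, directory, and sizes below are placeholders.
images, targets = load_data("example_db", label="training_0000",
                            n_imgs=100, step_size=2, db_dir="data")
print(images.shape, targets.shape)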
def store_analyzed_reviews(self, hotel_name, aspect_details, platforms):
    for key, value in self.aspect_details.items():
        value.review_list = {}
    to_json = json.dumps(aspect_details, cls=MyEncoder)
    data_handler = DataHandler()
    data_handler.set_analyzed_reviews(hotel_name, to_json, platforms)
    # ra = ReviewAnalyzer()
    # ra.get_analyzed_reviews('Kingsbury', ['ALL'])
def test_get_hist_prices(self):
    """Given a pre-defined set of inputs, the test checks whether the method
    correctly computes the results."""
    data_handler = DataHandler('')
    method_handler = MethodHandler(data_handler, '')
    data = {dt.datetime(2021, 1, 1): 120,
            dt.datetime(2021, 1, 2): 122,
            dt.datetime(2021, 1, 3): 123,
            dt.datetime(2021, 1, 4): 122,
            dt.datetime(2021, 1, 5): 119,
            dt.datetime(2021, 1, 8): 118,
            dt.datetime(2021, 1, 9): 120,
            dt.datetime(2021, 1, 10): 122}
    data_handler.load_db_manually('MSFT', data)
    res_test = {dt.datetime(2021, 1, 3): 123,
                dt.datetime(2021, 1, 4): 122,
                dt.datetime(2021, 1, 5): 119,
                dt.datetime(2021, 1, 8): 118,
                dt.datetime(2021, 1, 9): 120}
    res = method_handler.get_hist_prices('MSFT', dt.datetime(2021, 1, 3),
                                         dt.datetime(2021, 1, 9), '')
    self.assertEqual(res, res_test)
def online_eval():
    # evaluation code for the online test
    handler = DataHandler()
    data = handler.generate_data(TRAIN_FILENAME)
    testing_data = handler.generate_data(TEST_FILENAME, "test")
    ann = ANN(9, 10, 1)
    for i in range(80):
        print(i + 1)
        ann.train(data, 5000)
    result = ann.test_without_true_label(testing_data, 0.23)
    handler.write_to_result(TEST_FILENAME, result)
def get_analyzed_reviews(self, hotel_name, platforms):
    data_handler = DataHandler()
    reviews = data_handler.get_analyzed_reviews(hotel_name, platforms)
    print(hotel_name)
    print(reviews)
    if reviews is None:
        self.analyze_reviews(hotel_name, platforms)
        reviews = data_handler.get_analyzed_reviews(hotel_name, platforms)
        if reviews is None:
            return None
    return reviews[2]
def __init__(self, **kwargs):
    super().__init__(**kwargs)
    self.ui = Ui_MainWindow()
    self.ui.setupUi(self)
    self.ui.graphicsView.setBackground(pg.mkColor(0.3))
    self.plot_box = self.ui.graphicsView.addViewBox(row=1, col=1,
                                                    lockAspect=True,
                                                    enableMouse=True,
                                                    invertY=True)
    self.image_item = pg.ImageItem()
    self.image_item.setOpts(axisOrder='row-major')
    self.plot_box.addItem(self.image_item)
    self.roi = None
    self.ui.selectDataButton.toggled.connect(self.show_roi)
    self.ui.resetSelectDataButton.clicked.connect(self.reset_roi)
    self.settings_layout = QHBoxLayout()
    self.settings_widget = QWidget()
    self.settings_layout.addWidget(self.settings_widget)
    self.ui.camSettingsWidget.setLayout(self.settings_layout)
    self.data_handler = DataHandler()
    for plugin in self.data_handler.plugins:
        self.add_plugin(plugin.get_widget(), plugin.name)
    self.data_handler.ndarray_available.connect(self.show_ndarray)
    self.data_handler.camera_controls_changed.connect(self.set_camera_controls)
    self.ui.actionSave_image.triggered.connect(self.data_handler.save_file)
    self.data_handler.enable_saturation_widget.connect(self.enable_saturation_bar)
    self.data_handler.saturation_changed.connect(self.ui.progressBar.setValue)
    self.data_handler.message.connect(self.show_message)
    self.camera_dialog = CameraDialog()
    self.ui.actionChoose_camera.triggered.connect(self.camera_dialog.choose_camera)
    self.camera_dialog.camera_changed.connect(self.data_handler.change_camera)
    self.camera_dialog.choose_first_camera()
    self.ui.actionTune_camera_parameters.triggered.connect(self.tune_pid)
    self.ui.actionShow_Settings.toggled.connect(self.show_settings)
    self.ui.actionDraw_lines.toggled.connect(self.draw_lines)
    self.hline = None
    self.vline = None
def __init__(self, worker_broadcast_addr, filesystem_broadcast_addr,
             tracker_addr_ping, tracker_ip, current_worker_addr, job_url,
             job_id, data_type, client_addr, functions_url, map_data_url,
             status_db_url):
    self.worker_broadcast_addr = worker_broadcast_addr
    self.status_handler = StatusHandler(status_db_url,
                                        self._verify_if_errors_in_fs,
                                        self._reset_method_if_no_answer_from_fs)
    self.data_handler = DataHandler(job_url, job_id,
                                    self._verify_if_errors_in_fs,
                                    self._reset_method_if_no_answer_from_fs)
    self.filesystem_broadcast_addr = filesystem_broadcast_addr
    self.job_url = job_url
    self.file_sys_addrs = []
    self._update_filesystem_nodes()
    self.file_sys_addr = self._get_new_filesystem_node()
    self.client_addr = client_addr
    self.job_id = job_id
    self.current_worker_addr = current_worker_addr
    self.data_type = data_type
    self.states = ["map", "reduce"]
    self.job_phase = self.states[0]
    self.veto_workers = []
    self.tracker_ip = tracker_ip
    self.tracker_addr_ping = tracker_addr_ping
    self.tracker_addr = (tracker_ip, '8080')
    self.delimiters = [' ', '\n']
    self.map_results = None
    self.result_data_url = '{}/result_data'.format(self.job_url)
    self.map_data_url = map_data_url
    self.functions_url = functions_url
    self.status_db_url = status_db_url
    self.phases = [
        'GETWORKERS', 'SLICES', 'SENDTASK', 'WAITANSWERS', 'GETRESULTS', 'DONE'
    ]
    self.load_job_methods = {
        'GETWORKERS': self.getting_workers,
        'SLICES': self.getting_workers,
        'SENDTASK': self._load_send_task_phase,
        'WAITANSWERS': self._load_wait_results,
        'GETRESULTS': self.getting_results,
    }
    self.status_phase = mt.task_phases[0]
    self.pinging_process = None
    self.get_data = self.data_handler.get_line_by_line_str
def get_pre_trained_model():
    pre_trained_model = InceptionV3(input_shape=(image_size, image_size, 3),
                                    include_top=False,
                                    weights=None)
    DataHandler.extract_inception_model(local_weights_file)
    pre_trained_model.load_weights(local_weights_file)
    # freeze the pre-trained layers
    for layer in pre_trained_model.layers:
        layer.trainable = False
    # summary() prints the architecture itself, so it needs no print() wrapper
    pre_trained_model.summary()
    return pre_trained_model
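# Hypothetical follow-up (not part of the original snippet): a minimal sketch
# of attaching a classification head to the frozen base; the binary output
# and optimizer choice are assumptions.
from tensorflow.keras import layers, Model

base = get_pre_trained_model()
x = layers.GlobalAveragePooling2D()(base.output)
output = layers.Dense(1, activation='sigmoid')(x)
model = Model(base.input, output)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])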
def __init__(self, input_size=64, hidden_size=64, n_filters=16):
    # Copy params
    self.input_size = input_size
    self.hidden_size = hidden_size
    self.n_filters = n_filters
    # Initialize data loader
    self.data = DataHandler(image_size=input_size)
    # Initialize model
    self.began_network = BEGANNet(input_size=input_size,
                                  hidden_size=hidden_size,
                                  n_filters=n_filters)
def solver_profiling():
    path_to_covers = '/home/anosov/data/hard_base/covers/case_0.dump'
    h = DataHandler(path_to_covers)
    from LG.solver import Solver as LG_Solver
    i_ = 0
    f = h.product_field(i_)
    for i in xrange(1):
        s = LG_Solver(f)
        a = s.run()
        b = s.alternative_path_lens()
        print a
        print [h.cells[i] for i in h.finished_packed_paths[i_]]
        print b
class NeuralNetwork:
    def __init__(self):
        self.data_handler = DataHandler()
        self.network_model = NetworkModel()

    def train(self, args):
        # load the dataset
        network_input, network_output, vocab_length = self.data_handler.load_dataset(args)
        model = self.network_model.create(network_input, vocab_length)
        # callbacks
        stop_training = StopTrainingCallback()
        checkpoint = tf.keras.callbacks.ModelCheckpoint(constant.MODEL_PATH,
                                                        monitor="acc",
                                                        verbose=1,
                                                        save_best_only=True,
                                                        save_weights_only=False,
                                                        save_freq='epoch')
        history = model.fit(network_input,
                            network_output,
                            epochs=constant.EPOCHS,
                            batch_size=constant.BATCH_SIZE,
                            callbacks=[checkpoint, stop_training])
        self.network_model.plot_loss_and_accuracy(args, history)
        return model

    def run(self, args):
        model = tf.keras.models.load_model(args["model"])
        with open(args["notes"], 'rb') as notes_path:
            notes = pickle.load(notes_path)
        pitches = sorted(set(notes))
        vocab_length = len(set(notes))
        with open(args["partition_info"], 'rb') as partition_info_path:
            partition_info = pickle.load(partition_info_path)
        network_input, network_output = self.data_handler.prepare_sequences(
            notes, partition_info['sequence_length'], vocab_length)
        prediction_output = self.network_model.generate_notes(
            model, network_input, pitches, vocab_length)
        self.data_handler.save_midi(partition_info, prediction_output)
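# Hypothetical usage sketch (not part of the original snippet): the argument
# keys mirror those read in run(); the file paths are placeholders.
nn = NeuralNetwork()
nn.run({
    "model": "model.h5",
    "notes": "notes.pkl",
    "partition_info": "partition_info.pkl",
})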
def system(self):
    if not login.current_user.is_authenticated:
        return redirect(url_for('.login_view'))
    self.disk = DataHandler.getInstance().disk
    self.header = "System"
    return render_template('sb-admin/pages/system.html', admin_view=self)
def applications(self):
    if not login.current_user.is_authenticated:
        return redirect(url_for('.login_view'))
    self.apps = DataHandler.getInstance().apps
    self.header = "Applications"
    return render_template('sb-admin/pages/applications.html', admin_view=self)
def train(self, num_classifiers=50):
    bagged_datasets = DataHandler.create_bagged_datasets(num_classifiers,
                                                         self.examples,
                                                         self.targets)
    for bagged_dataset in bagged_datasets:
        naive_bayes = NaiveBayes(bagged_dataset[0], bagged_dataset[1])
        naive_bayes.train()
        self.nb_classifiers.append(naive_bayes)
def train(self, forest_size=50, tree_depth=10):
    self.forest = []
    bagged_datasets = DataHandler.create_bagged_datasets(forest_size,
                                                         self.examples,
                                                         self.targets)
    for bagged_dataset in bagged_datasets:
        examples = bagged_dataset[0]
        targets = bagged_dataset[1]
        num_attributes = len(examples[0])
        num_chosen_attr = int(sqrt(num_attributes))
        # drop random columns until only sqrt(n) attributes remain
        while len(examples[0]) > num_chosen_attr:
            DataHandler.rm_column(examples,
                                  random.randint(1, len(examples[0]) - 1))
        id3 = ID3(examples, targets)
        id3.train(tree_depth)
        self.forest.append(id3)
def main():
    opts = configs.model_config
    os.environ["CUDA_VISIBLE_DEVICES"] = '0'
    gpu_config = tf.ConfigProto(device_count={'GPU': 1},
                                allow_soft_placement=False,
                                log_device_placement=False)
    gpu_config.gpu_options.allow_growth = True
    sess = tf.Session(config=gpu_config)
    print('starting processing data ...')
    data = DataHandler(opts)
    print('starting initialising model ...')
    opts['r_range_upper'] = data.train_r_max
    opts['r_range_lower'] = data.train_r_min
    model = Model_Decon(sess, opts)
    opts['batch_size'] = 1
    opts['va_sample_num'] = 6
    opts['model_bn_is_training'] = False
    print('starting testing policy using AC_Decon ...')
    ac = AC_Decon(sess, opts, model)
    ac.policy_test(data)
def train_epoch(self, optimizer, training_data, epoch_id=0):
    # epoch_id defaults to an int so the "+ 1" in the progress messages works
    epoch_time = time.time()
    accumulated_loss = 0
    average_losses = []
    training_data_length = len(training_data)
    percent_done = 0
    for index, data in enumerate(training_data):
        sample, target = FloatTensor([[data[0]]]), FloatTensor([data[1]])
        if torch.cuda.is_available():
            sample, target = sample.cuda(0), target.cuda(0)
        sample, target = Variable(sample), Variable(target)
        optimizer.zero_grad()
        output = self(sample)
        loss = self.criterion(output, target)
        loss.backward()
        optimizer.step()
        accumulated_loss += loss.data[0]
        if percent_done - 100 * index // training_data_length != 0:
            percent_done = 100 * index // training_data_length
            average_losses.append(accumulated_loss / (index + 1))
            print('Finished %s%% of epoch %s | average loss: %s' %
                  (percent_done, epoch_id + 1, accumulated_loss / (index + 1)))
    print("Successively trained %s epochs (epoch timer: %s)" %
          (epoch_id + 1, DataHandler.format_time(time.time() - epoch_time)))
    return average_losses
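# Hypothetical training-loop sketch (not part of the original snippet): the
# model class, dataset, and optimizer settings are placeholders.
import torch.optim as optim

net = Net()  # assumed nn.Module subclass that defines train_epoch
optimizer = optim.SGD(net.parameters(), lr=0.01, momentum=0.9)
for epoch_id in range(10):
    average_losses = net.train_epoch(optimizer, training_data, epoch_id)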
from flask import Flask, request, session, g, redirect, url_for, abort, \
    render_template, flash, _app_ctx_stack
from data_handler import DataHandler
import json

dataHandler = DataHandler()
dataHandler.setup_tfidfMatcher()

app = Flask(__name__)


@app.route('/layout.html')
def handle_layout():
    return render_template('layout.html')


@app.route('/')
@app.route('/index.html')
def handle_index():
    resumes = dataHandler.get_resumes()
    return render_template('index.html', resumes=resumes)


@app.route('/add_resume', methods=['POST', 'GET'])
def add_resume():
    error = None
    if request.method == 'POST':
        resume_text = request.form['resume_txt'].strip()
        # print "resume_text:", resume_text
        if len(resume_text) > 0:
            dataHandler.save_resume(resume_text)
def plot_iter(result):
    width = 10
    x = np.arange(10, 101, 10)
    plt.ylim(0.6, 0.68)
    plt.ylabel("Precision")
    plt.xlabel("Iteration")
    plt.bar(x, [val for val in result], width, color="#ababab")
    plt.show()


# plot node number in hidden layer figure
def plot_node(result):
    width = 0.5
    x = np.arange(1, 11, 1)
    plt.ylim(0.62, 0.66)
    plt.ylabel("Precision")
    plt.xlabel("Hidden Layer Node Number")
    plt.bar(x, [val for val in result], width, color="#ababab")
    plt.show()


if __name__ == "__main__":
    handler = DataHandler()
    data = handler.generate_data(TRAIN_FILENAME)
    iteration_test(data)
    node_test(data)
    layer_test(data)
    cross_validation(5, 500, data)
    online_eval()
from data_handler import DataHandler
from matrix_generator import MatrixGenerator

c = DataHandler()
s = c.getData()
print s

a = MatrixGenerator(s)
lis = a.getMatrix()
for index in range(len(lis)):
    print lis[index]
class Tracker(object):
    def __init__(self):
        # mechanize.RobustFactory() allows pages with errors (unclosed tags,
        # etc.) to be read correctly
        self.br = mechanize.Browser(factory=mechanize.RobustFactory())
        # ignore the rules in robots.txt
        self.br.set_handle_robots(False)
        # we have to pretend to be a real browser, otherwise Google won't let us in :D
        self.br.addheaders = [('User-agent',
                               'Mozilla/5.0 (X11; Linux i686; rv:7.0.1) Gecko/20100101 Firefox/7.0.1')]
        # initialize the database
        self.db = DataHandler()

    def askGoogle(self, question):
        self.br.open('http://google.pl')
        # select the form on the page (you can also select by name, but Google
        # has only one form, hence by number)
        self.br.select_form(nr=0)
        # try to load the data from the database
        try:
            links = self.db.load_search(question)
        except KeyError:
            # if the data is not in the database, ask Google
            self.br.form['q'] = question + ' dyskusja'  # we only want pages with a discussion
            self.br.submit()
            # results is simply the opened HTML
            results = self.br.response().read()
            print results[string.find(results, "Około "):string.find(results, "wyników")] + "wyników:"
            # soup is an object ready for parsing
            self.soup = BeautifulSoup(results)
            print self.soup.findAll('a', attrs={'class': 'l'})
            links = [x['href'] for x in self.soup.findAll('a', attrs={'class': 'l'})]
            # second page of results
            fl_tags = self.soup.findAll('a', attrs={'class': 'fl'})
            second_page = ''
            for tag in fl_tags:
                if tag.findAll(text='2') != []:
                    second_page = tag['href']
            print 'second page address:', second_page
            self.br.open(second_page)
            self.soup = BeautifulSoup(self.br.response().read())
            links.extend([x['href'] for x in self.soup.findAll('a', attrs={'class': 'l'})])
            print "\n".join(links)
            self.db.add_search(question, links)
        # drop links that score 0.0 - they are probably not interesting
        links = filter(lambda url: page_rater.rate_URL(url, self.db) > 0.0, links)
        # sort by activity
        links.sort(key=lambda url: page_rater.rate_URL(url, self.db), reverse=True)
        print "Sorted"
        return links

    def getSerializedStats(self, links):
        return map(lambda link: pickle.dumps(self.db.load_link(link)), links)

    def getStats(self, link):
        return self.db.load_link(link)

    # it would be good to check here whether the forum meets some requirements
    # (e.g. whether it is phpBB)
    def openForum(self, URL):
        self.br.open(URL)
        results = self.br.response().read()
        self.soup = BeautifulSoup(results)
        return self.__getSections()

    def __getSections(self):
        res = []
        forumtitles = self.soup.findAll('a', attrs={'class': 'forumtitle'})
        for forumtitle in forumtitles:
            title = u'DZIAŁ: ' + forumtitle.next
            desc = u'OPIS: ' + forumtitle.next.next.next.strip()
            print title
            print desc
            print u'forumtitle: ' + forumtitle['href']
            print forumtitle.parent.findNextSibling('dd', attrs={'class': 'topics'}).next.next.text + ':'
            print forumtitle.parent.findNextSibling('dd', attrs={'class': 'topics'}).next
            print forumtitle.parent.findNextSibling('dd', attrs={'class': 'posts'}).next.next.text + ':'
            print forumtitle.parent.findNextSibling('dd', attrs={'class': 'posts'}).next
            print '-------------------------------'
            res.append(title + '\n' + desc + '\n-------------------------------')
        return res
from jobanalysis.jobdescparser import JobDescParser
from filetotxt import fileToTxt
from jobanalysis.resume import resumeparser
from jobanalysis.similarity.modelsimilarity import ModelSimilarity
import indexer
# assumed imports (not shown in the original snippet): Flask and the
# project's DataHandler module
from flask import Flask, render_template
from data_handler import DataHandler

dbinfo = {}
dbinfo["pagesize"] = 20
dbinfo['dbname'] = "jobaly"
dbinfo['collname'] = "keyword_info_java"
dbinfo['modelcollname'] = dbinfo['collname'] + "_model"

app = Flask(__name__)
dataHandler = DataHandler()
dataHandler.connectJobColl(dbinfo['dbname'], dbinfo['collname'])

UPLOAD_FOLDER = 'uploads/'
ALLOWED_EXTENSIONS = set(['txt', 'pdf', 'doc', 'docx'])
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
app.config['resume'] = ""
app.config['resume_name'] = ""
app.config['keyword'] = ""
app.config['matchjids'] = None

similarity = ModelSimilarity()


@app.route('/layout.html')
def handle_layout():
    return render_template('layout.html')