def extract_metadata(self):
    if not hasattr(self, "_extract_metadata_cache"):
        if self.base_file:
            e = extractor.Extractor()
            self._extract_metadata_cache = e.extractFromFile(
                self.base_file.path.encode())
        elif self.url_field:
            e = extractor.Extractor()
            http = httplib2.Http()
            response, body = http.request(self.url_field)
            self._extract_metadata_cache = e.extractFromData(data=body,
                                                             size=len(body))
        else:
            self._extract_metadata_cache = None
    return self._extract_metadata_cache
def on_idle(self):
    while not self.in_q.empty():
        temp = self.in_q.get()
        if type(temp) is list:
            # draw contours
            contours = temp[0]
            color = temp[1]
            primary_label = temp[2]
            normals = temp[3]
            self.make_display_lists(contours, color / 255.0, primary_label,
                                    normals)
            self.draw()
        else:
            # add new extractor
            args = temp.split()
            location = (int(args[0]), int(args[1]), int(args[2]))
            primary_id = []
            secondary_ids = []
            split_str = re.split(":", args[3])
            primary_id = [int(split_str[0])]
            if split_str[1] != "":
                secondary_ids = [
                    int(label) for label in re.split(',', split_str[1])
                ]
            ids = [primary_id + secondary_ids]
            extr = extractor.Extractor(self.in_q, self.directory, ids,
                                       location, max_x, max_y)
            extracting_worker = threading.Thread(target=extr.run, name="extr")
            extracting_worker.daemon = True
            extracting_worker.start()
def process_code(self, code, reduce_mode_mini=False):
    E = extractor.Extractor([code], self)
    E.extract()
    if len(self.years) > 1:
        max_year = max(self.years)
    else:
        max_year = max(self.years) + 1
    if self.aggregate_visits and not code.startswith('C'):
        aggregate_visits.aggregate_events(
            code, E.get_inpatient_visits(code), self.identifier,
            self.result_dir, reduce_mode_mini=reduce_mode_mini)
    if self.aggregate_readmits:
        unlinked_nodes, nodes, edges = E.get_readmit_nodes_edges(code)
        aggregate_edges.readmits.aggregate_readmits(
            code, unlinked_nodes, nodes, edges, self.identifier,
            self.result_dir, max_year)
    if self.aggregate_revisits:
        subsets = E.get_revisit_nodes_edges(code)
        for k, v in subsets.iteritems():
            aggregate_edges.revisits.aggregate_revisits(
                k, v['unlinked_nodes'], v['nodes'], v['edges'],
                self.identifier, self.result_dir, max_year)
    if self.aggregate_patients:
        aggregate_patients.aggregate_patients(code, E.get_patients(code),
                                              self.identifier,
                                              self.result_dir, self.patients)
def read_metadata_local(inputfile, verbose):
    """
    Metadata extraction from many kinds of files

    @param inputfile: path to the image
    @type inputfile: string
    @param verbose: verbosity
    @type verbose: int
    @rtype: dict
    @return: dictionary with metadata
    """
    # Initialization dict
    meta_info = {}
    # Extraction
    xtract = extractor.Extractor()
    # Get the keywords
    keys = xtract.extract(inputfile)
    # Loop to dump data to the dict
    for keyword_type, keyword in keys:
        meta_info[keyword_type.encode('iso-8859-1')] = \
            keyword.encode('iso-8859-1')
    # Return the dictionary
    return meta_info
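# Hedged usage sketch for read_metadata_local above: the file name and
# verbosity value are invented for illustration, and the call assumes the
# libextractor Python binding is installed.
if __name__ == '__main__':
    meta = read_metadata_local('photo.jpg', 0)
    for key, value in meta.items():
        print("%s: %s" % (key, value))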
def groupDeduction(measure):
    # Feature extraction
    ext = extractor.Extractor()

    if measure == constants.IDENTICAL_WORDS:
        predictedLabels, predictedSets = ext.identicalWordsGroupBaseline(
            prr.testMeanings, prr.testLanguages, rdr.wordforms)
    elif measure == constants.IDENTICAL_PREFIX:
        predictedLabels, predictedSets = ext.identicalPrefixGroupBaseline(
            prr.testMeanings, prr.testLanguages, rdr.wordforms)
    elif measure == constants.IDENTICAL_LETTER:
        predictedLabels, predictedSets = ext.identicalFirstLetterGroupBaseline(
            prr.testMeanings, prr.testLanguages, rdr.wordforms)

    trueLabels = ext.extractGroupLabels(rdr.cognateSets, rdr.wordforms,
                                        prr.testMeanings, prr.testLanguages)

    # Evaluation
    lrn = learner.Learner()
    V1scores = {
        meaningIndex: lrn.computeV1(trueLabels[meaningIndex],
                                    predictedLabels[meaningIndex])
        for meaningIndex in prr.testMeanings
    }

    # Reporting
    output.reportGroup(constants.DEDUCERS[measure], V1scores, rdr.meanings)
    output.saveGroup("output/Group " + constants.DEDUCERS[measure] + ".txt",
                     predictedSets)
def pairwiseDeduction(measure):
    # Feature extraction
    ext = extractor.Extractor()

    if measure == constants.IDENTICAL_WORDS:
        ext.identicalWordsBaseline(prr.examples, prr.labels)
    elif measure == constants.IDENTICAL_PREFIX:
        ext.identicalPrefixBaseline(prr.examples, prr.labels)
    elif measure == constants.IDENTICAL_LETTER:
        ext.identicalFirstLetterBaseline(prr.examples, prr.labels)

    predictions = ext.testExamples.reshape((ext.testExamples.shape[0], ))

    # Evaluation
    lrn = learner.Learner()
    accuracy = lrn.computeAccuracy(ext.testLabels, predictions)
    F1 = lrn.computeF1(ext.testLabels, predictions)
    report = lrn.evaluatePairwise(ext.testLabels, predictions)

    # Reporting
    output.reportPairwiseDeduction(constants.DEDUCERS[measure], prr, accuracy,
                                   F1, report)
    output.savePredictions(
        "output/Pairwise " + constants.DEDUCERS[measure] + ".txt",
        prr.examples[constants.TEST], ext.testExamples, predictions,
        ext.testLabels)

    return predictions
def run(self):
    sys.stdout.write('[nzb2http][downloader] Started\n')
    self.stop_requested = False

    self.extractor = extractor.Extractor(self.get_first_rar_path())
    self.extractor.start()

    for incomplete_file in self.incomplete_files:
        map_result_async = self.pool.map_async(_run_worker,
                                               incomplete_file.segments)
        while not self.stop_requested:
            try:
                map_result = map_result_async.get(1)
                self._write_nzb_file(incomplete_file, map_result)
                sys.stdout.write(
                    '[nzb2http][downloader] Downloaded {0}\n'.format(
                        incomplete_file.path))
                break
            except multiprocessing.TimeoutError:
                pass

    self.pool.terminate()
    self.pool.join()
    sys.stdout.write('[nzb2http][downloader] Stopped\n')
def __init__(self, fld, pool):
    self.creation_date = datetime.now()
    self.pool = pool
    self.fld = fld
    self.rowid = None
    self.thread_id = uuid.uuid4().hex
    self.extractor = extractor.Extractor(self.fld, None, self.pool)
    self.crawl_counter = 0
def extract(input_file):
    sys.path.append('./scripts')
    import extractor
    e = extractor.Extractor(input_file, 'images', True, False, False,
                            '127.0.0.1', None)
    ocwd = os.getcwd()
    (iid, repeated) = e.extract()
    os.chdir(ocwd)
    return (iid, repeated)
def HK2011Pairwise(twoStage=False):
    # 1st Pass
    # Feature extraction
    ext = extractor.Extractor()
    ext.HK2011Baseline(prr.examples, prr.labels)

    # Learning
    lrn = learner.Learner()
    lrn.initSVM(0.1)
    lrn.fitSVM(ext.trainExamples, ext.trainLabels)

    # Prediction
    predictions1 = lrn.predictSVM(ext.testExamples)

    # Evaluation
    accuracy = lrn.computeAccuracy(ext.testLabels, predictions1)
    F1 = lrn.computeF1(ext.testLabels, predictions1)
    report = lrn.evaluatePairwise(ext.testLabels, predictions1)

    # Reporting
    stage = "HK2011 1st Pass"
    output.reportPairwiseLearning(stage, prr, accuracy, F1, report)
    output.savePredictions("output/" + stage + ".txt",
                           prr.examples[constants.TEST], ext.testExamples,
                           predictions1, ext.testLabels)

    # 2nd Pass
    if twoStage:
        # Feature extraction
        ext.appendBinaryLanguageFeatures(prr.examples, prr.labels,
                                         constants.TEST, prr.testLanguages)

        # Learning
        lrn = learner.Learner()
        lrn.initSVM(0.0001)
        lrn.fitSVM(ext.testExamples, predictions1)

        # Prediction
        predictions2 = lrn.predictSVM(ext.testExamples)

        # Evaluation
        accuracy = lrn.computeAccuracy(ext.testLabels, predictions2)
        F1 = lrn.computeF1(ext.testLabels, predictions2)
        report = lrn.evaluatePairwise(ext.testLabels, predictions2)

        # Reporting
        stage = "HK2011 2nd Pass"
        output.reportPairwiseLearning(stage, prr, accuracy, F1, report)
        output.savePredictions("output/" + stage + ".txt",
                               prr.examples[constants.TEST], ext.testExamples,
                               predictions2, ext.testLabels)

        # Significance
        print constants.SIGNIFICANCE.format(
            lrn.computeMcNemarSignificance(ext.testLabels, predictions1,
                                           predictions2))

    return ext, lrn
def main():
    e = extractor.Extractor()
    # e.loadFilesToExtract()  # extracts layers into images
    # e.generate(20)
    c = cutout.Cutout(
        "E:\\Projects\\2019\\The Game\\steam\\generator\\gmic\\gmic.exe",
        "E:\\Projects\\2019\\The Game\\steam\\generator\\skript\\output\\product\\myCutout.gmic",
        "E:\\Projects\\output",
        "E:\\Projects\\output\\out")
    c.runCutout()
def read_file(request):
    global data, urls, ext
    ext = extractor.Extractor('fld.com', None, p)
    p.database.setup_database()
    p.disable_processing = True
    urls = {
        'https://fld.com/arg',
        'http://fld.com/monster/depth',
        'https://fld.com/robot/wut'
    }
    with open('tests/utility/extractor_test_website.html') as f:
        data = f.read()
def import_topo_to_postgis(path):
    '''Import data from the BD TOPO IGN into the postgis database'''
    bd_topo_extractor = extractor.Extractor(path)
    bd_topo_extractor.get_files_by_format('shp')
    bd_topo_extractor.bulk_create()
    for shapefile in bd_topo_extractor.tables:
        bd_topo_extractor.insert_data_from_shapefile(
            shapefile, **bd_topo_extractor.tables[shapefile])
    # bd_topo_extractor.commit_to_database()
    import_to_geoserver(bd_topo_extractor)
def treeFeatureSelection():
    # Feature extraction
    ext = extractor.Extractor()
    ext.appendWordSimilarityFeatures(prr.examples, prr.labels,
                                     ext.allMeasures)

    # Feature selection
    lrn = learner.Learner()
    lrn.initForest(250, 0)
    lrn.fitForest(ext.trainExamples, ext.trainLabels)
    importances = lrn.getForestImportances()

    # Reporting
    for i, feature in enumerate(ext.allMeasures):
        print "{0}: {1:.4f}".format(feature, importances[i])
def on_idle(self):
    timer = time.time()
    while (not self.in_q.empty() and time.time() - timer < .1):
        # resets to black when icon is green since 1.0 and 0.0 % 1 are equal
        self.icon_color = (self.icon_color + .01) % 1
        temp = self.in_q.get()
        if temp[0] == "marker":
            self.pick_location = temp[1:][0]
            self.pick_location[0] = int(
                float(self.pick_location[0] * self.columns) / self.max_x)
            self.pick_location[1] = int(
                float(self.pick_location[1] * self.rows) / self.max_y)
        elif temp[0] == "ids":
            self.num_labels += 1
            label_idx = self.num_labels
            self.label_dict[label_idx] = temp[1:][0][0][0]
            extr = extractor.Extractor(self.in_q, self.directory, temp[1:][0],
                                       self.pick_location, self.max_x,
                                       self.max_y, label_idx)
            self.extractor_dict[temp[1][0][0]] = extr
            extracting_worker = threading.Thread(target=extr.run, name="extr")
            extracting_worker.daemon = True
            extracting_worker.start()
        elif temp[0] == "contours":
            contours = temp[1]
            color = temp[2]
            primary_label = temp[3]
            normals = temp[4]
            label_idx = temp[5]
            if self.make_lists:
                self.make_display_lists(contours, color / 255.0,
                                        primary_label, normals, label_idx)
        elif temp[0] == "limits":
            self.max_x = temp[1]
            self.max_y = temp[2]
            self.layers = temp[3]
        elif temp[0] == "refresh":
            self.refresh()
        elif temp[0] == "remove":
            self.remove_label(temp[1:][0])
        self.st = time.time()
        glutPostRedisplay()

    # set icon to green if processes are done
    if time.time() - self.st > 0.25:
        self.icon_color = np.array((0.0, 1.0, 0.0))
        self.make_lists = True
        glutPostRedisplay()
def setUp(self):
    self.extract = extractor.Extractor()
    self.extract.ROOT_DIR = 'C:\\SIS'
    self.extract.EXCLUDE_DIRS = [
        r'.\SIS\ACBr', r'.\SIS\SISMobile', r'.\SIS\SISDLL'
    ]
    self.extract.EXCLUDE_FILES_WITH = [
        '900A.dfm', 'BARVERTICAL', 'Frame', 'PAI'
    ]
    self.extract.VALID_EXTENSION = '.dfm'
    self.extract.CSV_FILE = 'result.csv'
    self.extract.formList = {}
    if os.path.isfile('result.csv'):
        os.remove('result.csv')
def extract_metadata(sender, instance, field_mapping, force=False):
    """
    Extract and populate metadata from the file itself.

    @force: Overwrite existing metadata
    """
    import extractor as libextractor

    extractor = libextractor.Extractor(lang="en")
    if not extractor:
        return

    all_keywords = extractor.extract(data=instance.file.read(),
                                     size=instance.file.size)
    keywords = dict(all_keywords)

    for attr, field in field_mapping.items():
        if field in keywords and (force or not hasattr(instance, attr)):
            # 1. Extract data
            value = keywords[field].encode('iso-8859-1')

            # 2. Post-extraction processing
            try:
                value = getattr(instance, 'process_metadata_%s' % attr)(value)
            # No value processing defined, maybe try some basic automatic processing
            except AttributeError:
                # Date/time processing
                if isinstance(instance._meta.get_field(field),
                              (models.DateField, models.DateTimeField)):
                    for pattern in ('%Y-%m-%dT%H:%M:%SZ', '%Y%m%d%H%M%S'):
                        try:
                            # String is trimmed to the size of the pattern, assuming
                            # that it is the same length as the string it is matching
                            # (coincidentally, it often is!).
                            value = datetime.strptime(value[:len(pattern)],
                                                      pattern)
                        except ValueError:
                            continue

            # 3. Set the discovered value
            if value:
                setattr(instance, attr, value)

    # TODO: Other keywords might have multiple values; it would be better to
    # handle that properly
    if hasattr(instance, 'plaintext'):
        for key, value in all_keywords:
            if key == 'unknown':
                instance.plaintext += ' ' + value
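# Hypothetical wiring sketch for extract_metadata above. The Document model,
# the attribute names, and the libextractor keyword names in the mapping are
# invented for illustration; a real project supplies its own field_mapping
# and connects the handler to a Django signal such as pre_save.
DOCUMENT_FIELD_MAPPING = {
    'title': 'title',            # model attribute <- libextractor keyword type
    'created': 'creation date',
}

def fill_document_metadata(sender, instance, **kwargs):
    extract_metadata(sender, instance, DOCUMENT_FIELD_MAPPING, force=False)

# pre_save.connect(fill_document_metadata, sender=Document)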
def get_hidden_states(self):
    # TODO: documentation
    """
    """
    ## reset rnn states
    self.model.reset_states()

    ## predict test set
    labels = self.model.predict(self.input, batch_size=BATCH_SIZE)

    ## extract states for all sequences across all chars.
    ex = extractor.Extractor(self.model, [0])
    states = ex.get_states(self.input, batch_size=BATCH_SIZE, unshuffle=True)
    states_perchar = states.reshape(
        self.input.shape[0] * self.input.shape[1], -1)

    return states_perchar
def process_code(self, code):
    E = extractor.Extractor([code], self)
    E.extract()
    if len(self.years) > 1:
        max_year = max(self.years)
    else:
        max_year = max(self.years) + 1
    # if self.aggregate_visits:
    #     aggregate_visits.aggregate_events(code, E.get_inpatient_visits(code), self.identifier, self.result_dir, reduce_mode_mini=False)
    # if self.aggregate_readmits:
    #     unlinked_nodes, nodes, edges = E.get_readmit_nodes_edges(code)
    #     aggregate_edges.readmits.aggregate_readmits(code, unlinked_nodes, nodes, edges, self.identifier, self.result_dir, max_year)
    # if self.aggregate_revisits:
    #     subsets = E.get_revisit_nodes_edges(code)
    #     for k, v in subsets.iteritems():
    #         aggregate_edges.revisits.aggregate_revisits(k, v['unlinked_nodes'], v['nodes'], v['edges'], self.identifier, self.result_dir, max_year)
    # if self.aggregate_patients:
    #     aggregate_patients.aggregate_patients(code, E.get_patients(code), self.identifier, self.result_dir, self.patients)
    raise NotImplementedError
def pairwiseLearning(minimal=False):
    # Feature extraction
    ext = extractor.Extractor()
    ext.consonantPrep = rdr.consonants
    ext.soundClassPrep = rdr.soundClasses

    if minimal:
        ext.appendWordSimilarityFeatures(prr.examples, prr.labels,
                                         ext.minimalMeasures)
        ext.appendPOSTags(prr.examples, prr.labels, rdr.POSTags)
    else:
        ext.appendWordSimilarityFeatures(prr.examples, prr.labels, [
            ext.commonBigramRatio, ext.commonTrigramNumber, ext.bigramDice,
            ext.jaroDistance
        ])
        ext.appendWordSimilarityFeatures(prr.examples, prr.labels,
                                         [ext.identicalWords], rdr.consonants)
        ext.appendWordSimilarityFeatures(
            prr.examples, prr.labels,
            [ext.LCPLength, ext.commonBigramNumber, ext.identicalPrefix],
            rdr.soundClasses)
        ext.appendPOSTags(prr.examples, prr.labels, rdr.POSTags)
        ext.appendLetterFeatures(prr.examples, prr.labels)
        ext.appendSameLanguageGroupFeatures(prr.examples, prr.labels)

    # Learning
    lrn, predictions = learn(ext, 0.0001)

    # Reporting
    stage = "Pairwise Learning"
    accuracy = lrn.computeAccuracy(ext.testLabels, predictions)
    F1 = lrn.computeF1(ext.testLabels, predictions)
    report = lrn.evaluatePairwise(ext.testLabels, predictions)
    output.reportPairwiseLearning(stage, prr, accuracy, F1, report)
    output.savePredictions("output/" + stage + ".txt",
                           prr.examples[constants.TEST], ext.testExamples,
                           predictions, ext.testLabels)

    return ext, lrn
def on_idle(self):
    while not self.in_q.empty():
        # resets to black when icon is green since 1.0 and 0.0 % 1 are equal
        self.icon_color = (self.icon_color + .01) % 1
        temp = self.in_q.get()
        if type(temp) is list:
            # draw contours
            contours = temp[0]
            color = temp[1]
            primary_label = temp[2]
            normals = temp[3]
            self.make_display_lists(contours, color / 255.0, primary_label,
                                    normals)
            self.draw()
        else:
            # add new extractor
            args = temp.split()
            location = (int(args[0]), int(args[1]), int(args[2]))
            primary_id = []
            secondary_ids = []
            split_str = re.split(":", args[3])
            primary_id = [int(split_str[0])]
            if split_str[1] != "":
                secondary_ids = [
                    int(label) for label in re.split(',', split_str[1])
                ]
            ids = [primary_id + secondary_ids]
            extr = extractor.Extractor(self.in_q, self.directory, ids,
                                       location, max_x, max_y)
            extracting_worker = threading.Thread(target=extr.run, name="extr")
            extracting_worker.daemon = True
            extracting_worker.start()
        self.st = time.time()

    if time.time() - self.st > 0.5:
        self.icon_color = np.array((0.0, 1.0, 0.0))
        self.draw()
def makeReq(self):
    self.ensureHtmlDirExists()
    with requests.Session() as c:
        try:
            res = c.post(self.targetUrl,
                         data=self.loginPayload,
                         headers={
                             "Referer": "https://slcm.manipal.edu/loginForm.aspx"
                         })
            if res.url.endswith('loginForm.aspx'):
                # login failure
                self.loginError = True
            else:
                # login success
                self.loginError = False
                if os.path.isdir(self.htmlSavePath + self.username) == False:
                    os.mkdir(self.htmlSavePath + self.username)
                if self.efficient == False:
                    homePageCode = c.get(
                        'https://slcm.manipal.edu/studenthomepage.aspx')
                    self.saveHtmlFile(homePageCode, '_homepage')
                self.gatherHtmlFiles(c)
            # else:
            #     self.loginError = True
        except:
            self.collectionError = True

    if self.loginError == False and self.collectionError == False:
        print('[C] Launching extractor...')
        newData = extractor.Extractor(self.username, self.password)
        newData.scrapeEverything()
        self.attendanceData = newData.attendanceData
        self.marksData = newData.marksData
        if newData.extractionError == True:
            self.errorDuringExtraction = True
    else:
        if self.loginError:
            print('[C] [ERROR] Login Error')
        if self.collectionError:
            print('[C] [ERROR] Collection Error')
acts_file = "../../model/simple_rnn_2layers.pt" elif config_str == "rnn3": config = RNN_CONFIG3 acts_file = "../../model/simple_rnn_2layers_bidir.pt" elif config_str == "lstm1": config = LSTM_CONFIG1 acts_file = "../../model/lstm.pt" elif config_str == "lstm2": config = LSTM_CONFIG2 acts_file = "../../model/lstm_2layers.pt" elif config_str == "lstm3": config = LSTM_CONFIG3 acts_file = "../../model/lstm_2layers_bidir.pt" else: raise ("NOT A VALID CONFIG.") config.output_dim = 2 model = BinarySARNN(config) model.load_state_dict(torch.load(acts_file)) model.to(DEVICE) with open(os.path.join("../../reviews", input_file)) as f: inpt = f.read() json_fname = config_str + "_" + input_file[:-4] + ".json" output = rnndissect.utils.model_utils.classify(model, inpt) ex = extr.Extractor(config, model) ex.activations_to_json(inpt, json_fname)
     This file is part of libextractor.
     (C) 2002, 2003, 2004, 2005 Vidyut Samanta and Christian Grothoff

     libextractor is free software; you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published
     by the Free Software Foundation; either version 2, or (at your
     option) any later version.

     libextractor is distributed in the hope that it will be useful, but
     WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
     General Public License for more details.

     You should have received a copy of the GNU General Public License
     along with libextractor; see the file COPYING. If not, write to the
     Free Software Foundation, Inc., 59 Temple Place - Suite 330,
     Boston, MA 02111-1307, USA.

     A little demo showing how to use the libextractor Python binding.
"""
import extractor
import sys

xtract = extractor.Extractor()
for arg in sys.argv[1:]:
    print "Keywords from %s:" % arg
    keys = xtract.extract(arg)
    for keyword_type, keyword in keys:
        print "%s - %s" % (keyword_type.encode('iso-8859-1'),
                           keyword.encode('iso-8859-1'))
def extract(self):
    ## extract states from the model trained through the test set
    self.rnn.reset_states()

    print("predicting labels...")
    ## predictions to label states positive and negative
    labels = self.rnn.predict(self.data, batch_size=BATCH_SIZE)

    ## extract hidden states for all sequences across all chars.
    ex = extractor.Extractor(self.rnn, [0])
    states = ex.get_states(self.data, batch_size=BATCH_SIZE, unshuffle=True)
    states_perchar = states.reshape(self.data.shape[0] * self.data.shape[1],
                                    -1)

    ## clustering using k-means
    print("Clustering...")
    kmeans = KMeans(n_clusters=self.k_num, random_state=0)
    kmeans.fit(states_perchar)
    print("Complete.")
    print("Inertia: {}".format(kmeans.inertia_))

    #######################################################################
    # search for first occurrence of each state in kmeans.labels_
    # TODO: randomize, and make it pass through all examples
    first_occur = self.find_first(kmeans.labels_)
    # indices correspond to the index in (250000, 16); change first_occur to
    # seq_num and char_num
    first_seq_char = {
        element: [first_occur[element] // 50, first_occur[element] % 50]
        for element in first_occur
    }
    #######################################################################

    ## create state objects and visited/unvisited markers
    states = [State("s{}".format(i)) for i in range(self.k_num)]
    visited = [False] * self.k_num

    ## initializing variables for BFS
    current_state_num = 0
    visited[0] = True
    q = deque()
    q.append(states[0])

    # create DFA object
    dfa = DFA()
    dfa.set_omega(self.alphabet)
    dfa.set_Q(states)
    dfa.set_q_0(states[0])

    prev_input = None
    counter = 0
    threshold = 100

    while q:
        print("Iteration # {}".format(counter))
        print("------------------------------")
        counter += 1
        if counter > threshold:
            raise Warning("while loop doesn't seem to exit")

        current_state = q.popleft()
        current_state_num = current_state.get_state_num()

        # label of the timestep
        label_state = labels[first_occur[current_state_num]]
        # TODO: label_state is a double -- may have to round?
        if label_state == 1:
            dfa.add_F(current_state)

        # locate the first occurrence of current_state
        seq_num = first_seq_char[current_state_num][0]
        char_num = first_seq_char[current_state_num][1]
        sofar_str = self.data[seq_num][:char_num + 1]
        # sofar_str should lead to the state being observed
        # this is naive -- because of padding.

        prev_input = sofar_str[-1]  # this is the last letter in the previous sequence
        print("prev character = {}".format(self.arr2char(prev_input.tolist())))

        for i in self.next_alphabet(prev_input):
            print("Trying character {}".format(self.arr2char(i.tolist())))
            # add i at the back of the test sequence
            new = np.vstack((sofar_str, i))
            len_string = len(i)
            # new will not be 50 in length anymore. This can be remedied by
            # deleting the first len(i) elements, given that they are X's
            if new.shape[0] > 50:
                if any([
                        all(element == np.array([1, 0, 0, 0, 0]))
                        for element in new[:len_string]
                ]):
                    new = new[len_string:]
                else:
                    raise ValueError(
                        "The sequence doesn't start with enough X's")
            elif new.shape[0] < 50:
                insert = np.tile(np.array([1, 0, 0, 0, 0]),
                                 (50 - new.shape[0], 1))
                new = np.vstack((insert, new))

            # ok... so this part is super inefficient, but i think it works:
            # embed the sequence in a full batch so the stateful RNN accepts it
            train_copy = self.data[seq_num:seq_num + BATCH_SIZE].copy()
            train_copy[0] = new

            # plug sequence into RNN model
            new_states = ex.get_states(train_copy,
                                       batch_size=BATCH_SIZE,
                                       unshuffle=True)

            # feature extraction at position first_occurrence + 1
            next_s = new_states[0, -16:]

            # kmeans classification (reshaped to 2D for scikit-learn)
            next_state_num = kmeans.predict(next_s[np.newaxis, :])[0]
            # next_state_num is an int
            next_state = states[next_state_num]

            # if the next state has already been visited, add_delt, but
            # don't enqueue.
            trigger_char = self.arr2char(i.tolist())
            current_state.add_delt(trigger_char, next_state)
            # linking current_state to a State object
            dfa.add_delta(current_state, trigger_char, next_state)

            if not visited[next_state_num]:
                # enqueue, and mark as visited so it is only expanded once
                q.append(next_state)
                visited[next_state_num] = True

    # process of adding to DFA object
    return dfa, states
def train(self, items, limit=None):
    self.clusters = {}
    self.noise = []

    items = list(items)

    # Extract the features we want to use for clustering from the items
    self.extractor = extractor.Extractor()
    self.features = self.extractor.fit_transform(items, limit=limit)

    if self.verbose:
        sys.stderr.write("{0}: Items to cluster\n".format(len(self.features)))

    jobs = os.cpu_count() or -1
    start = time.perf_counter()

    # Initialize the NCD code with our log feature. Currently only
    # one feature is used: the normalized log
    X = ncd.prepare(
        map(lambda features: features[extractor.FEATURE_LOG], self.features))

    # Calculate all the pairwise distances between the items in question.
    # The scikit DBSCAN implementation does this anyway, poorly. So why not
    # do it ahead of time and parallelize it ... which we do here.
    #
    # TODO: This takes forever and is an O(n^2) operation
    # There is significant room for improvement both here, and in the following
    # DBSCAN usage and implementation. Techniques such as feature/item selection,
    # BIRCH, ball trees, or many other things could make this better/faster
    matrix = sklearn.metrics.pairwise.pairwise_distances(X,
                                                         metric=ncd.metric,
                                                         state=ncd.state,
                                                         n_jobs=jobs)

    if self.verbose:
        sys.stderr.write(
            "{0}: Computed distances in {1} seconds on {2} cores\n".format(
                int((len(self.features) * len(self.features)) / 2),
                int(time.perf_counter() - start), jobs))

    # Actually perform the clustering. This is fast compared to above
    min_samples = min(self.min_samples, len(self.features) / 10)
    dbs = sklearn.cluster.DBSCAN(metric='precomputed',
                                 eps=self.eps,
                                 min_samples=min_samples)
    dbs.fit(matrix)
    labels = dbs.labels_

    # Create clusters of all the items
    clusters = {}
    noise = []
    for i, label in enumerate(labels):
        if label == -1:
            noise.append(i)
        else:
            if label not in clusters:
                clusters[label] = []
            clusters[label].append(i)

    self.clusters = {}
    for label, indexes in clusters.items():
        self.clusters[label] = Cluster(label, indexes)
    self.noise = Cluster(None, noise)

    # Print out a rough description of that
    if self.verbose:
        sys.stderr.write("{0}: Clusters ({1} items, {2} noise)\n".format(
            len(self.clusters.keys()),
            len(self.features) - len(noise), len(noise)))

    # Setup our neighbors classifier for predict()
    self.neighbors = sklearn.neighbors.KNeighborsClassifier(
        metric='precomputed', weights='distance')
    self.neighbors.fit(matrix, labels)

    # Collapse the high dimensionality of the matrix data
    #
    # HACK: scikit-learn has a parallelization bug where pairwise_distances
    # returns a non-symmetric array for certain floats when using loky (default)
    # parallelization. The MDS call requires a symmetric array up to E-10
    matrix = sklearn.utils.validation.check_symmetric(matrix)
    self.squashed = sklearn.manifold.MDS(
        n_components=2, dissimilarity='precomputed').fit_transform(matrix)
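# A minimal, self-contained sketch of the precomputed-distance DBSCAN pattern
# used in train() above: compute all pairwise distances first, then cluster on
# the precomputed matrix. Plain Euclidean distance on toy 1-D points stands in
# for the project's NCD metric; ncd.* and the Cluster class are project-specific
# and not reproduced here.
import numpy as np
import sklearn.cluster
import sklearn.metrics.pairwise

points = np.array([[0.0], [0.1], [0.2], [5.0], [5.1], [9.9]])

# Calculate the pairwise distances ahead of time ...
matrix = sklearn.metrics.pairwise.pairwise_distances(points, metric='euclidean')

# ... then cluster on the precomputed matrix, exactly as train() does.
dbs = sklearn.cluster.DBSCAN(metric='precomputed', eps=0.5, min_samples=2)
dbs.fit(matrix)
print(dbs.labels_)  # -1 marks noise; other integers are cluster labels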
max_x = int(sys.argv[4])
max_y = int(sys.argv[5])

ids = []
for label_set in sys.argv[6:len(sys.argv)]:
    primary_id = []
    secondary_ids = []
    split_str = re.split(":", label_set)
    primary_id = [int(split_str[0])]
    if split_str[1] != "":
        secondary_ids = [
            int(label) for label in re.split(',', split_str[1])
        ]
    ids += [primary_id + secondary_ids]

extr = extractor.Extractor(display_queue, directory, ids, location, max_x,
                           max_y)
viewer = Viewer(location, display_queue, directory, max_x, max_y)
handler = handler.Input_Handler(display_queue)
viewer.set_dimensions(extr.rows, extr.columns, extr.layers, extr.w)

extracting_worker = threading.Thread(target=extr.run, name="extr")
input_worker = threading.Thread(target=handler.run, name="input_worker")
input_worker.daemon = True
extracting_worker.daemon = True
extracting_worker.start()
input_worker.start()

viewer.main()
import extractor
import visualizer

sims = extractor.Extractor("file path here")
sims.analyze(n=10, topics=25)
def guess_mimetype(filename=None):
    """Guess mime type of arbitrary file.

    filenames are supposed to be in Unicode
    """
    worst_case = "application/octet-stream"
    _log.debug('guessing mime type of [%s]', filename)

    # 1) use Python libextractor
    try:
        import extractor
        xtract = extractor.Extractor()
        props = xtract.extract(filename=filename)
        for prop, val in props:
            if (prop == 'mimetype') and (val != worst_case):
                return val
    except ImportError:
        _log.debug(
            'module <extractor> (python wrapper for libextractor) not installed'
        )
    except OSError as exc:
        # winerror 126, errno 22
        if exc.errno != 22:
            raise
        _log.exception(
            'module <extractor> (python wrapper for libextractor) not installed'
        )

    ret_code = -1

    # 2) use "file" system command
    #    -i  get mime type
    #    -b  don't display a header
    mime_guesser_cmd = 'file -i -b "%s"' % filename
    # this only works on POSIX with 'file' installed (which is standard, however)
    # it might work on Cygwin installations
    aPipe = os.popen(mime_guesser_cmd, 'r')
    if aPipe is None:
        _log.debug("cannot open pipe to [%s]" % mime_guesser_cmd)
    else:
        pipe_output = aPipe.readline().replace('\n', '').strip()
        ret_code = aPipe.close()
        if ret_code is None:
            _log.debug('[%s]: <%s>' % (mime_guesser_cmd, pipe_output))
            if pipe_output not in ['', worst_case]:
                return pipe_output.split(';')[0].strip()
        else:
            _log.error('[%s] on %s (%s): failed with exit(%s)' %
                       (mime_guesser_cmd, os.name, sys.platform, ret_code))

    # 3) use "extract" shell level libextractor wrapper
    mime_guesser_cmd = 'extract -p mimetype "%s"' % filename
    aPipe = os.popen(mime_guesser_cmd, 'r')
    if aPipe is None:
        _log.debug("cannot open pipe to [%s]" % mime_guesser_cmd)
    else:
        pipe_output = aPipe.readline()[11:].replace('\n', '').strip()
        ret_code = aPipe.close()
        if ret_code is None:
            _log.debug('[%s]: <%s>' % (mime_guesser_cmd, pipe_output))
            if pipe_output not in ['', worst_case]:
                return pipe_output
        else:
            _log.error('[%s] on %s (%s): failed with exit(%s)' %
                       (mime_guesser_cmd, os.name, sys.platform, ret_code))

    # If we end up here we either have an insufficient systemwide
    # magic number file or we suffer from a deficient operating system
    # altogether. It can't get much worse if we try ourselves.
    _log.info("OS level mime detection failed, falling back to built-in magic")
    import gmMimeMagic
    mime_type = gmTools.coalesce(gmMimeMagic.filedesc(filename), worst_case)
    del gmMimeMagic

    _log.debug('"%s" -> <%s>' % (filename, mime_type))
    return mime_type
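# Hedged call sketch for guess_mimetype above: "report.pdf" is an invented
# path; the function itself walks the fallback chain (libextractor, file(1),
# extract, gmMimeMagic) exactly as implemented.
mime = guess_mimetype(filename='report.pdf')
print('detected mime type: %s' % mime)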
def extract(self):
    ## extract states from the model trained through the test set
    self.rnn.reset_states()

    print("predicting labels...")
    ## predictions to label states positive and negative
    labels = self.rnn.predict(self.data, batch_size=BATCH_SIZE)

    ## extract hidden states for all sequences across all chars.
    ex = extractor.Extractor(self.rnn, [0])
    states = ex.get_states(self.data, batch_size=BATCH_SIZE, unshuffle=True)
    states_perchar = states.reshape(
        self.data.shape[0] * self.data.shape[1], -1)

    ## clustering using k-means
    print("Clustering...")
    kmeans = KMeans(n_clusters=self.k_num, random_state=0)
    kmeans.fit(states_perchar)
    print("Complete.")
    print("Inertia: {}".format(kmeans.inertia_))

    ## finding the sequence & char number in testing data corresponding
    ## to every state
    occur = self.find_occurance(kmeans.labels_)

    ## cut down list naively -- may replace with randomized sampling
    for state in occur:
        threshold = int(FRACTION * len(occur[state]))
        replacement = []
        for i in range(threshold):
            replacement.append(occur[state][i])
        occur[state] = replacement

    ## Instantiate dfa object
    dfa = DFA()
    dfa.set_omega(self.alphabet)
    dfa.set_Q(self.k_labels)
    dfa.initiate_delta()
    # TODO: set q_0? Also not sure about F

    ######################################################################
    ## create state objects and visited/unvisited markers
    # states = [State("s{}".format(i)) for i in range(self.k_num)]
    # visited = [False] * self.k_num

    # ## initializing variables for BFS
    # current_state_num = 0
    # visited[0] = True
    # q = deque()
    # q.append(states[0])

    # # create DFA object
    # dfa = DFA()
    # dfa.set_omega(self.alphabet)
    # dfa.set_Q(states)
    # dfa.set_q_0(states[0])
    ######################################################################

    prev_input = None
    total_processes = sum([len(element) for element in occur.values()])
    print("{} total processes to be run".format(total_processes))

    import ipdb
    ipdb.set_trace()

    block = total_processes / 50.0
    counter = 0
    for cluster in occur.keys():
        # occur[cluster] holds list of tuples
        instances = occur[cluster]
        for (seq_num, char_num) in instances:
            ## Display loading bar:
            counter += 1
            print("#" * int(counter / block) +
                  " " * (50 - int(counter / block)) +
                  "| {} / {}".format(counter, total_processes))

            ## Find the corresponding section of the test data
            chopped = self.data[seq_num][:char_num + 1]
            prev_input = chopped[-1]
            print("prev character: {}".format(
                self.arr2char(prev_input.tolist())))

            for i in self.next_alphabet(prev_input):
                print("trying character: {}".format(
                    self.arr2char(i.tolist())))
                new_seq = np.vstack((prev_input, i))
                if new_seq.shape[0] > 50:
                    over = new_seq.shape[0] - 50
                    ## check if the first "over" elements are X's; if not,
                    ## this is an unusable example
                    if all([
                            np.array_equal(new_seq[i],
                                           np.array([1, 0, 0, 0, 0]))
                            for i in range(over)
                    ]):
                        new_seq = new_seq[over:]
                    else:
                        print("".join(
                            [self.arr2char(row) for row in new_seq]) +
                              " does not contain enough X's")
                        pass  # skip out of for loop
                elif new_seq.shape[0] < 50:
                    over = 50 - new_seq.shape[0]
                    insert = np.tile(np.array([1, 0, 0, 0, 0]), (over, 1))
                    new_seq = np.vstack((insert, new_seq))

                ## putting sequence into block of size BATCH_SIZE
                train_copy = self.data[seq_num:seq_num + BATCH_SIZE].copy()
                train_copy[0] = new_seq

                ## Find hidden state
                new_states = ex.get_states(train_copy,
                                           batch_size=BATCH_SIZE,
                                           unshuffle=True)
                next_s = new_states[0, -16:]

                ## use k-means to predict the quantized state for the hidden state
                next_state_num = kmeans.predict(next_s[np.newaxis, :])[0]

                ## increment the (state1, transition, state2) count in the
                ## dfa delta function
                transition = self.arr2char(i.tolist())
                dfa.add_delta(cluster, transition, next_state_num)

    # convert counts in dfa.delta to probabilities.
    dfa.delta_count2prob()

    return dfa