def test_unit_bytes_per_second(self):
    ft = Featurizer()
    ratio_1 = ft.bytes_per_second(0, 0)
    self.assertEqual(0, ratio_1)
    ratio_2 = ft.bytes_per_second(5, 0)
    self.assertEqual(5, ratio_2)
    ratio_3 = ft.bytes_per_second(5, 5)
    self.assertEqual(1, ratio_3)
def featurize_data(self, data, models):
    logging.warning('featurizing train...')
    f = Featurizer(self.conf)
    sample, labels = f.featurize(data, models)
    self.labels = array(labels)
    # get word pairs and headers
    self.header, self.words = f.convert_to_wordpairs(sample)
    logging.info('HEADERS: {0}'.format(self.header))
    # print [s.features for s in sample]
    logging.info('converting table...')
    self.data = f.convert_to_table(sample)
    logging.info('data shape: {0}'.format(self.data.shape))
    logging.info('labels shape: {0}'.format(self.labels.shape))
    self.feats = f._feat_order
def worker_task(files, args, worker_id):
    featurizer = Featurizer()
    cooc = {}
    Counter = 0  # Number of articles processed
    file_count = 0
    for inputfile in files:
        file_name, _ = splitext(basename(inputfile))
        output_file = open(join(args.outputpath, file_name + ".cooc"), 'wb')
        sys.stdout.write("{}: Processing file:{}\n".format(worker_id, inputfile))
        dic = {}
        with open(inputfile, "r") as F:
            read = 0
            while True:
                try:
                    text = F.read(4096)
                    if len(text) == 0:
                        break
                    read += len(text)
                    process_features(text, featurizer, cooc, args.window_size)
                except:
                    break
                print "read:{}/{}, {}%".format(read, 1E9, read / float(1E9) * 100)
        # dump the gram info
        dump_cooc_to_file(worker_id, cooc, output_file)
        output_file.close()
        sys.stdout.write("{}: Finished processing file:{}\n".format(worker_id, inputfile))
        file_count += 1
        # clear up
        del dic
def test_unit_featurizer(self):
    ft = Featurizer()
    features = '0,0,0,0,1.0'
    key1 = '00:00:00:00:00:01_00:00:00:00:00:02_6_2'
    key2 = '00:00:00:00:00:02_00:00:00:00:00:01_6_2'
    key3 = '00:00:00:00:00:02_00:00:00:00:00:03_6_2'
    flow_dict = {}
    flow_dict[key1] = "00:00:00:00:00:01,00:00:00:00:00:02,6,2,8,8,0,480,480,0,1,0,1\n"
    flow_dict[key2] = "00:00:00:00:00:02,00:00:00:00:00:01,6,2,8,8,0,480,480,0,1,0,1\n"
    flow_dict[key3] = "00:00:00:00:00:02,00:00:00:00:00:03,6,2,8,8,0,480,480,0,1,0,1\n"
    stat = ['00:00:00:00:00:01', '00:00:00:00:00:02', '6', '2', '0', '0', '0',
            '0', '0', '0', '1', '0', '0']
    self.assertEqual(features, ft.featurizer(stat, flow_dict))
def worker_task(files, args, worker_id):
    labeler = FeatureLabelerHungry()
    featurizer = Featurizer(labeler=labeler)
    Counter = 0  # Number of articles processed
    file_count = 0
    for inputfile in files:
        sys.stdout.write("{}: Processing file:{}\n".format(worker_id, inputfile))
        dic = {}
        with open(inputfile, "r") as F:
            # All articles begin with '<doc' and end with '</doc>'
            # for line in F:
            #     if line.startswith("<doc"):
            #         continue
            #     if line.startswith("</doc>"):
            #         # some paragraph ends
            #         Counter += 1
            #         continue
            read = 0
            it = 0
            while True:
                try:
                    # filter_with_alphabet(sanitize_line(F.read(1024)), args.alphabet)
                    text = F.read(4096)
                    if len(text) == 0:
                        break
                    read += len(text)
                    features = featurizer.featurize(text)
                    labeler.increment_features(features)
                except:
                    break
                print "read:{}/{}, {}%".format(read, 1E9, read / float(1E9) * 100)
        # dump the gram info
        file_name, _ = splitext(basename(inputfile))
        output_file = join(args.outputpath, file_name + ".fcount")
        labeler.dump(output_file)
        sys.stdout.write("{}: Finished processing file:{}\n".format(worker_id, inputfile))
        file_count += 1
        # clear up
        del dic
def test_unit_pair_flow_ratio(self):
    ft = Featurizer()
    key1 = '00:00:00:00:00:01_00:00:00:00:00:02_6_2'
    key2 = '00:00:00:00:00:02_00:00:00:00:00:01_6_2'
    key3 = '00:00:00:00:00:02_00:00:00:00:00:03_6_2'
    flow_dict = {}
    flow_dict[key1] = "00:00:00:00:00:01,00:00:00:00:00:02,6,2,8,8,0,480,480,0,20,0,1\n"
    flow_dict[key2] = "00:00:00:00:00:02,00:00:00:00:00:01,6,2,8,8,0,480,480,0,10,0,1\n"
    flow_dict[key3] = "00:00:00:00:00:02,00:00:00:00:00:03,6,2,8,8,0,480,480,0,5,0,1\n"
    stat = ['00:00:00:00:00:01', '00:00:00:00:00:02', '6', '2', '8', '8', '0',
            '480', '480', '0', '20', '0', '1\n']
    stat1 = ['00:00:00:00:00:02', '00:00:00:00:00:03', '6', '2', '8', '8', '0',
             '480', '480', '0', '5', '0', '1\n']
    ratio_1 = ft.pair_flow_ratio(stat, flow_dict)
    self.assertEqual(2.0, ratio_1)
    ratio_2 = ft.pair_flow_ratio(stat1, flow_dict)
    self.assertEqual(5, ratio_2)
def worker_task(files, args, worker_id):
    featurizer = Featurizer(Settings())
    Counter = 0  # Number of articles processed
    file_count = 0
    text = []
    for inputfile in files:
        sys.stdout.write("{}: Processing file:{}\n".format(worker_id, inputfile))
        dic = {}
        with open(inputfile, "r") as F:
            file_name, _ = splitext(basename(inputfile))
            F_out = open(join(args.outputpath, file_name + ".features"), "w")
            text = []
            # All articles begin with '<doc' and end with '</doc>'
            for line in F:
                if line.startswith("<doc"):
                    continue
                if line.startswith("</doc>"):
                    # some paragraph ends
                    featurizer.featurize(text)
                    Counter += 1
                    if Counter % 1 == 0:
                        sys.stdout.write(
                            "{}: Finished processing article:{}\n".format(
                                worker_id, Counter))
                    if Counter % 50 == 0:
                        exit(0)
                    text = []
                    continue
                text.extend(
                    word_tokenize(
                        filter_with_alphabet(sanitize_line(line), args.alphabet)))
                # F_out.write(str(featurizer.featurize(word_tokenize(
                #     filter_with_alphabet(sanitize_line(line), args.alphabet)))))
                # text.extend()
            # F_out.close()
        sys.stdout.write("{}: Finished processing file:{}\n".format(worker_id, inputfile))
        file_count += 1
        # clear up
        del dic
def main():
    if opts.input is None:
        docs_in = sys.stdin
    else:
        docs_in = open(opts.input)
    if opts.output is None:
        scores_out = sys.stdout
    else:
        scores_out = open(opts.output, 'w')
    bundle = pickle.load(open(opts.model))
    clf = bundle['clf']
    dv = bundle['dv']
    ftzr = Featurizer(parsecachepath=opts.cache, use=opts.features)
    if opts.preproc == 'nltk':
        preprocessor = NLTKPreprocessor()
    else:
        preprocessor = StanfordPreprocessor()
    for doc in docs_in:
        if doc.strip() == '':
            scores_out.write('\n')
        else:
            if opts.nosplit:
                avg, fstr = score(doc, ftzr, dv, clf)
                out = '%s' % avg
                if opts.dump:
                    out += '\t%s' % fstr
            else:
                sentences = preprocessor.parse(doc)['sentences']
                avg, scores, fstrs = score_doc(sentences, ftzr, dv, clf)
                out = '%s' % avg
                if opts.sentscores:
                    out += '\t%s' % (','.join(['%f' % s for s in scores]))
                if opts.dump:
                    out += '\t%s' % ','.join(fstrs)
            scores_out.write('%s\n' % out)
    scores_out.close()
    ftzr.close()
def synthesizeUniqueFeatures(self, intBaseFeat, boolBaseFeat, baseFeatureValues, exclude):
    syntFeats: Tuple[PrecisFeature] = self.featureSynthesizer.synthesizeFeatures(
        intBaseFeat, boolBaseFeat, baseFeatureValues)
    # if boolBaseFeat empty, no derived bool features will be generated -> consider refactor
    genFeats: Tuple[PrecisFeature] = self.featureSynthesizer.GenerateDerivedFeatures(
        intBaseFeat, boolBaseFeat)
    derivFeats: Tuple[PrecisFeature] = Featurizer.mergeSynthesizedAndGeneratedFeatures(
        syntFeats, genFeats)
    uniqueDerivFeats = tuple([f for f in derivFeats if f not in exclude])
    return uniqueDerivFeats
def worker_task(files, args, worker_id):
    featurizer = Featurizer()
    cooc = {}  # in format (word1, word2) : count
    Counter = 0  # Number of articles processed
    file_count = 0
    text = []
    for inputfile in files:
        tokens_count = 0
        file_name, _ = splitext(basename(inputfile))
        F_out = open(join(args.outputpath, file_name + ".cooc_chunked"), 'wb')
        sys.stdout.write("{}: Processing file:{}\n".format(worker_id, inputfile))
        with open(inputfile, "r") as F:
            text = []
            chars = 0
            # All articles begin with '<doc' and end with '</doc>'
            for line in F:
                if line.startswith("<doc"):
                    continue
                if line.startswith("</doc>"):
                    # some paragraph ends
                    tokens_count += process(" ".join(text), featurizer, cooc,
                                            args.window_size)
                    text = []
                    chars = 0
                    Counter += 1
                    if Counter % 500 == 0:
                        sys.stdout.write(
                            "{}: Finished processing article:{}\n".format(
                                worker_id, Counter))
                        dump_cooc_to_file(worker_id, cooc, F_out)
                        cooc = {}
                    continue
                text.append(line)
                # Cannot be longer than 100000
                chars += len(line)
                if chars > 10000:
                    tokens_count += process_features(" ".join(text), featurizer,
                                                     cooc, args.window_size)
                    text = []
                    chars = 0
        dump_cooc_to_file(worker_id, cooc, F_out)
        cooc = {}
        F_out.close()
        sys.stdout.write("{}: Finished processing file:{}: {} tokens\n".format(
            worker_id, inputfile, tokens_count))
        file_count += 1
def test_unit_packet_pair_ratio(self):
    ft = Featurizer()
    ratio_1 = ft.packet_pair_ratio(0, 0)
    self.assertEqual(0, ratio_1)
    ratio_2 = ft.packet_pair_ratio(5, 0)
    self.assertEqual(0, ratio_2)
    ratio_3 = ft.packet_pair_ratio(5, 5)
    self.assertEqual(1, ratio_3)
    ratio_4 = ft.packet_pair_ratio(0, 1)
    self.assertEqual(1, ratio_4)
def test_unit_bytes_per_packet(self):
    ft = Featurizer()
    ratio_1 = ft.bytes_per_packet(0, 0)
    self.assertEqual(0, ratio_1)
    ratio_2 = ft.bytes_per_packet(0, 5)
    self.assertEqual(0, ratio_2)
    ratio_3 = ft.bytes_per_packet(5, 5)
    self.assertEqual(1, ratio_3)
    ratio_4 = ft.bytes_per_packet(1, 0)
    self.assertEqual(0, ratio_4)
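# The ratio unit tests above only pin down the expected return values of the
# Featurizer helpers.  The class below is a minimal sketch of implementations
# that would satisfy those assertions; it is an illustration, not the actual
# Featurizer code from this project, and the class name and argument names
# (byte_count, seconds, src_pkts, dst_pkts, pkts) are assumptions.
class FeaturizerRatioSketch(object):
    def bytes_per_second(self, byte_count, seconds):
        # The tests expect the raw byte count back when the duration is zero.
        if seconds == 0:
            return byte_count
        return byte_count / float(seconds)

    def packet_pair_ratio(self, src_pkts, dst_pkts):
        # One reading consistent with the tests: replies per request,
        # defaulting to 1 when there are replies but no requests.
        if src_pkts == 0:
            return 1 if dst_pkts > 0 else 0
        return dst_pkts / float(src_pkts)

    def bytes_per_packet(self, byte_count, pkts):
        # Zero packets means no meaningful average, so return 0.
        if pkts == 0:
            return 0
        return byte_count / float(pkts)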
if __name__ == "__main__": parser = argparse.ArgumentParser( description='For given wikipedia dump files, \ generate dump of article n-gram statistics') parser.add_argument('-e', '--embedding', type=str, help='path to embedding txt') parser.add_argument('--raw', action='store_true') args = parser.parse_args() print args with tf.Graph().as_default(): config = Config() featurizer = Featurizer() model = Model(config) loader = Loader(featurizer) if args.raw: # prep data loader.load_raw() exit(0) embeddings, embed_size = featurizer.labeler.load_embedding( args.embedding) # embed_size = 100 config.dim_embedding = embed_size loader.load() # model.add_embeddings() model.add_embeddings(embeddings)
class FlowCleaning:
    featurizer = Featurizer()

    #############################################################################
    # flow_stat_clean(live, batch_number, poll_dur)
    #
    # Function to handle the full cleaning process.
    # Either cleans all training batches,
    # or cleans the batch with the specified batch number for live classification.
    #
    # Args:
    #   live: boolean, True for live classification or False for training
    #   batch_number: batch number specifying the file to clean for live classification
    #   poll_dur: polling duration of stats used in feature generation
    #
    # Outputs the cleaned flow batch to a new .csv file ready for classification
    #
    def flow_stat_clean(self, live, batch_number, poll_dur):
        if not live:
            file_num = 1
            # Clean all files in the training directory
            while (os.path.isfile("Neptune/stats_training/output" + str(file_num) + ".csv")):
                flow = "Neptune/stats_training/output" + str(file_num) + ".csv"
                target = "Neptune/stats_training/output" + str(file_num) + "_target.txt"
                try:
                    flow_stats = open(flow, 'r')
                    flow_target = open(target, 'r')
                except:
                    logging.error('Unable to open stats and target files')
                batch_agg = self.batch_aggregate(flow_stats, flow_target, False)
                clean_dir = "Neptune/stats_training/output" + str(file_num) + "_cleaned.csv"
                target_dir = "Neptune/stats_training/output" + str(file_num) + "_target_cleaned.txt"
                self.batch_cleaning(clean_dir, target_dir, batch_agg, False, poll_dur)
                file_num += 1
        else:
            file_num = batch_number
            flow_dir = "Neptune/stats_live/output" + str(file_num) + ".csv"
            try:
                flow_stats = open(flow_dir, 'r')
            except:
                logging.error('Unable to open: ' + str(flow_dir))
            batch_agg = self.batch_aggregate(flow_stats, -1, True)
            clean_dir = "Neptune/stats_live/output" + str(file_num) + "_cleaned.csv"
            self.batch_cleaning(clean_dir, -1, batch_agg, True, poll_dur)

    #############################################################################
    # batch_cleaning(clean_dir, target_dir, batch_agg, live, poll_dur)
    #
    # Outputs the cleaned stats with new features to the appropriate file.
    #
    # Args:
    #   clean_dir: directory for the output of cleaned stats
    #   target_dir: directory for the adjusted target/ground truth values
    #   batch_agg: dictionary of cleaned statistics
    #   live: boolean, True if live classification
    #   poll_dur: polling duration of stats used in feature generation
    #
    def batch_cleaning(self, clean_dir, target_dir, batch_agg, live, poll_dur):
        try:
            flow_cleaned = open(clean_dir, 'w')
        except:
            logging.error('Unable to open flow_cleaned file')
        # Final cleaned feature labels
        flow_cleaned.write(
            "eth_src,eth_dst,ip_proto,state_flag,pkts,src_pkts,dst_pkts,bytes,src_bytes,dst_bytes,"
            + "pkts_per_sec,bytes_per_second,bytes_per_packet,packet_pair_ratio,pair_flow\n")
        if not live:
            try:
                target_cleaned = open(target_dir, 'w')
                target_cleaned.write("target\n")
            except:
                logging.error('Unable to open target cleaned file')
        # Generate new features for each flow and write each flow stat to file
        for i in batch_agg:
            stat = batch_agg[i].split(",")
            stat[11] = int(poll_dur)
            features = self.featurizer.featurizer(stat, batch_agg)
            flow_cleaned.write("{},{},{},{},{},{},{},{},{},{},{}\n".format(
                stat[0], stat[1], stat[2], stat[3], stat[4], stat[5], stat[6],
                stat[7], stat[8], stat[9], features))
            if not live:
                if '\n' not in str(stat[11]):
                    target_cleaned.write("{}".format(stat[12]))
                else:
                    target_cleaned.write("{}".format(stat[12]))

    #############################################################################
    # batch_aggregate(flow_stats, flow_target, live)
    #
    # Aggregates flows with the same src, dst and protocol together.
    # This provides statistics on an eth_src->eth_dst basis.
    # Also generates new flow values to enable further features to be calculated
    # by the Featurizer class.
    #
    # Args:
    #   flow_stats: array of flow stat records
    #   flow_target: array of ground truth values corresponding to flow stats
    #
    # Returns:
    #   batch_dict: dictionary of aggregated flow statistics using src, dst
    #               and protocol as unique key values
    #
    def batch_aggregate(self, flow_stats, flow_target, live):
        if not live:
            target_lines = flow_target.readlines()
        batch_dict = {}
        line_number = 0
        first_line_flag = True
        clean_calc = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
        flow_stats.seek(0)
        for line in flow_stats:
            if first_line_flag:
                first_line_flag = False
                line_number += 1
                continue
            stats = line.split(",")
            if str(stats[6]) == 'man':
                continue
            if 's' in str(stats[15]):
                state_flag = 2
            else:
                state_flag = -1
            key = (str(stats[0]) + "_" + str(stats[1]) + "_" + str(stats[6]) +
                   "_" + str(state_flag))
            target = -1
            if not live:
                target = str(target_lines[line_number])
            # If the flow stat exists in the dictionary, aggregate the counted values such as pkt_count
            if key in batch_dict:
                old_stats = batch_dict[key].split(",")
                for i in range(len(old_stats)):
                    if i >= 4 and i <= 9:
                        clean_calc[i] = int(stats[i + 4]) + int(old_stats[i])
                    elif i == 10:
                        clean_calc[i] = int(old_stats[i]) + 1
                # Set target value based on previous flow stat
                if old_stats[len(old_stats) - 1] == 1 and target == 0:
                    target = 1
                batch_dict[key] = (
                    str(stats[0]) + "," + str(stats[1]) + "," + str(stats[6]) + "," +
                    str(state_flag) + "," + str(clean_calc[4]) + "," +
                    str(clean_calc[5]) + "," + str(clean_calc[6]) + "," +
                    str(clean_calc[7]) + "," + str(clean_calc[8]) + "," +
                    str(clean_calc[9]) + "," + str(clean_calc[10]) + "," +
                    str(0) + "," + str(target))
            else:
                batch_dict[key] = (
                    str(stats[0]) + "," + str(stats[1]) + "," + str(stats[6]) + "," +
                    str(state_flag) + "," + str(stats[8]) + "," + str(stats[9]) + "," +
                    str(stats[10]) + "," + str(stats[11]) + "," + str(stats[12]) + "," +
                    str(stats[13]) + "," + str(1) + "," + str(0) + "," + str(target))
            line_number += 1
        return batch_dict

    #############################################################################
    # aggregate_stats(dir)
    #
    # Aggregates all individual cleaned stat files in the training directory
    # into one file to train on.
    #
    # Outputs the aggregated flow file and target file
    #
    def aggregate_stats(self, dir):
        flow_stats = open(dir + "FlowStats_cleaned.csv", "w")
        flow_target = open(dir + "FlowStats_target_cleaned.txt", "w")
        # Process first file and include header labels
        file_num = 1
        for line in open(dir + "output" + str(file_num) + "_cleaned.csv"):
            flow_stats.write(line)
        for line in open(dir + "output" + str(file_num) + "_target_cleaned.txt"):
            flow_target.write(line)
        file_num += 1
        # Process the remainder, excluding label headers
        while (os.path.isfile(dir + "output" + str(file_num) + "_cleaned.csv")):
            flow = open(dir + "output" + str(file_num) + "_cleaned.csv")
            target = open(dir + "output" + str(file_num) + "_target_cleaned.txt")
            flow.next()
            target.next()
            for line in flow:
                flow_stats.write(line)
            for line in target:
                flow_target.write(line)
            file_num += 1
        flow_stats.close()
        flow_target.close()
    'description', 'cross', 'north', 'south', 'east', 'west', '-PRON-', 'pron',
    'nee', 'regard', 'shall', 'use', 'win', 'park', 'point', 'biking', 'follow',
    'single', 'track', 'intersection', 'trailhead', 'head', 'good', 'great',
    'nice', 'time', 'include', 'place', 'come', 'downhill', 'look', 'near'
])
bitri_stops = set([
    'parking_lot', 'trail_starts', 'mile_turn', 'north_south', 'mountain_bike',
    'mountain_biking', 'single_track', 'mountain_bike_trail', 'trail_head'
])
second_stopwords = my_stopwords.union(STOPWORDS).union(bitri_stops)

# Gensim LDA
st_featurizer = Featurizer(first_stopwords=first_stopwords,
                           second_stopwords=second_stopwords,
                           bigrams=True,
                           trigrams=True)
processed_docs = st_featurizer.featurize(X)
bow_corpus, id2word = make_gensim_bow(processed_docs, no_below=3, no_above=0.6,
                                      keep_n=10000)
k = 6
lda_model = LdaMulticore(bow_corpus,
                         num_topics=k,
                         id2word=id2word,
                         passes=5,
                         workers=2,
                         iterations=100)
perplexity, coherence = get_perplexity_coherence(lda_model, bow_corpus,
def process_dataset(core_path, refined_path, dataset_name, output_path, cutoff):
    core_set_list = [x for x in os.listdir(core_path) if len(x) == 4]
    refined_set_list = [x for x in os.listdir(refined_path) if len(x) == 4]
    path = refined_path

    # atomic sets for long-range interactions
    atom_types = [6, 7, 8, 9, 15, 16, 17, 35, 53]
    atom_types_ = [6, 7, 8, 16]

    # atomic feature generation
    featurizer = Featurizer(save_molecule_codes=False)
    processed_dict = {}
    for name in tqdm(os.listdir(path)):
        if len(name) != 4:
            continue
        processed_dict[name] = gen_feature(path, name, featurizer)

    # interaction features
    processed_dict = pairwise_atomic_types(path, processed_dict, atom_types, atom_types_)

    # load pka (binding affinity) data
    pk_dict = load_pk_data(path + 'index/INDEX_general_PL_data.2016')
    data_dict = processed_dict
    for k, v in processed_dict.items():
        v['pk'] = pk_dict[k]
        data_dict[k] = v

    refined_id, refined_data, refined_pk = [], [], []
    core_id, core_data, core_pk = [], [], []
    for k, v in tqdm(data_dict.items()):
        ligand = (v['lig_fea'], v['lig_co'], v['lig_atoms'], v['lig_eg'])
        pocket = (v['pock_fea'], v['pock_co'], v['pock_atoms'], v['pock_eg'])
        graph = cons_lig_pock_graph_with_spatial_context(ligand, pocket,
                                                         add_fea=3, theta=cutoff,
                                                         keep_pock=False,
                                                         pocket_spatial=True)
        cofeat, pk = v['type_pair'], v['pk']
        graph = list(graph) + [cofeat]
        if k in core_set_list:
            core_id.append(k)
            core_data.append(graph)
            core_pk.append(pk)
            continue
        refined_id.append(k)
        refined_data.append(graph)
        refined_pk.append(pk)

    # split train and valid
    train_idxs, valid_idxs = random_split(len(refined_data), split_ratio=0.9,
                                          seed=2020, shuffle=True)
    train_g = [refined_data[i] for i in train_idxs]
    train_y = [refined_pk[i] for i in train_idxs]
    valid_g = [refined_data[i] for i in valid_idxs]
    valid_y = [refined_pk[i] for i in valid_idxs]
    train = (train_g, train_y)
    valid = (valid_g, valid_y)
    test = (core_data, core_pk)

    with open(os.path.join(output_path, dataset_name + '_train.pkl'), 'wb') as f:
        pickle.dump(train, f)
    with open(os.path.join(output_path, dataset_name + '_val.pkl'), 'wb') as f:
        pickle.dump(valid, f)
    with open(os.path.join(output_path, dataset_name + '_test.pkl'), 'wb') as f:
        pickle.dump(test, f)
def learn3(self, k, intBaseFeat, boolBaseFeat, baseFeatureValues, exclude, call):
    # on the empty set of data points, return true
    if len(baseFeatureValues) == 0:
        print("called learn3 with 0 feature vectors")
        logger.info("called learn3 with 0 feature vectors")
        return PrecisFormula(BoolVal(False))
    # rename splitIntoBoolAndIntFeatureVectors
    (intBaseFeatVectors, boolBaseFeatVectors) = Featurizer.getBoolAndIntFeatureVectors(
        intBaseFeat, boolBaseFeat, baseFeatureValues)
    derivFeats = self.synthesizeUniqueFeatures(intBaseFeat, boolBaseFeat,
                                               baseFeatureValues, exclude)
    derivFeatVectors: List[FeatureVector] = Featurizer.generateDerivedFeatureVectors(
        derivFeats, intBaseFeat + boolBaseFeat, baseFeatureValues)
    # assert(len(baseFeatureValues) == len(derivFeatVectors))
    boolFvs = Featurizer.mergeFeatureVectors(boolBaseFeatVectors, derivFeatVectors)
    houdini = Houdini()
    (allTrueFormula, indicesAllwaysTrue) = houdini.learn2(boolBaseFeat + derivFeats,
                                                          boolFvs, call)
    logger.info("Houdini AlwaysTrue for k=" + str(k) + " : " +
                allTrueFormula.toInfix() + "\n")
    if k == 0:
        return allTrueFormula
    else:
        # removing features returned by houdini and their corresponding feature vector entries.
        (remainingBaseBoolFeat, remainingDerivBoolFeat, featuresRemoved) = \
            self.removeFeatureFromFeaturelist(boolBaseFeat, derivFeats, indicesAllwaysTrue)
        (reaminingEntriesBaseBoolFv, reaminingEntriesDerivBoolFv) = \
            self.removeFeatureEntryInFeatureVectors(boolBaseFeatVectors,
                                                    derivFeatVectors, indicesAllwaysTrue)
        # features that are true on the parent node should not be passed down to children;
        # (they are redundantly also true in child nodes)
        exclude = exclude + featuresRemoved
        lookAhead = len(intBaseFeatVectors[0])
        ######################################
        # bug: chooseFeatureImplication does not update remaining bool features or
        # feature vectors. idx is with respect to updates
        (f, idx, posBaseFv, negBaseFv, remainingBaseBoolFeat, remainingDerivBoolFeat) = \
            self.chooseFeatureImplication(allTrueFormula, intBaseFeat, remainingBaseBoolFeat,
                                          remainingDerivBoolFeat,
                                          Featurizer.mergeFeatureVectors(intBaseFeatVectors,
                                                                         reaminingEntriesBaseBoolFv),
                                          reaminingEntriesDerivBoolFv, lookAhead, call)
        ######################################
        if idx < 0:
            print("Predicate: " + call + " for k = " + str(k) + " : None")
            logger.info("Predicate: " + call + " for k = " + str(k) + " : None" + "\n")
            return allTrueFormula
        # TODO: choose should return boolBasePosFv and intBasePosFv ...
        # (f, idx, posBaseFv, negBaseFv) = \
        #     self.chooseFeature2(remainingBaseBoolFeat + remainingDerivBoolFeat,
        #         Featurizer.mergeFeatureVectors(intBaseFeatVectors, reaminingEntriesBaseBoolFv),
        #         reaminingEntriesDerivBoolFv, call, lookAhead)
        logger.info("Predicate: " + call + " for k = " + str(k) + " : " + str(f) + "\n")
        print("Predicate chosen at " + call + " : " + str(f))
        # featureSplitRemoved == f
        (newBoolBaseFeat, newDeriveBaseFeat, featureSplitRemoved) = \
            self.removeFeatureFromFeaturelist(remainingBaseBoolFeat,
                                              remainingDerivBoolFeat, [idx])
        # if the predicate to split on is in derivedFeatures, then add it to the exclude list;
        if len(remainingBaseBoolFeat) == len(newBoolBaseFeat):
            exclude = exclude + (f, )
        else:
            # if the predicate to split on is in baseFeatures, then update the
            # posBaseFv and negBaseFv feature vectors
            posBaseFv = self.removeFeatureEntryInBaseFv(posBaseFv, [idx + lookAhead])
            negBaseFv = self.removeFeatureEntryInBaseFv(negBaseFv, [idx + lookAhead])
        posPost = self.learn3(k - 1, intBaseFeat, newBoolBaseFeat, posBaseFv,
                              exclude, call + " Left")  # recursive call
        logger.info(call + " Left: " + " for k = " + str(k) + " : " + posPost.toInfix())
        print(call + " Left: " + " for k = " + str(k) + " : " + posPost.toInfix())
        negPost = self.learn3(k - 1, intBaseFeat, newBoolBaseFeat, negBaseFv,
                              exclude, call + " Right")  # recursive call
        logger.info(call + " Right: " + " for k = " + str(k) + " : " + negPost.toInfix())
        print(call + " Right: " + " for k = " + str(k) + " : " + negPost.toInfix())
        disjunctivePost = And(allTrueFormula.formulaZ3,
                              Or(And(posPost.formulaZ3, f.varZ3),
                                 And(negPost.formulaZ3, Not(f.varZ3))))
        precisPost = PrecisFormula(disjunctivePost)
        return precisPost
def learnPostUpToK(p, PUTName, outputFile, k, destinationOfTests):
    sygusExecutable = "Precis/Learners/EnumerativeSolver/bin/starexec_run_Default"
    tempLocation = "tempLocation"
    sygusFileName = "postcondition.sl"
    # assumes MSBuild.exe is in the path
    inst = Instrumenter("MSBuild.exe",
                        "./Instrumenter/Instrumenter/bin/Debug/Instrumenter.exe")
    p.ExtractObservers(PUTName, outputFile)
    # returns list of base features
    baseFeatures: Tuple[PrecisFeature] = p.ReadObserversFromFile(outputFile)
    allPostconditions = []
    allBaseFeatureVectors = []
    synthesizer = FeatureSynthesis(sygusExecutable, tempLocation, sygusFileName)
    currentPostcondition = PrecisFormula(BoolVal(False))
    inst.instrumentPost(p, currentPostcondition, PUTName)
    rounds = 1
    totalPexTime = 0.0
    totalLearningTime = 0.0
    while True:
        print("starting round: " + str(rounds))
        pex = Pex()
        startTimePex = time.time()
        baseFeatureVectors: List[FeatureVector] = pex.RunTeacher(p, PUTName, baseFeatures)
        pexTime = time.time() - startTimePex
        totalPexTime += pexTime
        print("pex time: " + str(totalPexTime))
        print("learning time: " + str(totalLearningTime))
        evaluation.copyTestFilesToEvaluationDir(pex.testsLocation, destinationOfTests, rounds)
        # sys.exit(0)
        allBaseFeatureVectors.extend(baseFeatureVectors)
        if all(baseFeatureVectors[i].testLabel for i in range(0, len(baseFeatureVectors))):
            print("found it\n************************\n")
            simplifiedPost = PrecisFormula(currentPostcondition.precisSimplify())
            return currentPostcondition, simplifiedPost, rounds, totalPexTime, \
                totalLearningTime, len(allBaseFeatureVectors)
        # # Shambo: adding negation checking
        # negPost = PrecisFormula(Not(currentPostcondition.formulaZ3))
        # inst = Instrumenter("MSBuild.exe",
        #                     "./Instrumenter/Instrumenter/bin/Debug/Instrumenter.exe")
        # inst.instrumentPost(p, negPost, PUTName)
        # negBaseFeatureVectors: List[FeatureVector] = pex.RunTeacher(p, PUTName, baseFeatures)
        # if len(negBaseFeatureVectors) == 0:
        #     print("truly found it")
        #     simplifiedPost = PrecisFormula(currentPostcondition.precisSimplify())
        #     return currentPostcondition, simplifiedPost, rounds, totalPexTime, totalLearningTime, len(allBaseFeatureVectors)
        # else:
        #     print("fake found it")
        #     for i in range(0, len(negBaseFeatureVectors)):
        #         negBaseFeatureVectors[i].testLabel = "True"
        #     baseFeatureVectors.extend(negBaseFeatureVectors)
        #     allBaseFeatureVectors.extend(negBaseFeatureVectors)
        if rounds == 16:
            print("BAD!")
            simplifiedPost = PrecisFormula(currentPostcondition.precisSimplify())
            return currentPostcondition, simplifiedPost, rounds, totalPexTime, \
                totalLearningTime, len(allBaseFeatureVectors)
        if len(baseFeatureVectors) == 0:
            logger1.info(
                "process TERMINATED with TG not generating any test! DEBUG ME!\n")
            simplifiedPost = PrecisFormula(currentPostcondition.precisSimplify())
            return currentPostcondition, simplifiedPost, rounds, totalPexTime, \
                totalLearningTime, len(allBaseFeatureVectors)
        intBaseFeatures, boolBaseFeatures = Featurizer.getIntAndBoolFeatures(baseFeatures)
        disLearner = DisjunctiveLearner(synthesizer)
        logger1.info("#############\nRound: " + str(rounds) + "\n")
        # Learning function
        startLearningTime = time.time()
        postcondition = disLearner.learn3(k, intBaseFeatures, boolBaseFeatures,
                                          allBaseFeatureVectors, (), "root")
        learningTime = time.time() - startLearningTime
        totalLearningTime += learningTime
        logger1.info("unsimplified post:\n" + postcondition.toInfix() + "\n")
        print("unsimplified post " + postcondition.toInfix())
        print("simplified post " + PrecisFormula(postcondition.precisSimplify()).toInfix())
        # Shambo
        # Always insert the simplified formula
        postcondition = PrecisFormula(postcondition.precisSimplify())
        # assumes MSBuild.exe is in the path
        inst = Instrumenter("MSBuild.exe",
                            "./Instrumenter/Instrumenter/bin/Debug/Instrumenter.exe")
        inst.instrumentPost(p, postcondition, PUTName)
        currentPostcondition = PrecisFormula(postcondition.formulaZ3)
        allPostconditions.append(postcondition.formulaZ3)
        rounds = rounds + 1
import cv2
import skimage.io as skio

from featurizer import Featurizer, convert_lab
from colorizer import Colorizer


def get_grayscale(image):
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    return cv2.merge((gray, gray, gray))


if __name__ == "__main__":
    training_images = ["images/grass1.jpg", "images/grass2.jpg"]
    test_image = skio.imread("images/grass3.jpg")

    # getting the right featurizer
    f = Featurizer(training_images)
    f.compute_k_means()
    print "Getting features..."
    f.compute_features()
    gray_test = get_grayscale(test_image)

    # getting the right colorizer
    colorizer = Colorizer(f)
    print "Starting Training of SVMs..."
    colorizer.train()

    # running the experiment
    print "Colorizing Image..."
    colored_image = colorizer.color_image(gray_test)
def main():
    featurizer = Featurizer()
    (options, args) = parser.parse_args()
    if not options.filename and not options.dir:
        parser.error('missing -f or -d option')
    if options.filename and options.dir:
        parser.error('please choose only one option to run feature extraction')
    if options.out:
        if not os.path.isdir(options.out):
            parser.error('invalid output path')
        else:
            featurizer.set_out_dir(options.out)
    if options.filename:
        if not os.path.isfile(options.filename):
            parser.error("" + options.filename + " is not a file")
        else:
            featurizer.set_in_path(options.filename)
    if options.dir:
        if not os.path.isdir(options.dir):
            parser.error("" + options.dir + " is not a directory")
        else:
            featurizer.set_in_path(options.dir)
    featurizer.set_restart(options.should_restart)
    featurizer.prepare()
    featurizer.run()
def chooseFeatureImplication(self, alwaysTrueFormula, intBaseFeatures, baseBoolFeatures,
                             derivBoolFeatures, baseFv, derivFv, lookAhead, call):
    houdini = Houdini()
    fvPos = list()
    fvPosDeriv = list()
    fvNeg = list()
    fvNegDeriv = list()
    irrelevantFeatures = ()
    irrelevantIndices = []
    boolFeatures = baseBoolFeatures + derivBoolFeatures
    for idx in range(0, len(boolFeatures)):
        # region pruneFunction
        feature = boolFeatures[idx]
        if is_int(feature.varZ3):
            assert (False)
        (fvPos, fvPosDeriv, fvNeg, fvNegDeriv) = self.splitSamplesImplication(
            feature, idx + lookAhead, baseFv, derivFv)
        # if len(fvPos) == 0 or len(fvNeg) == 0:
        #     irrelevantIndices.append(idx)
        #     continue
        (posIntBaseFv, posBoolBaseFv) = Featurizer.getBoolAndIntFeatureVectors(
            intBaseFeatures, baseBoolFeatures, fvPos)
        (negIntBaseFv, negBoolBaseFv) = Featurizer.getBoolAndIntFeatureVectors(
            intBaseFeatures, baseBoolFeatures, fvNeg)
        posFvs = Featurizer.mergeFeatureVectors(posBoolBaseFv, fvPosDeriv)
        negFvs = Featurizer.mergeFeatureVectors(negBoolBaseFv, fvNegDeriv)
        (posAllTrueFormula, posIndicesAllwaysTrue) = houdini.learn2(
            boolFeatures, posFvs,
            call + " from implication check-- split from pred " + str(feature))
        (negAllTrueFormula, negIndicesAllwaysTrue) = houdini.learn2(
            boolFeatures, negFvs,
            call + " from implication check-- split from pred " + str(feature))
        if len(fvPos) != 0 and len(fvNeg) != 0:
            logger.info(call + " implication check-- split pred: " + str(feature))
            logger.info(call + " implication check-- featurePos: " +
                        str(posAllTrueFormula.toInfix()))
            logger.info(call + " implication check-- featureNeg: " +
                        str(negAllTrueFormula.toInfix()) + "\n")
        # disjunct z3 type
        disjunct = Or(And(posAllTrueFormula.formulaZ3, feature.varZ3),
                      And(negAllTrueFormula.formulaZ3, Not(feature.varZ3)))
        implication = Implies(alwaysTrueFormula.formulaZ3, disjunct)
        solver = Solver()
        # check (not (postK0 => postK1)) is unsat
        solver.add(Not(implication))
        check = solver.check()
        # splitting on `feature` does not add new information:
        # alwaysTrueFormula -> (OR(f and posSplit, ~f and negSplit)) is valid
        if str(check) == 'unsat':
            # collect irrelevant features and indices to remove
            irrelevantFeatures = irrelevantFeatures + (feature, )
            irrelevantIndices.append(idx)
        # splitting adds new information
        elif str(check) == 'sat':
            pass
        else:
            # solver does not know
            assert (False)
        # endregion
    copyBaseIntFeat = tuple(intBaseFeatures)
    copyBaseBoolFeat = tuple(baseBoolFeatures)
    copyDerivFeat = tuple(derivBoolFeatures)
    # (remainingBaseBoolFeat, remainingDerivBoolFeat, featuresRemoved) = \
    #     self.removeFeatureFromFeaturelist(boolBaseFeat, derivFeats, indicesAllwaysTrue)
    (intBaseFv, boolBaseFv) = Featurizer.getBoolAndIntFeatureVectors(
        copyBaseIntFeat, copyBaseBoolFeat, baseFv)
    (copyRemainingBaseBoolFeat, copyRemainingDerivBoolFeat, featuresRemoved) = \
        self.removeFeatureFromFeaturelist(copyBaseBoolFeat, copyDerivFeat, irrelevantIndices)
    # boolFvs = Featurizer.mergeFeatureVectors(boolBaseFv, derivFv)
    (copyReaminingEntriesBaseBoolFv, reaminingEntriesDerivBoolFv) = \
        self.removeFeatureEntryInFeatureVectors(boolBaseFv, derivFv, irrelevantIndices)
    # Debug check
    if (len(copyRemainingBaseBoolFeat) + len(copyRemainingDerivBoolFeat)) == 0:
        return (None, -1, None, None, None, None)
    skipAhead = len(intBaseFv[0])
    newBaseFv = Featurizer.mergeFeatureVectors(intBaseFv, copyReaminingEntriesBaseBoolFv)
    (f, idx, posBaseFv, negBaseFv) = self.chooseFeature2(
        copyRemainingBaseBoolFeat + copyRemainingDerivBoolFeat, newBaseFv,
        reaminingEntriesDerivBoolFv, call, skipAhead)
    # print(irrelevantIndices)
    # intBaseFeatures = copyBaseIntFeat
    # baseBoolFeatures = copyRemainingBaseBoolFeat
    # derivBoolFeatures = copyDerivFeat
    # baseFv = newBaseFv
    # derivFv = reaminingEntriesDerivBoolFv
    return (f, idx, posBaseFv, negBaseFv, copyRemainingBaseBoolFeat,
            copyRemainingDerivBoolFeat)
optparser = optparse.OptionParser()
optparser.add_option("-d", "--dir", dest="dir", default="data/",
                     help="Root data directory")
optparser.add_option("-f", "--features", dest="features", default='light',
                     help="Comma separated list of feature groups to use")
optparser.add_option("-v", "--save", dest="save", default=False, action="store_true",
                     help="Train a model and save it to the specified file.")
optparser.add_option("-m", "--modelfile", dest="modelfile",
                     help="File to read model from/write model to.")
optparser.add_option("-p", "--predict", dest="predict", default=False, action="store_true",
                     help="Load a saved model and use it on unseen data.")
optparser.add_option("-x", "--feature_selection", dest="feature_selection", default=False,
                     action="store_true",
                     help="Print performance of feature groups one at a time.")
optparser.add_option("-e", "--extra_train", dest="extra_train", default=None, type="string",
                     help="Add extra (possibly out of domain) data to training")
optparser.add_option("-a", "--ablation", dest="ablation", default=False, action="store_true",
                     help="Run ablation analysis by feature group.")
optparser.add_option("-r", "--print_best_features", dest="print_best_features", default=False,
                     action="store_true", help="Print features with highest weights.")
(opts, _) = optparser.parse_args()

label_file = "%s/labels" % opts.dir
ftzr = Featurizer(use=opts.features)
if opts.predict:
    bundle = pickle.load(open(opts.modelfile))
    clf = bundle['clf']
    dv = bundle['dv']
    _, X, _, nm = get_data(label_file, ftzr, dv, encodeY=False)
else:
    y, X, dv, nm = get_data(label_file, ftzr)
yplus = None
Xplus = None
nmplus = None
if opts.extra_train is not None:
    for dr in opts.extra_train.split(','):
        label_file = "%s/labels" % dr
fragment = "your head look like a ball however hubert has a head which is a polygon this difference derives from the fact that hubert is gamma perturbation stable" fragment = fragment.split(" ") all_grams = {} counter = 0 for word in fragment: all_grams[word] = (counter, 0) counter += 1 all_grams["your head"] = (counter, 0) counter += 1 all_grams["a ball"] = (counter, 0) counter += 1 all_grams["derives from"] = (counter, 0) counter += 1 featurizer = Featurizer(Settings()) result = {} process(fragment, featurizer, result) print result ### DELETED """ calc_cooccurence(fragment, all_grams) is a function that will calculate the coocccurence matrix for a given fragment and dumps the partial result into a dictionary. These dictionaries from different fragments should be combined to generate the final result. fragment: A list of strings/tokens (not integer labels) representing the raw text. Note - Windows crossing fragments will be ignored; to make the cooccurence result more precise, fragment should be a relatively longer list all_grams: A dictionary of n-grams (including 1-gram/word) that we will care about. This dict