def train(self, domain, class_data):
    document = xml.dom.minidom.Document()
    node = document.createElement('Tree')
    document.appendChild(node)
    d = Trainer(domain, class_data, document)
    partial_atts = d.attributes
    partial_atts.remove("Id")
    partial_atts.remove("Vote")
    print partial_atts
    if len(self.restr) > 0:
        d.rem_restrictions(self.restr)
    d.c45(d.data, d.attributes, node, 0)
    self.classifier = Classifier()
    if len(class_data.category) > 0:
        self.classifier.has_category = True
    for row in d.data:
        self.classifier.classify(document.documentElement, row, class_data.attributes)
    self.classifier.print_stats()
def evaluate_embeddings(embeddings):
    X, Y = read_node_label('../data/wiki/wiki_labels.txt')
    tr_frac = 0.8
    print("Training classifier using {:.2f}% nodes...".format(tr_frac * 100))
    clf = Classifier(embeddings=embeddings, clf=LogisticRegression())
    clf.split_train_evaluate(X, Y, tr_frac)
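Many snippets in this collection call split_train_evaluate on an embeddings or vectors dict. The actual wrapper is not shown here; the sketch below is an illustrative stand-in, assuming the embeddings map node ids to vectors and Y holds one label per node in X:

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

def split_train_evaluate_sketch(embeddings, X, Y, train_frac):
    # Illustrative stand-in for the Classifier used above, not its actual code.
    feats = np.array([embeddings[x] for x in X])
    x_tr, x_te, y_tr, y_te = train_test_split(feats, Y, train_size=train_frac)
    pred = LogisticRegression().fit(x_tr, y_tr).predict(x_te)
    return {'micro': f1_score(y_te, pred, average='micro'),
            'macro': f1_score(y_te, pred, average='macro')}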
def classify(vectors, args):
    if not os.path.isfile(args.classifydir + '_labels.txt'):
        return defaultdict(lambda: 0)
    X, Y = read_node_label(args.classifydir + '_labels.txt')
    clf = Classifier(vectors=vectors,
                     clf=LogisticRegression(solver="lbfgs", max_iter=4000))
    # Use the canonical train/val/test split shipped with the dataset.
    features, labels, graph, idx_train, idx_val, idx_test = load_dataset(
        str(args.classifydir.split("/")[-1]))
    idx_train = list(idx_train)
    idx_test = list(idx_test)
    print("TRAINING SIZE", len(idx_train), "VALIDATION SIZE", len(idx_val),
          "TESTING SIZE: ", len(idx_test))
    scores = clf.split_train_evaluate_idx(X, Y, idx_train, idx_val)
    # scores = clf.split_train_evaluate(X, Y, args.train_percent)  # ratio-based alternative
    test_scores = clf.split_train_evaluate_idx(X, Y, idx_train, idx_test)
    test_x.append(test_scores['macro'])  # test_x: module-level result accumulator
    print("micro:", test_scores['micro'], "macro:", test_scores['macro'])
    return scores
def main(args):
    t1 = time.time()
    g = Graph()
    print("Reading...")
    if args.graph_format == 'adjlist':
        g.read_adjlist(filename=args.input)
    elif args.graph_format == 'edgelist':
        g.read_edgelist(filename=args.input, weighted=args.weighted,
                        directed=args.directed)
    g.read_node_label(args.label_file)
    g.read_node_features(args.feature_file)
    model = tadw.TADW(graph=g, dim=args.representation_size, lamb=args.lamb)
    t2 = time.time()
    print(t2 - t1)
    print("Saving embeddings...")
    model.save_embeddings(args.output)
    vectors = model.vectors
    X, Y = read_node_label(args.label_file)
    print("Training classifier using {:.2f}% nodes...".format(args.clf_ratio * 100))
    clf = Classifier(vectors=vectors, clf=LogisticRegression())
    clf.split_train_evaluate(X, Y, args.clf_ratio)
def test_1NN(digitsdat, selected, all_test_m):
    for testm in all_test_m:
        classifier = Classifier()
        classifier.build_model(digitsdat.X[selected[0:testm], :],
                               digitsdat.y[selected[0:testm]])
        print("m=%d error=%f" % (testm,
                                 classifier.classify(digitsdat.testX, digitsdat.testy)))
def classify_singlechar():
    singlechardir = r'D:\projects\python\captchabreak\data\icbc\icbc1213_output'
    outputdir = r'D:\projects\python\captchabreak\data\icbc\icbc_c_output'
    try:
        os.mkdir(outputdir)
    except OSError:
        pass  # output directory already exists
    traindata = r'D:\projects\python\captchabreak\data\icbc\icbcaudit'
    cc = Classifier(traindata, class_sample_num=20, size=20)
    cc.train()
    count = 0
    for subdir in cc.classify_map:
        try:
            os.mkdir(os.path.join(outputdir, subdir))
        except OSError:
            pass
    for root, dirs, files in os.walk(singlechardir):
        for file in files:
            path = os.path.join(root, file)
            img = cv.LoadImage(path, 0)
            response = cc.classify_single_char(img)
            outputfile = os.path.join(outputdir, response, 'icbc_%d.jpg' % count)
            cv.SaveImage(outputfile, img)
            count += 1
def node_classification(session, bs, seqne, sequences, seq_len, node_n,
                        samp_idx, label, ratio):
    enc_sum_dict = {}
    node_cnt = {}
    s_idx, e_idx = 0, bs
    # Encode the sequences batch by batch, accumulating per-node hidden states.
    while e_idx < len(sequences):
        batch_enc = session.run(seqne.encoder_output,
                                feed_dict={seqne.input_seqs: sequences[s_idx: e_idx],
                                           seqne.dropout: 0,
                                           seqne.keep_prob: 0})
        enc_sum_dict, node_cnt = reduce_seq2seq_hidden_add(
            enc_sum_dict, node_cnt, sequences, batch_enc.astype('float32'),
            seq_len, s_idx)
        s_idx, e_idx = e_idx, e_idx + bs
    # Handle the final, possibly smaller batch.
    if s_idx < len(sequences):
        batch_enc = session.run(seqne.encoder_output,
                                feed_dict={seqne.input_seqs: sequences[s_idx: len(sequences)],
                                           seqne.dropout: 0,
                                           seqne.keep_prob: 0})
        enc_sum_dict, node_cnt = reduce_seq2seq_hidden_add(
            enc_sum_dict, node_cnt, sequences, batch_enc.astype('float32'),
            seq_len, s_idx)
    node_enc_mean = reduce_seq2seq_hidden_avg(sum_dict=enc_sum_dict,
                                              count_dict=node_cnt,
                                              node_num=node_n)
    lr = Classifier(vectors=node_enc_mean, clf=LogisticRegression())
    f1_micro, f1_macro = lr.split_train_evaluate(samp_idx, label, ratio)
    return f1_micro
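reduce_seq2seq_hidden_add and reduce_seq2seq_hidden_avg are defined elsewhere in that project; a plausible stand-in for the averaging step, assuming both dicts are keyed by integer node id and every id in range(node_num) was seen, would be:

import numpy as np

def reduce_seq2seq_hidden_avg_sketch(sum_dict, count_dict, node_num):
    # Hypothetical helper: average each node's accumulated encoder states
    # by the number of times the node occurred in the batched sequences.
    return np.array([sum_dict[n] / count_dict[n] for n in range(node_num)])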
def initialize(self, conf, ctx):
    self.counter = 0
    self.pid = os.getpid()
    self.total = 0
    self.classifier = Classifier()
    self.directory = str(os.getcwd()) + "/Tweet_Images"
    if not os.path.exists(self.directory):
        os.makedirs(self.directory)
def __init__(self, **kargs):
    Classifier.__init__(self, tweets=kargs["tweets"], instances=kargs["instances"],
                        model=kargs["model"], keys=kargs["keys"],
                        selection=kargs["selection"])
    self.mode = "ngram"
    self.include_word = True
    self.inclued_pos = True
    self.id = "repeat{0},s:{1}".format(self.num_items, self.selection)
    self.repeat_dict = self.get_all_repeats()
    self.prepare_features()
def classify(vectors, args):
    if not os.path.isfile(args.classifydir + '_labels.txt'):
        return defaultdict(lambda: 0)
    X, Y = read_node_label(args.classifydir + '_labels.txt')
    print("Training classifier using {:.2f}% nodes...".format(args.train_percent * 100))
    clf = Classifier(vectors=vectors,
                     clf=LogisticRegression(solver="lbfgs", max_iter=4000))
    scores = clf.split_train_evaluate(X, Y, args.train_percent)
    return scores
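A hypothetical driver for the function above; the attribute names match what classify() reads, while the defaults and the origin of vectors are assumptions:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--classifydir', default='../data/cora/cora')  # placeholder path
parser.add_argument('--train-percent', dest='train_percent', type=float, default=0.5)
args = parser.parse_args()
# vectors would come from a trained embedding model, e.g. model.vectors.
scores = classify(vectors, args)
print(scores)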
class TagCountBolt(Bolt):
    outputs = ['cls', 'tag', 'date', 'hour']

    def initialize(self, conf, ctx):
        self.counter = 0
        self.pid = os.getpid()
        self.total = 0
        self.classifier = Classifier()
        self.directory = str(os.getcwd()) + "/Tweet_Images"
        if not os.path.exists(self.directory):
            os.makedirs(self.directory)

    def _increment(self, word, inc_by):
        # NOTE: this assumes self.counter is a mapping, while initialize()
        # sets it to an int; the two uses are inconsistent in the original.
        self.counter[word] += inc_by
        self.total += inc_by

    def process(self, tup):
        data = json.loads(tup.values[0].encode('utf-8'))
        self.logger.info(data)
        if 'img_url' in data:
            path = "{}/{}.jpg".format(self.directory, self.counter)
            try:
                urllib.urlretrieve(data['img_url'], path)
                self.counter = self.counter + 1
                self.classifier.load_image(path)
                predicted_class = self.classifier.classify()
                if len(data['hash']) > 0:
                    tags = [str(li['text']) for li in data['hash']
                            if li['text'][0:1] != "\\"]
                    now = datetime.datetime.now()
                    now_date = "{:04}-{:02}-{:02}".format(now.year, now.month, now.day)
                    for cls in predicted_class:
                        if len(tags) > 0:
                            for tag in tags:
                                self.emit([cls, tag, now_date, str(now.hour)])
                os.remove(path)
            except (KeyboardInterrupt, Exception):
                self.logger.info(Exception)
        else:
            self.logger.info("NO IMG URL!!!")
        if self.counter % 10 == 0:
            self.logger.info("Processed [{:,}] tweets".format(self.counter))
def __init__(self, **kargs):
    Classifier.__init__(self, tagged_tweets=kargs["tagged_tweets"],
                        instances=kargs["instances"], model=kargs["model"],
                        keys=kargs["keys"])
    self.mode = kargs["mode"]
    self.inclued_pos = kargs["pos"]
    self.include_word = kargs["word"]
    self.id = "context{0},m:{1},w:{2},t:{3}".format(
        self.num_items, self.mode, self.include_word, self.inclued_pos)
    self.debug = False
    self.ranked_ngrams = kargs["ranked_ngrams"]
    self.num_ngrams = len(self.ranked_ngrams)
    self.context_dict = self.build_ct_dict()
    self.prepare_features()
def test_1NN(digitsdat, selected, all_test_m):
    for testm in all_test_m:
        classifier = Classifier()
        classifier.build_model(digitsdat.X[selected[0:testm], :],
                               digitsdat.y[selected[0:testm]])
        error = classifier.classify(digitsdat.testX, digitsdat.testy)
        print("m=%d error=%f" % (testm, error))
        # Record results in the module-level lists M and e for later plotting.
        M.append(testm)
        e.append(error)
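Because the results land in the module-level lists M and e, a quick error-versus-m plot is a natural follow-up; matplotlib here is an assumption, not part of the snippet:

import matplotlib.pyplot as plt

# Assumes test_1NN(...) has been called and filled M (sizes) and e (errors).
plt.plot(M, e, marker='o')
plt.xlabel('m (number of training examples)')
plt.ylabel('1-NN test error')
plt.show()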
def main(args):
    node_embeddings = load_embeddings(args.embedding_file)
    if args.label_file:
        labels = read_node_label(args.label_file)
    if args.modularity:
        print("Modularity")
        modularity(args, node_embeddings, args.min_k, args.max_k)
    if args.reconstruction:
        print("Graph reconstruction")
        reconstr(args, node_embeddings, args.k_nbrs)
    if args.clustering:
        print("Clustering")
        clustering(node_embeddings, labels, args.exp_times)
    if args.link_prediction:
        print("Link prediction")
        link_prediction(args.input, node_embeddings)
    if args.classification:
        X = list(labels.keys())
        Y = list(labels.values())
        print("Node classification")
        clf_ratio_list = args.clf_ratio.strip().split(',')
        result_list = {}
        train_ratio = np.asarray(range(1, 10)) * .1
        for clf_ratio in train_ratio:  # clf_ratio_list:
            result_per_test = []
            for ti in range(args.exp_times):
                clf = Classifier(vectors=node_embeddings, clf=LogisticRegression())
                myresult = clf.split_train_evaluate(X, Y, float(clf_ratio))
                result_per_test.append(myresult)
            result_list[clf_ratio] = result_per_test
        print('-------------------')
        for clf_ratio in train_ratio:
            print('Train percent:', clf_ratio)
            results = result_list[clf_ratio]
            for index, result in enumerate(results):
                print('Shuffle #%d: ' % (index + 1), result)
            avg_score = defaultdict(float)
            for score_dict in results:
                for metric, score in score_dict.items():
                    avg_score[metric] += score
            for metric in avg_score:
                avg_score[metric] /= len(results)
            print('Average score:', dict(avg_score))
            print('-------------------')
def __init__(self, **kargs):
    Classifier.__init__(self, tweets=kargs["tweets"], instances=kargs["instances"],
                        model=kargs["model"], keys=kargs["keys"],
                        selection=kargs["selection"])
    self.mode = kargs["mode"]
    self.inclued_pos = kargs["pos"]
    self.include_word = kargs["word"]
    self.rank = kargs['rank']
    self.id = "{1}{0},w:{2},t:{3},s:{4},r:{5}".format(
        self.num_items, self.mode, self.include_word, self.inclued_pos,
        self.selection, self.rank)
    self.debug = False
    self.ngram_dict = self.get_ranked_ngrams()
    self.ranked_ngrams = sorted(self.ngram_dict, key=lambda x: self.ngram_dict[x],
                                reverse=True)
    self.ngram_dict = {}
    self.num_ngrams = len(self.ranked_ngrams)
    self.prepare_features()
def __init__(self, thread_name, event):
    super(Controller, self).__init__()
    self.name = thread_name
    self.threadEvent = event
    self.logger_info = utils.initlog('Console', 'Console.log')
    self.schedule = sched.scheduler(time.time, time.sleep)
    self.profiles_name, profiles = utils.load_profiles('profiles')
    self.related_tweets = [[] for _ in range(len(profiles))]
    self.pushed_tweets = [[] for _ in range(len(profiles))]
    self.pushed_tweets_ids = set([])
    self.related_tweets_hash = set([])
    self.classifier = Classifier()
    self.ranker = self.load_ranker()
    self.pusher = Pusher()
def node_classification(embeddings, label_path, name, size):
    X, Y = read_node_label(embeddings, label_path)
    f_c = open('results/%s_classification_%d.txt' % (name, size), 'w')
    all_ratio = []
    for tr_frac in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
        print("Training classifier using {:.2f}% nodes...".format(tr_frac * 100))
        clf = Classifier(embeddings=embeddings, clf=LogisticRegression(), name=name)
        results = clf.split_train_evaluate(X, Y, tr_frac)
        avg = 'macro'
        f_c.write(name + ' train percentage: ' + str(tr_frac) +
                  ' F1-' + avg + ' ' + str('%0.5f' % results[avg]))
        all_ratio.append(results[avg])
        f_c.write('\n')
def prepare_data(filename):
    """
    Parse and classify text from the file with the given filename to prepare
    the data needed to generate the report.
    """
    parser = Parser()
    classifier = Classifier()
    class_duration = {  # Duration accumulated per class
        'W': timedelta(hours=0, minutes=0),
        'R': timedelta(hours=0, minutes=0),
        'N': timedelta(hours=0, minutes=0),
    }
    action_durations = {}  # Duration accumulated per action
    activities = parser.parse_file(filename)
    for activity in activities:
        duration = activity['duration']
        actions = activity['actions']
        divided_duration = duration / len(actions)
        for action in actions:
            classification = classifier.classify_action(action)
            class_duration[classification] += divided_duration
            if action in action_durations:
                action_durations[action] += divided_duration
            else:
                action_durations[action] = divided_duration
    sorted_action_durations = sorted(action_durations.items(),
                                     key=lambda tup: tup[1], reverse=True)
    # Add structure to the data and return it.
    plot_data = {
        'summary_pie_chart': {
            'class_duration': class_duration,
        },
    }
    report_data = {
        'sorted_action_durations': sorted_action_durations,
        'class_duration': class_duration,
        'timedelta_to_string': timedelta_to_string,
    }
    return plot_data, report_data
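A hypothetical call showing how the returned structures might be consumed; 'log.txt' is a placeholder filename, not a file from the project:

# Hypothetical usage of prepare_data.
plot_data, report_data = prepare_data('log.txt')
to_str = report_data['timedelta_to_string']
for action, duration in report_data['sorted_action_durations'][:5]:
    print(action, to_str(duration))
print(plot_data['summary_pie_chart']['class_duration'])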
def evaluate_embeddings_with_split(vectors, X_train, Y_train, X_test, Y_test,
                                   Y_all, testnum=10):
    print("Training classifier with the pre-defined split setting...")
    # clf = Classifier(vectors=vectors, clf=LogisticRegression())
    clf = Classifier(vectors=vectors, clf=SVC(probability=True))
    micro_list = []
    macro_list = []
    for i in range(testnum):
        res = clf.evaluate_with_fixed_split(X_train, Y_train, X_test, Y_test, Y_all)
        micro_list.append(res['micro'])
        macro_list.append(res['macro'])
    return mean(micro_list), mean(macro_list)
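The commented-out line suggests SVC was swapped in for LogisticRegression; probability=True matters because wrappers like this typically rank labels by predicted probability when evaluating. A small scikit-learn illustration of that ranking step (not the project's code):

import numpy as np

def top_k_labels(clf, X, k):
    # Rank classes by predicted probability and keep the k most likely per
    # sample; works with any fitted scikit-learn classifier that exposes
    # predict_proba (hence SVC(probability=True) above).
    probs = clf.predict_proba(X)
    order = np.argsort(-probs, axis=1)[:, :k]
    return [[clf.classes_[j] for j in row] for row in order]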
def __init__(self, graph, lr=.001, rep_size=128, batch_size=1000, epoch=10,
             negative_ratio=5, order=3, label_file=None, clf_ratio=0.5,
             auto_stop=True):
    self.rep_size = rep_size
    self.order = order
    self.best_result = 0
    self.vectors = {}
    self.model = _LINE(graph, lr, rep_size, batch_size, negative_ratio,
                       order=self.order)
    for i in range(epoch):
        self.model.train_one_epoch()
        if label_file:
            # get_embeddings() is expected to stash the previous epoch's
            # vectors in self.last_vectors for the auto-stop rollback below.
            self.get_embeddings()
            X, Y = read_node_label(label_file)
            print "Training classifier using {:.2f}% nodes...".format(clf_ratio * 100)
            clf = Classifier(vectors=self.vectors, clf=LogisticRegression())
            result = clf.split_train_evaluate(X, Y, clf_ratio)
            if result['micro'] < self.best_result and auto_stop:
                self.vectors = self.last_vectors
                print 'Auto stop!'
                return
            elif result['micro'] > self.best_result:
                self.best_result = result['micro']
    self.get_embeddings()
def main():
    traindata = r'D:\projects\python\captchabreak\data\doubanaudit'
    douban = Classifier(traindata, 5)
    douban.train()
    testdata = r'D:\projects\python\captchabreak\data\doubanseg'
    outputdir = r'D:\projects\python\captchabreak\data\doubanoutput'
    count = 0
    for subdir in douban.classify_map:
        try:
            os.mkdir(os.path.join(outputdir, subdir))
        except OSError:
            pass  # directory already exists
    for root, dirs, files in os.walk(testdata):
        for file in files:
            path = os.path.join(root, file)
            img = cv.LoadImage(path, 0)
            response = douban.classify_single_char(img)
            outputfile = os.path.join(outputdir, response, 'douban_%d.jpg' % count)
            cv.SaveImage(outputfile, img)
            count += 1
def main(args): print("xnetmf", "begin...") t1 = time.time() print("Reading...") nx_graph = nx.read_edgelist(agrs.input, nodetype=int, comments="%") adj_matrix = nx.adjacency_matrix(nx_graph).todense() print(adj_matrix) g = Graph(adj_matrix) rep_method = RepMethod( max_layer=2 ) # Learn representations with xNetMF. Can adjust parameters (e.g. as in REGAL) representations = src.xnetmf.get_representations(g, rep_method) print(representations) print(representations.shape) print("TAWD", "begin...") print("Reading...") if args.graph_format == 'adjlist': g.read_adjlist(filename=args.input) elif args.graph_format == 'edgelist': g.read_edgelist(filename=args.input, weighted=args.weighted, directed=args.directed) g.read_node_label(args.label_file) g.read_node_features(args.feature_file) model = xtadw.TADW(graph=g, dim=args.representation_size, lamb=args.lamb) t2 = time.time() print(t2 - t1) print("Saving embeddings...") model.save_embeddings(args.output) vectors = model.vectors X, Y = read_node_label(args.label_file) print("Training classifier using {:.2f}% nodes...".format(args.clf_ratio * 100)) clf = Classifier(vectors=vectors, clf=LogisticRegression()) clf.split_train_evaluate(X, Y, args.clf_ratio)
def __init__(self, thread_name, event):
    super(Controller, self).__init__()
    self.name = thread_name
    self.threadEvent = event
    self.logger_info = utils.initlog('Console', 'Console.log')
    self.schedule = sched.scheduler(time.time, time.sleep)
    self.profiles_name, profiles = utils.load_profiles('profiles')
    self.related_tweets = [[] for _ in range(len(profiles))]  # the day's related tweets, kept for offline analysis
    self.pushed_tweets = [[] for _ in range(len(profiles))]
    self.pushed_tweets_ids = set([])
    self.related_tweets_hash = set([])
    self.classifier = Classifier()
    self.ranker = self.load_ranker()
    self.pusher = Pusher()
def main(argv):
    # Constants for the analyzer and the classifier
    dataset = 'commit_comments-dump.2015-01-29.json'
    group = 'id'
    model_file = 'model.pickle'

    # Create the analyzer
    analyzer = Analyzer(group)

    # Create the classifier
    algorithm_class = RandomForestRegressor
    algorithm_parameters = {
        'n_estimators': 100,
        'n_jobs': 2,
        'min_samples_split': 10
    }
    classifier = Classifier(group, model_file)
    classifier.create_model(train=True, class_name=algorithm_class,
                            parameters=algorithm_parameters)

    # Compare analyzer output with classifier output and identify differences
    unrecognized_negative = {}
    unrecognized_positive = {}
    predictions = classifier.predict()
    line = 0  # Dataset line
    i = 0  # Prediction ID (+1)
    file = open(dataset, 'rb')
    for data in Utilities.read_json(file, 'id', group):
        line = line + 1
        if line % 1000 == 0:
            print(line)
        if not classifier.filter(data):
            continue
        i = i + 1
        message = data['message']
        score = analyzer.analyze(message)[0]
        if score == 0:
            continue
        diff = predictions[i - 1] - score
        if abs(diff) < 1.0:
            continue
        target = unrecognized_negative if diff < 0 else unrecognized_positive
        target[line] = diff
    result = sorted(unrecognized_positive.items(), key=lambda x: x[1])
    for item in result:
        print("{}: {}: {}".format(item[0], item[1],
                                  linecache.getline(dataset, item[0])[:-1]))
def __init__(self, graph, rep_size=128, batch_size=1000, epoch=10,
             negative_ratio=5, order=3, label_file=None, clf_ratio=0.5,
             auto_save=True):
    self.rep_size = rep_size
    self.order = order
    self.best_result = 0
    self.vectors = {}
    if order == 3:
        # Order 3 trains first- and second-order embeddings side by side.
        self.model1 = _LINE(graph, rep_size / 2, batch_size, negative_ratio, order=1)
        self.model2 = _LINE(graph, rep_size / 2, batch_size, negative_ratio, order=2)
        for i in range(epoch):
            self.model1.train_one_epoch()
            self.model2.train_one_epoch()
            if label_file:
                self.get_embeddings()
                X, Y = read_node_label(label_file)
                print "Training classifier using {:.2f}% nodes...".format(clf_ratio * 100)
                clf = Classifier(vectors=self.vectors, clf=LogisticRegression())
                result = clf.split_train_evaluate(X, Y, clf_ratio)
                if result['macro'] > self.best_result:
                    self.best_result = result['macro']
                    if auto_save:
                        self.best_vector = self.vectors
    else:
        self.model = _LINE(graph, rep_size, batch_size, negative_ratio,
                           order=self.order)
        for i in range(epoch):
            self.model.train_one_epoch()
            if label_file:
                self.get_embeddings()
                X, Y = read_node_label(label_file)
                print "Training classifier using {:.2f}% nodes...".format(clf_ratio * 100)
                clf = Classifier(vectors=self.vectors, clf=LogisticRegression())
                result = clf.split_train_evaluate(X, Y, clf_ratio)
                if result['macro'] > self.best_result:
                    self.best_result = result['macro']
                    if auto_save:
                        self.best_vector = self.vectors
    self.get_embeddings()
    if auto_save and label_file:
        self.vectors = self.best_vector
def main(argv):
    folds = int(argv[0]) if len(argv) > 0 else 5
    filter = argv[1].lower() if len(argv) > 1 else ""
    # Fields to check whether the filter, if given, appears in.
    filter_fields = ['name', 'class_name', 'module']

    # Read the manifest containing algorithm descriptions.
    with open('algorithms.json', 'r') as manifest:
        algorithms = json.load(manifest)

    # Load previous results; start fresh if the file is missing or unreadable.
    try:
        with open('experiment_results.json', 'r') as file:
            results = json.load(file)
    except (IOError, ValueError):
        results = {}

    for algorithm in algorithms:
        # Skip running the algorithm if it is disabled or the filter name does
        # not appear in any of the fields.
        if 'disabled' in algorithm and algorithm['disabled']:
            continue
        if filter and all([filter not in algorithm[k].lower() for k in filter_fields]):
            continue

        # Convert manifest entries to classifier class and parameters.
        class_name = Utilities.get_class(algorithm['module'], algorithm['class_name'])
        dense = algorithm['dense'] if 'dense' in algorithm else False

        # Create all possible combinations of parameters.
        parameter_combinations = itertools.product(*algorithm['parameters'].values())
        single_parameters = [param for param, values in algorithm['parameters'].iteritems()
                             if len(values) == 1]
        string_parameters = [param for param, values in algorithm['parameters'].iteritems()
                             if isinstance(values[0], (str, unicode))]

        for combination in parameter_combinations:
            classifier = Classifier('id')
            # Turn the selected parameter combination back into a dictionary.
            parameters = dict(zip(algorithm['parameters'].keys(), combination))
            # Create the model according to the parameters.
            classifier.create_model(train=False, class_name=class_name,
                                    parameters=parameters, dense=dense)

            Utilities.print_algorithm(algorithm['name'], parameters)
            parameter_string = Utilities.get_parameter_string(
                parameters, single_parameters + string_parameters)

            # Run cross-validation and print results.
            result = classifier.output_cross_validate(folds)
            print('')

            name = algorithm['name']
            for param in string_parameters:
                name += ", %s=%s" % (param, parameters[param])

            # Write the result measurements into the results dictionary.
            if name not in results:
                results[name] = OrderedDict()
            results[name].update({
                parameter_string: {
                    'average': result.mean(),
                    'standard_deviation': result.std()
                }
            })

            # Write intermediate results (back) into a pretty-printed JSON file.
            with open('experiment_results.json', 'w') as file:
                json.dump(results, file, indent=4, separators=(',', ': '))
import io

import numpy as np
import torch
from torch import nn
import torch.utils.model_zoo as model_zoo
import torch.onnx
import onnxruntime
import cv2

from classify import Classifier, get_default_args

if __name__ == '__main__':
    cfg = get_default_args()
    classify = Classifier(cfg)
    torch_model = classify.model
    torch_model.to(torch.device('cpu'))
    x = torch.randn(1, 3, 224, 224)
    frame = cv2.imread('/home/cmf/datasets/spoof/NUAADetectedface/Detectedface/ImposterFace/0001/0001_00_00_01_0.jpg')
    img = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    img = cv2.resize(img, (224, 224))
    img = img.astype(np.float32)
    img = img / 255  # scale pixel values to [0, 1]
    img = np.transpose(img, (2, 0, 1))  # HWC -> CHW for the model input
""" This is a demo about how to use LibLINEAR to do the prediction """ from classify import Classifier # Create an object first, use this object all the time test = Classifier() # When a query comes, Example query query = "Can we talk about Terminator" # Do the prediction and get information plausible = set([5,7,8]) p_label = test.action_info(query,plausible) print p_label
def __init__(self, **kargs):
    Classifier.__init__(self, tweets=kargs["tweets"], instances=kargs["instances"],
                        model=kargs["model"], keys=kargs["keys"],
                        selection=kargs["selection"])
    self.id = "tagcount{0},s:{1}".format(self.num_items, self.selection)
class Controller(threading.Thread):
    def __init__(self, thread_name, event):
        super(Controller, self).__init__()
        self.name = thread_name
        self.threadEvent = event
        self.logger_info = utils.initlog('Console', 'Console.log')
        self.schedule = sched.scheduler(time.time, time.sleep)
        self.profiles_name, profiles = utils.load_profiles('profiles')
        self.related_tweets = [[] for _ in range(len(profiles))]
        self.pushed_tweets = [[] for _ in range(len(profiles))]
        self.pushed_tweets_ids = set([])
        self.related_tweets_hash = set([])
        self.classifier = Classifier()
        self.ranker = self.load_ranker()
        self.pusher = Pusher()

    def load_ranker(self):
        self.logger_info.info('loading ranker...')
        gateway = JavaGateway()
        ranker = gateway.entry_point
        self.logger_info.info('ranker loaded!')
        return ranker

    def run(self):
        self.logger_info.info('%s is starting...' % self.name)
        self.threadEvent.wait()
        self.logger_info.info('%s is running...' % self.name)
        self.schedule.enter(0, 0, self.dump_schedule, ())
        self.schedule.run()
        self.process()

    def process(self):
        start_day = time.gmtime(time.time()).tm_mday
        for line in sys.stdin:
            tweet_text, tid_origin, tid_retweet, timestamp, tweet_json = utils.extract_text(line)
            simhash_value = Simhash(tweet_text).value
            if (simhash_value in self.related_tweets_hash
                    or tid_origin in self.pushed_tweets_ids
                    or tid_retweet in self.pushed_tweets_ids):
                continue
            topic_id, similarity = self.classifier.classify(tweet_text)
            if topic_id == '':
                continue
            tweet_json['similarity'] = similarity
            evaluate_score = self.ranker.predict(json.dumps(tweet_json))
            total_score = similarity * evaluate_score
            if total_score < 0.15:
                continue
            is_pushed = self.pusher.push(evaluate_score, topic_id)
            if is_pushed:
                delivery_time = time.time()
                self.pushed_tweets[topic_id].append([tid_origin, str(delivery_time)[:10], similarity, total_score, tweet_text])
                self.pushed_tweets_ids.add(tid_retweet)
            struct_time = time.gmtime(float(timestamp[:-3]))
            utc_time = time.strftime('%Y%m%d', struct_time)
            self.related_tweets[topic_id].append([utc_time, tid_origin, total_score, tweet_text])
            self.related_tweets_hash.add(simhash_value)
            if struct_time.tm_mday != start_day:
                self.dump_result(str(start_day))  # file_name must be a string
                start_day = struct_time.tm_mday

    def dump_result(self, file_name):
        self.logger_info.info('saving result...')
        with open('submit/task-b/' + file_name, 'w') as fw:
            for index, records in enumerate(self.related_tweets):
                pid = str(index + 226)
                sorted_records = sorted(records, key=lambda item: -item[2])
                for rank, record in enumerate(sorted_records):
                    if rank >= 100:
                        break
                    fw.write('%s\tMB%s\tQ0\t%s\t%d\t%f\t%s\n' % (record[0], pid, record[1], rank + 1, record[2], 'CSSNA'))
        with open('submit/task-a/' + file_name, 'w') as fw:
            with open('submit/task-a_extr/' + file_name, 'w') as fw_extr:
                for index, records in enumerate(self.pushed_tweets):
                    pid = str(index + 226)
                    for record in records:
                        fw.write('MB%s\t%s\t%s\tCSSNA\n' % (pid, record[0], record[1]))
                        fw_extr.write('MB%s\t%s\t%s\tCSSNA\t%s\t%s\t%s\n' % (pid, record[0], record[1], record[2], record[3], record[4]))
        self.related_tweets = [[] for _ in range(225)]
        self.pushed_tweets = [[] for _ in range(225)]
if len(sys.argv) > 3 or len(sys.argv) < 2:
    print("Please enter a valid parameter:")
    print("Parameter: Congress term number")
    print("Option: --skip, avoid data cleaning")
    sys.exit()

if len(sys.argv) == 2:
    congress_id = str(sys.argv[1])
    if (os.path.isfile("rawData/" + "speeches_" + congress_id + ".txt")
            and os.path.isfile("rawData/" + congress_id + "_SpeakerMap.txt")):
        print("cleaning ....")
        data_cleaner = Cleaner([congress_id])
        data_cleaner.clean_pipeline()
        print("classifying ....")
        congress_classifier = Classifier([congress_id])
        congress_classifier.base_pipeline()
        print("done.")
        sys.exit()
    else:
        print("There are no speeches and speakerMap text files to process for congress " + congress_id)
        print("Please put the target congress raw text data into the rawData directory")
        sys.exit()

if len(sys.argv) == 3 and sys.argv[1] == "--skip":
    # Skip data cleaning (the data is already cleaned).
    congress_id = str(sys.argv[2])
import pickle
import threading

import cv2

from mailbox import Client, MessageHandler
from colorfind import colorfind, loadScalars, connComps
from sys import argv
from coords import avg, mul, toInt, split, diff, from_cv2_to_mm_centered
from classify import Classifier

cap = cv2.VideoCapture(1)
cap2 = cv2.VideoCapture(2)
host = '192.168.43.83'
port = 6164  # fix number
msg = [0]
n = 0
stop_event = threading.Event()
# cap.set(cv2.CAP_PROP_POS_MSEC, 15000)
nn = Classifier()


def nothing(x):
    return x


def sign(x):
    if x > 0:
        return 1.
    elif x < 0:
        return -1.
    elif x == 0:
        return 0.
    else:
        return x  # NaN compares False everywhere; pass it through unchanged
        # (fragment: tail of a file-mode selection loop)
        break
    elif file_mode == 'w+':
        friends_file = open(complete_name, 'w+')
        print("enter the names of your FB contacts [Ctrl-D to save it]: ")
        while True:
            try:
                line = input()
            except EOFError:
                break
            friends_file.write('%r\n' % line)
        break
    else:
        print('incorrect input, please try again\n')

friends_file.seek(0)
c = Classifier()
from_lengths = {}
to_lengths = {}
from_sentiments = {}
to_sentiments = {}
friends = []
for l in friends_file:
    a, b = l.split(" ")
    friend_name = a + ' ' + b
    friends.append(friend_name)
for friend_name in friends:
    print(friend_name)
class Validator:
    def __init__(self, restrictions):
        self.attributes = []
        self.true_pos = 0
        self.true_neg = 0
        self.false_pos = 0
        self.false_neg = 0
        if len(restrictions) > 0:
            self.restr = restrictions.restr
        else:
            self.restr = restrictions

    def train(self, domain, class_data):
        document = xml.dom.minidom.Document()
        node = document.createElement('Tree')
        document.appendChild(node)
        d = Trainer(domain, class_data, document)
        partial_atts = d.attributes
        partial_atts.remove("Id")
        partial_atts.remove("Vote")
        print partial_atts
        if len(self.restr) > 0:
            d.rem_restrictions(self.restr)
        d.c45(d.data, d.attributes, node, 0)
        self.classifier = Classifier()
        if len(class_data.category) > 0:
            self.classifier.has_category = True
        for row in d.data:
            self.classifier.classify(document.documentElement, row, class_data.attributes)
        self.classifier.print_stats()

    def recall(self):
        TP = self.classifier.true_pos
        FN = self.classifier.false_neg
        return float(TP) / (TP + FN)

    def precision(self):
        TP = self.classifier.true_pos
        FP = self.classifier.false_pos
        return float(TP) / (TP + FP)

    def pf(self):
        TN = self.classifier.true_neg
        FP = self.classifier.false_pos
        return float(FP) / (FP + TN)

    def fmeasure(self):
        beta = 2
        return float(beta * self.precision() * self.recall()) / (self.precision() + self.recall())

    def confusion_matrix(self):
        print "###### CONFUSION MATRIX #######"
        print "                | Classified Positive | Classified Negative |"
        # str() conversions are needed here: the counters are integers.
        print "Actual Positive | " + str(self.classifier.true_pos) + " | " + str(self.classifier.false_neg) + " |"
        print "Actual Negative | " + str(self.classifier.false_pos) + " | " + str(self.classifier.true_neg) + " |"
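A hedged usage sketch for the class above; domain and class_data are placeholders for the project's parsed dataset objects, so treat this as illustrative only:

# Illustrative only: domain and class_data stand in for the project's parsed
# dataset objects; restrictions=[] takes the "no restrictions" branch.
v = Validator([])
v.train(domain, class_data)
print "recall=%f precision=%f f-measure=%f" % (v.recall(), v.precision(), v.fmeasure())
v.confusion_matrix()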
def main(args): print("number-walks " + str(args.number_walks)) print("representation-size " + str(args.representation_size)) print("walk-length " + str(args.walk_length)) print("inout_fle " + str(args.input)) print("******") g = Graph() deepw = False #similarity thresholds for compration trsl = [0.45, 0.495, 0.5, 0.55, 0.6, 0.7, 0.8, 1] # trsl=[ 0.5 ] learn = True X, Y = read_node_label(args.label_file) seed = 0 clsratio = [0.01, 0.05, 0.07, 0.1, 0.25, 0.5, 0.7, 0.8] # clsratio=[ 0.1,0.2,0.4, 0.6,0.7,0.8,0.9]#,0.7,0.8]# use for blogcatalog np.random.seed(seed) shuffle_indices = np.random.permutation(np.arange(len(X))) f = open(args.input + "shu.txt", "w") f.writelines(str(item) + "\n" for item in shuffle_indices) f.close() if args.graph_format == 'adjlist': g.read_adjlist(filename=args.input, directed=args.directed) elif args.graph_format == 'edgelist': g.read_edgelist(filename=args.input, weighted=args.weighted, directed=args.directed) G = g.G print("before spar, n: " + str(len(G.nodes())) + " m: " + str(len(G.edges()))) #compute similarity score for compression t1 = time.time() p = pC(G, 0.45) scoreNode = p.ScoreCompute() t3 = time.time() f = open(args.input + "score.txt", "w") f.writelines( str(n[0]) + " " + str(n[1]) + " " + str(scoreNode[n]) + "\n" for n in scoreNode) f.close() print("total scorecom time: " + str(t3 - t1)) # read similarity scores from file # f=open(args.input+"score.txt","r") # scoreNode=dict() # for x in f: # l=x.split() # scoreNode[((l[0]),(l[1]))] = float(l[2]) for kk in range(0, len(trsl)): if learn: # do embeding ths = trsl[kk] #args.trs print("threshold is ...", ths) if args.graph_format == 'adjlist': g.read_adjlist(filename=args.input, directed=args.directed) elif args.graph_format == 'edgelist': g.read_edgelist(filename=args.input, weighted=args.weighted, directed=args.directed) if ths != 1: #compression t1 = time.time() G = g.G G, nl2 = makeCompression(G, scoreNode, ths) f = open(args.input + "af_spar.txt", "w") f.writelines(str(n) + " " + str(nl2[n]) + "\n" for n in nl2) f.close() writeg(G, args) t2 = time.time() print("total_sparc_time: " + str(t2 - t1)) #embedding t1 = time.time() print("After_compresing,n,m " + str(len(g.G.nodes())) + " " + str(len(g.G.edges()))) model = node2vec.Node2vec(graph=g, path_length=args.walk_length, num_paths=args.number_walks, dim=args.representation_size, workers=args.workers, p=args.p, q=args.q, window=args.window_size, dw=deepw) t2 = time.time() print("total_embeding_time " + str(t2 - t1)) vectors = model.vectors if ths != 1: #add embedding of removed nodes in compression addBack(nl2, vectors) np.save(args.output + "_" + str(ths) + ".npy", vectors) else: #load embeddings vectors = np.load(args.output + "_" + str(ths) + ".npy") vectors = vectors.item(0) print("file_loaded") #print("Training classifier") #split_train_evaluate2 for single label (cora and wiki) #split_train_evaluate for multi lable (dblp and blogcatalog) for r in clsratio: clfa = Classifier(vectors, clf=LogisticRegression(solver='liblinear')) res = clfa.split_train_evaluate2( X, Y, r, shuffle_indices) # args.clf_ratio) print(str(r) + " " + str(res["macro"]) + " " + str(res["micro"]))
from bottle import route, run, template, request, redirect
from parse import get_news, extract_next_page
from db import News, session
from scripts import save
from classify import Classifier

s = session()
classifier = Classifier()
# Fit the classifier on all already-labelled news items.
mark_news = s.query(News).filter(News.label != None).all()
x_title = [row.title for row in mark_news]
y_label = [row.label for row in mark_news]
classifier.fit(x_title, y_label)
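If the wrapper mirrors scikit-learn's fit/predict convention (an assumption; only fit is shown above), scoring an unseen headline would look like:

# Assumption: Classifier exposes predict() symmetrical to the fit() call above.
print(classifier.predict(['some unseen news title']))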
subj = "" for line in text.splitlines(): if line.startswith("subject:"): is_subj = True subj = line[8:] else: lines.append(line) return email_extract(subj, " ".join(lines), min_len, max_len) def test_enron_files(classifier, path, label, selector=None): files = get_file_list(path, selector) correct = total = 0 for filename in files: with open(os.path.join(path, filename)) as fh: contents = fh.read() features = enron_email_extract(contents) res = classifier.classify(features) best = res[0][0] if best == label: correct += 1 total += 1 pct = 100 * (float(correct) / total) print 'Accuracy of "%s": %s%% based on %s documents' % (label, pct, total) if __name__ == "__main__": classifier = Classifier.load("classifier.bin") for d, l in get_dir_and_labels(): test_enron_files(classifier, d, l)
def load_classifier():
    if not os.path.exists('classifier.db'):
        sys.stderr.write('Unable to load classifier.db -- please run this script first')
        sys.exit(1)
    return Classifier.load('classifier.db')
"link_type": surl.get_link_type_text(), "mime_type": surl.get_mime_type(), "short_url": surl.get_short_url(), "long_url": surl.get_long_url(), "short_code": surl.get_short_code(), "referrers": {}, # {ref->count} "locations": {}, # {IP->count} "hits": [], "social": surl.get_social(), "thumb": thumb, } # collect the stats itr = sdb.list_hits(surl.get_short_code()) for hit in itr: classifier = Classifier(sdb, surl, hit) # Referrers ref = hit["referrer"] if ref not in params["referrers"]: params["referrers"][ref] = 0 params["referrers"][ref] += 1 hit["bot"] = classifier.is_bot() hit["time"] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(hit["time"])) hit["reltime"] = "2h" hit["cc"] = classifier.get_country_code('??') hit["area"] = classifier.get_region('??')
def load_classifier():
    if not os.path.exists('classifier.db'):
        # Train from scratch and cache the result on first run.
        classifier = train_all(None, corpus)
        classifier.save('classifier.db')
    return Classifier.load('classifier.db')
#
# main
#

rec.start_recording()

# initialise the TTS speech engine
engine = pyttsx3.init()

# initialise the Math class
calculator = Computation()

# initialise the Classifier that finds the right command path
classifier = Classifier()

print('Please speak now')

while True:
    samples = rec.get_samples()
    audio, finalize = vad.process_audio(samples)
    if not audio:
        continue
    logging.debug('decoding audio len=%d finalize=%s audio=%s'
                  % (len(audio), repr(finalize), audio[0].__class__))
    user_utt, confidence = asr.decode(audio, finalize, stream_id=STREAM_ID)
class Controller(threading.Thread):
    def __init__(self, thread_name, event):
        super(Controller, self).__init__()
        self.name = thread_name
        self.threadEvent = event
        self.logger_info = utils.initlog('Console', 'Console.log')
        self.schedule = sched.scheduler(time.time, time.sleep)
        self.profiles_name, profiles = utils.load_profiles('profiles')
        self.related_tweets = [[] for _ in range(len(profiles))]  # the day's related tweets, kept for offline analysis
        self.pushed_tweets = [[] for _ in range(len(profiles))]
        self.pushed_tweets_ids = set([])
        self.related_tweets_hash = set([])
        self.classifier = Classifier()
        self.ranker = self.load_ranker()
        self.pusher = Pusher()

    def load_ranker(self):
        self.logger_info.info('loading ranker...')
        gateway = JavaGateway()
        ranker = gateway.entry_point
        self.logger_info.info('ranker loaded!')
        return ranker

    def run(self):
        self.logger_info.info('%s is starting...' % self.name)
        self.threadEvent.wait()
        self.logger_info.info('%s is running...' % self.name)
        # self.schedule.enter(0, 0, self.dump_schedule, ())
        # self.schedule.run()
        self.process()

    def process(self):
        data_file_path = sys.argv[1]
        files = os.listdir(data_file_path)
        files.sort()
        for f in files:
            filename = os.path.join(data_file_path, f)
            logging.info(filename)
            count = 0
            for line in open(filename, 'rb'):
                start = time.clock()
                tweet_text, tid_origin, tid_retweet, timestamp, tweet_json = utils.extract_text(line)
                simhash_value = Simhash(tweet_text).value
                if (simhash_value in self.related_tweets_hash
                        or tid_origin in self.pushed_tweets_ids
                        or tid_retweet in self.pushed_tweets_ids):
                    continue
                topic_id, similarity = self.classifier.classify(tweet_text)
                if topic_id == '':
                    continue
                count += 1
                if count % 10000 == 0:
                    logging.info('%d' % count)
                tweet_json['similarity'] = similarity
                evaluate_score = self.ranker.predict(json.dumps(tweet_json))
                total_score = (evaluate_score ** 0.5) * similarity
                # if total_score < 0.15:
                #     continue
                timestruct = time.gmtime(int(timestamp[:-3]))
                is_pushed = self.pusher.push(total_score, topic_id, timestruct)
                if is_pushed:
                    delivery_time = float(timestamp) / 1000.0 + (time.clock() - start)
                    self.pushed_tweets[topic_id].append([tid_origin, str(delivery_time)[:10], similarity, total_score, tweet_text])
                utc_time = time.strftime('%Y%m%d', timestruct)
                self.related_tweets[topic_id].append([utc_time, tid_origin, total_score, tid_retweet, timestamp[:-3], tweet_text])
                self.related_tweets_hash.add(simhash_value)
                self.pushed_tweets_ids.add(tid_retweet)
            self.dump_result(f)
            self.pusher = Pusher()
        self.logger_info.info('\n=======finished!=======\n')

    def dump_result(self, file_name):
        self.logger_info.info('saving result...')
        with open('submit/task-b/b_submit', 'a') as fw:
            with open('submit/task-b/b_review/B_candidateday_' + file_name[-2:], 'w') as fw_review:
                for index, records in enumerate(self.related_tweets):
                    pid = str(index + 226)
                    sorted_records = sorted(records, key=lambda item: -item[2])
                    for rank, record in enumerate(sorted_records):
                        if rank >= 100:
                            break
                        fw.write('%s\tMB%s\tQ0\t%s\t%d\t%f\t%s\n' % (record[0], pid, record[1], rank + 1, record[2], 'CSSNA'))
                        fw_review.write('%s\tMB%s\tQ0\t%s\t%f\tSNACS\t%s\t%s\t%s\n' % (record[0], pid, record[1], record[2], record[3], record[4], record[5]))
        with open('submit/task-a/a_submit', 'a') as fw:
            with open('submit/task-a/a_review', 'a') as fw_review:
                for index, records in enumerate(self.pushed_tweets):
                    pid = str(index + 226)
                    for record in records:
                        fw.write('MB%s\t%s\t%s\tCSSNA\n' % (pid, record[0], record[1]))
                        fw_review.write('MB%s\t%s\t%s\tCSSNA\t%s\t%s\t%s\n' % (pid, record[0], record[1], record[2], record[3], record[4]))
        self.related_tweets = [[] for _ in range(225)]  # clear the previous day's related-tweet records
        self.pushed_tweets = [[] for _ in range(225)]

    def dump_schedule(self):
        self.logger_info.info('saving result...')
        utc_time = time.strftime('%Y%m%d', time.gmtime())
        for index, records in enumerate(self.related_tweets):
            pid = str(index + 226)
            with open('profile_MB' + pid, 'w') as fw:
                for record in records:
                    fw.write(utc_time + '\t' + pid + '\tQ0\t' + record + '\n')
        self.related_tweets = [[] for _ in range(226)]  # clear the previous day's related-tweet records
        self.schedule.enter(24 * 60 * 60, 0, self.dump_schedule, ())

    def detect_tweet_stream(self, year, month, d, h, m, s, ms):
        start = datetime.datetime(year, month, d, h, m, s, ms)
        delta = (start - datetime.datetime.now()).seconds
        self.logger_info.info('waiting seconds: ' + str(delta))
        time.sleep(delta)
        self.logger_info.info('tweet stream is ready')
        is_ready = True
        return is_ready
clf_list = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
node_size = len(vectors)
train_x = np.array([vectors[x] for x in X])
# Reshape to (samples, kstep, node_size) and encode with the trained low_encoder.
reshaped_train_x = train_x.reshape((train_x.shape[0], args.kstep, node_size))
train_x = low_encoder.predict(reshaped_train_x)
for clf_one in clf_list:
    print "Training classifier using {:.2f}% nodes...".format(clf_one * 100)
    clf = Classifier(vectors=train_x, clf=LogisticRegression())
    clf.split_train_evaluate(X, Y, clf_one)
def __init__(self, **kargs):
    Classifier.__init__(self, tweets=kargs["tweets"], instances=kargs["instances"],
                        model=kargs["model"], keys=kargs["keys"],
                        selection=kargs["selection"])
    self.polarity_dict = kargs["polarity_dict"]
    self.tag_map = kargs["tag_map"]
    self.id = "weib{0},s:{1}".format(self.num_items, self.selection)
    self.prepare_features()
from classify import Classifier

cl = Classifier()
print(cl.classify('./test/'))