def run(self):
    soup = super(ForbesScraper, self).get_soup_object()

    # Gets the breaking article from the Forbes investing page
    headline = soup.find_all(
        "a", {"class": "headlink h1--dense card__color--benjamins-green"})[0]
    headline_text = headline.text
    headline_link = headline.get('href', '')
    print("----")
    print("Breaking article: %s" % headline_text)
    print("Breaking article link: %s" % headline_link)
    txt_classifier = Classifier(headline_text)
    print(txt_classifier.sentiment())

    # Gets the editors' picks on the left side
    latest_picks = {}
    for latest_picks_article in soup.find_all(
            "a", {"class": "section-pick__title"}):
        link = latest_picks_article.get('href', '')
        title = latest_picks_article.text
        self.article_link = link
        self.article_title = title
        latest_picks[link] = title
        print("----")
        print("Latest pick link title: %s" % title)
        print("Latest pick link: %s" % link)
        self.classify_headline(title)
class POSTagger():
    def __init__(self):
        self.model = Model()
        self.model.model_load()
        self.r = Reader()
        self.r.read_corpus()
        self.tagger = Classifier(self.r.train_sents, self.model)

    def evaluate(self, featureset):
        """
        Evaluate the accuracy of the classifier-based POS tagger

        featureset: [[features extracted for a word, tag in a gold standard]]
        stdout: accuracy_score
        """
        gs, labels = [], []
        for s, t in featureset:
            gs.append(t)
            label = self.tagger.choose_tag(s)
            labels.append(label)
            print(t, label)
        assert len(gs) == len(labels)
        self.write_to_file(labels)
        # Tag the held-out test sentences as well (result currently unused)
        words = self.tagger.test(self.r.test_sents, word=True)
        print(accuracy_score(gs, labels))

    def write_to_file(self, labels):
        with open('labels.txt', 'w') as file_handler:
            for label in labels:
                file_handler.write("{}\n".format(label))
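# A minimal usage sketch for the POSTagger above -- the featureset shape is
# assumed from the docstring ([features, gold tag] pairs); the feature dicts
# here are purely illustrative, not from the source.
if __name__ == '__main__':
    tagger = POSTagger()
    featureset = [
        [{'word': 'the', 'prev_tag': '<S>'}, 'DT'],
        [{'word': 'cat', 'prev_tag': 'DT'}, 'NN'],
    ]
    tagger.evaluate(featureset)  # prints per-word tags, then accuracy_score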
def classify_pages(in_path, out_path):
    classifier = Classifier()
    with open(out_path, 'wb') as f:
        for site, html in utils.read_file_multiple(in_path):
            if classifier.classify(html):
                pickle.dump((site, html), f)
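# A minimal sketch of reading the pickled (site, html) pairs back out of the
# file written above. Since the records were appended with repeated
# pickle.dump calls, pickle.load is called until EOFError; the function name
# is illustrative.
import pickle

def read_classified_pages(path):
    with open(path, 'rb') as f:
        while True:
            try:
                yield pickle.load(f)
            except EOFError:
                break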
def classify_headline(self, headline):
    # Set self.sentiment from the classified headline
    txt_classifier = Classifier(headline)
    sentiment = txt_classifier.sentiment()
    print(sentiment)
    self.sentiment = sentiment
    self.update_avgs()
def getIntent():
    request_object = request.json
    sentence = request_object['sentence']
    print(request_object)
    print(sentence)
    if client is not None:
        if 'classifier' not in cache.keys():
            cache["classifier"] = Classifier()
        classifier = cache["classifier"]
        result = classifier.classifyIntent(sentence)
        classification = dict()
        if len(result) > 0:
            print(result)
            # Low-confidence results are logged for later retraining
            if result[1] < classifier.ERROR_THRESHOLD:
                get_database_context().add_not_found_sentence(sentence)
            classification['intent'] = result[0]
        else:
            classification['intent'] = ""
            get_database_context().add_not_found_sentence(sentence)
    else:
        print("NO DATABASE")
        classification = dict()
        classification['intent'] = "NO DATABASE"
    response_object = removekey(request_object, "sentence")
    response_object["classifications"] = classification
    return jsonify(response_object)
def getIntent():
    request_object = request.json
    sentence = request_object['sentence']
    if client is not None:
        if 'intents' not in cache.keys():
            cache["intents"] = Classifier("intents", client)
        classifier = cache["intents"]
        results = classifier.classify(sentence)
        classification = dict()
        if len(results) > 0:
            classification['intent'] = results[0][0]
        else:
            classification['intent'] = ""
    else:
        print("NO DATABASE")
        classification = dict()
        classification['intent'] = "NO DATABASE"
    response_object = removekey(request_object, "sentence")
    response_object["classifications"] = classification
    return jsonify(response_object)
def getEntity():
    request_object = request.json
    sentence = request_object['sentence']
    prior_intents = request_object['context']["priorIntent"]["intent"]
    if client is not None:
        # One entity classifier per prior intent
        classifier_name = "entities@" + prior_intents
        if classifier_name not in cache.keys():
            cache[classifier_name] = Classifier(classifier_name, client)
        classifier = cache[classifier_name]
        results = classifier.classify(sentence)
        classification = dict()
        if len(results) > 0:
            classification['entity'] = results[0][0]
        else:
            classification['entity'] = ""
    else:
        print("NO DATABASE")
        classification = dict()
        classification['entity'] = "NO DATABASE"
    response_object = removekey(request_object, "sentence")
    response_object["classifications"] = classification
    return jsonify(response_object)
def getEntity():
    request_object = request.json
    sentence = request_object['sentence']
    if client is not None:
        if 'classifier' not in cache.keys():
            cache["classifier"] = Classifier()
        classifier = cache["classifier"]
        results = classifier.classifyEntity(sentence)
        # Strip the result down to just the name of the entity
        classification = dict()
        if len(results) > 0:
            classification['entity'] = results[0][0]
        else:
            classification['entity'] = ""
    else:
        print("NO DATABASE")
        classification = dict()
        classification['entity'] = "NO DATABASE"
    response_object = removekey(request_object, "sentence")
    response_object["classifications"] = classification
    return jsonify(response_object)
def main(config_):
    config = Config(config_)
    print("Model Framework: ", config.get("framework"),
          " Model Labels: ", config.get("labels"))
    broker = Broker(config)
    broker.listen()
    classifier = Classifier(config)
    thread = threading.Thread(target=process_request,
                              kwargs={"broker": broker, "classifier": classifier})
    thread.start()
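# The snippet above hands the broker and classifier to a `process_request`
# worker whose body is not shown. A hypothetical sketch of such a loop,
# assuming a blocking `get()` and a `respond()` method on the Broker (neither
# is confirmed by the source):
def process_request(broker, classifier):
    while True:
        message = broker.get()               # assumed blocking receive
        if message is None:                  # assumed shutdown sentinel
            break
        prediction = classifier.classify(message)
        broker.respond(message, prediction)  # assumed reply API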
def train_Engine():
    result = get_trainer().start_training()
    if result:
        cache["classifier"] = Classifier()
        cache["classifier"].load(DatabaseContext(client), get_cos_context())
        return jsonify("Success! Engine was trained"), 200
    else:
        return jsonify("Error! Engine wasn't trained.."), 404
def trainIntents():
    if client is not None:
        intents = Trainer("intents", client)
        intents.start_training()
        if 'intents' not in cache.keys():
            cache['intents'] = Classifier('intents', client)
        else:
            cache['intents'].load()
        return jsonify([])
    else:
        print("NO DATABASE")
        return "NO DATABASE"
def load_sites_feeds():
    from tech_rss.models import Site

    fix_multiprocessing()
    clf = Classifier()

    for site in Site.objects.all():
        print('Starting {}'.format(site.domain))

        news = site.get_new_news()
        if not news:
            continue

        categories = clf.predict(news)
        for category, page in zip(categories, news):
            print(CATEGORIES_SHORT[category])
            print(page['title'], '\n')

            url, title = save_post(category, page, site)
            users = site.users.filter(categories__contains=[category])
            users_id = [getattr(user, 'id') for user in users]
            send_post_to_subscribers(TelegramBot, users_id, url, title)
def __call__(self):
    # For a quick smoke test, subsample the data:
    # self.train = self.train.head(200)
    # self.test = self.test.head(100)
    self.clf = Classifier(output_folder=self.output_folder,
                          RS=15,
                          train=self.train,
                          test=self.test,
                          fold_splits=self.splits,
                          clf_name=self.clf_name,
                          mapping_dict=config.mapping_dict)
    self.clf()
    print('Saved to %s' % self.output_folder)
def trainEntity():
    intent = request.json['intent']
    if client is not None:
        classifier_name = "entities@" + intent
        entities = Trainer(classifier_name, client)
        entities.start_training()
        if classifier_name not in cache.keys():
            cache[classifier_name] = Classifier(classifier_name, client)
        else:
            cache[classifier_name].load()
        return jsonify([])
    else:
        print("NO DATABASE")
        return "NO DATABASE"
def main():
    classifier = Classifier(model_name="random_forest")
    logger.debug(
        "top 20 feature importances: {}".format(
            get_feature_importance(classifier)
        )
    )
    test_features, test_labels = get_test_data("test")
    logger.debug(
        "classification report: {}".format(
            get_classification_report(
                test_labels["is_returning_customer"].values,
                classifier.classify(test_features))
        )
    )
def compare_crawler():
    heuristic_file = os.path.join(consts.DATA_DIR, 'using-heuristic-pages.pickle')
    bfs_file = os.path.join(consts.DATA_DIR, 'bfs-pages.pickle')
    hr_bfs = os.path.join(consts.RESULTS_DIR, 'bfs_harvest_ratio_results.csv')
    hr_heuristic = os.path.join(consts.RESULTS_DIR, 'heuristic_harvest_ratio_results.csv')

    if not os.path.exists(heuristic_file):
        crawl(True, heuristic_file)
    if not os.path.exists(bfs_file):
        crawl(True, bfs_file)

    classifier = Classifier()
    harvest_ratio(heuristic_file, hr_heuristic, classifier)
    harvest_ratio(bfs_file, hr_bfs, classifier)
def testIntent():
    request_object = request.json
    sentence = request_object['sentence']
    if client is not None:
        if sentence == 'populate':
            # Populate the database with base data and retrain all neural networks
            populate_intents(client)
            populate_entities_for_meal(client)
            populate_entities_for_timetables(client)
            populate_entities_for_navigation(client)
            cache["intents"].load()
            cache["entities@timetables"].load()
            cache["entities@meal"].load()
            classification = dict()
            classification['intent'] = "Populated"
        else:
            if 'intents' not in cache.keys():
                cache["intents"] = Classifier("intents", client)
            classifier = cache["intents"]
            results = classifier.classify(sentence)
            classification = dict()
            if len(results) > 0:
                classification['intent'] = results[0][0]
            else:
                classification['intent'] = ""
    else:
        print("NO DATABASE")
        classification = dict()
        classification['intent'] = "NO DATABASE"
    response_object = removekey(request_object, "sentence")
    response_object["classifications"] = classification
    return 'Results: %s' % classification['intent']
def main():
    global reporting, print_classification, classifier
    args = get_args()

    # Load either web or pop-up reporting based on args
    reporting_module = 'reporting.' + ('web' if args.web else 'popup')
    print("Loading " + reporting_module)
    reporting = importlib.import_module(reporting_module)

    classifier = Classifier(args.age_gender)

    # If the process is killed with Ctrl+C, display stats
    signal.signal(signal.SIGINT, sigint_handler)

    if args.video is not None:
        cap = cv2.VideoCapture(args.video)
        frame_nr = 0
        while cap.isOpened():
            ret, frame = cap.read()
            frame = cv2.resize(frame, None, fx=0.25, fy=0.25)
            # Process every fourth frame to keep up with the video
            if frame_nr % 4 == 0:
                every_frame(frame, time.time())
            frame_nr += 1
            if cv2.waitKey(1) & 0xFF == ord('q'):
                raise SystemExit
        return

    if args.file is not None:
        frame = cv2.imread(args.file)
        every_frame(frame, time.time())
        if cv2.waitKey() & 0xFF == ord('q'):
            raise SystemExit
        return

    if args.print_classification:
        print_classification = True

    # On every frame from the stream, run the pipeline
    stream_video(every_frame)
import telebot
from flask import Flask, request

import settings
from classifier.classifier import Classifier
from classifier.data.image_processing import image_from_file

bot = telebot.TeleBot(settings.TOKEN)
server = Flask(__name__)

# Init image classifier
classifier = Classifier(
    base_net_path=settings.BASIC_NET_PATH,
    refferi_net_path=settings.REFFERI_NET_PATH,
    white_net_path=settings.WHITE_NET_PATH,
    blue_net_path=settings.BLUE_NET_PATH,
    device=settings.DEVICE
)


@bot.message_handler(content_types=['photo'])
def get_photo_message(message):
    """
    Predict the label of the photo in a request.

    :param message: message that contains the photo
    :return: label of the photo, string
    """
    # Download the photo and save it as a file object
    telegram_file_id = message.photo[-1].file_id
    telegram_file = bot.get_file(telegram_file_id)
class Annotator:
    def __init__(self, db):
        self.db = db
        self.city = self.db["area"].find_one({"name": configuration.AREA})
        self.classifier = Classifier(self.db)

    def tokenize(self, tweet):
        stop_words_list = get_stop_words("en")
        tweet_text = tweet["text"]
        if tweet["truncated"]:
            tweet_text = tweet["extended_tweet"]["full_text"]
        # Strip @mentions and URLs before tokenizing
        tweet_text = re.sub(r"(?:\@|https?\://)\S+", "", tweet_text)
        tokens = [token for token in utils.simple_preprocess(
            tweet_text, deacc=False, min_len=3) if token not in stop_words_list]
        tweet["tokens"] = tokens
        return tweet

    def add_date(self, tweet):
        tweet["date"] = datetime.datetime.fromtimestamp(int(tweet["timestamp_ms"]) // 1000)
        return tweet

    def annotate_tweet_location(self, tweet):
        if tweet["geo"] is None and tweet["place"] is None:
            return tweet
        point = None
        if tweet["geo"] is not None:
            point = Point(tweet["geo"]["coordinates"][1], tweet["geo"]["coordinates"][0])
        for a in self.city["geojson"]["features"]:
            area = shape(a["geometry"])
            if (point is not None and area.contains(point)) or a["properties"]["name"] == tweet["place"]["name"]:
                tweet["area_name"] = a["properties"]["name"]
                # tweet["area_id"] = a["id"]
                print("Found a tweet in", tweet["area_name"])
                break
        return tweet

    def classify_tweet(self, tweet):
        return self.classifier.classify(tweet)

    def classify_offline(self):
        tweets = list(self.db["tweet"].find())
        print("Classifying tweets")
        for t in tweets:
            print(t["id"])
            c_tweet = self.classifier.classify(t)
            self.db["tweet"].update({"id": t["id"]},
                                    {"$set": {"categories": c_tweet["categories"]}})
        print("Done")

    def tokenize_offline(self):
        tweets = list(self.db["tweet"].find())
        print("Updating tweets")
        for t in tweets:
            stop_words_list = get_stop_words("en")
            tweet_text = t["text"]
            if t["truncated"]:
                tweet_text = t["extended_tweet"]["full_text"]
            tweet_text = re.sub(r"(?:\@|https?\://)\S+", "", tweet_text)
            tokens = [token for token in utils.simple_preprocess(
                tweet_text, deacc=False, min_len=3) if token not in stop_words_list]
            query = {"_id": t["_id"]}
            update = {"$set": {"tokens": tokens}}
            self.db["tweet"].update(query, update)
        print("Done")
if __name__ == "__main__": from torch.utils import data from sklearn.model_selection import train_test_split from generator.generator import Generator from discriminator.discriminator_semi import SemiSupervisedDiscriminator from classifier.classifier import Classifier from data.data_loader import ImageDataset, ImageTransform, make_datapath_list z_dim = 20 image_size_g = 64 image_size_d = 12 num_classes = 10 G = Generator(image_size_g, z_dim) D = SemiSupervisedDiscriminator(image_size_d, num_classes) C = Classifier(image_size_d, num_classes) G.apply(weights_init) D.apply(weights_init) print("Finish initialization of the network") label_list = list(range(num_classes)) img_list, label_list = make_datapath_list(label_list) train_img_list, test_img_list, train_label_list, test_label_list = train_test_split( img_list, label_list, test_size=0.2) mean = (0.5, ) std = (0.5, ) train_dataset = ImageDataset(data_list=train_img_list, transform=ImageTransform(mean, std),
__author__ = 'dungdt'

import time

from classifier.classifier import Classifier
from classifier.data.dictionary import Dictionary
from classifier.data_reader import DataReader

if __name__ == '__main__':
    dictionary = Dictionary()
    dataReader = DataReader(dictionary)
    classifier = Classifier(dataReader, trainingDataPath='data/training',
                            testDataPath='data/test')

    print('Training...')
    t = time.time()
    classifier.train()
    print('Training time: %d' % (time.time() - t))

    t = time.time()
    print('Testing...')
    print('Accuracy: %s%%' % '{:4.2f}'.format(classifier.test() * 100))
    print('Testing time: %d' % (time.time() - t))

    testData = classifier.dataReader.readTestData(classifier.testDataPath)
    print(classifier.classify(testData[0][0]))
elif os.path.isfile('vcap-local.json'):
    with open('vcap-local.json') as f:
        vcap = json.load(f)
        print('Found local VCAP_SERVICES')
        creds = vcap['services']['cloudantNoSQLDB'][0]['credentials']
        user = creds['username']
        password = creds['password']
        url = 'https://' + creds['host']
        client = Cloudant(user, password, url=url, connect=True)
        client.create_database('trainer', throw_on_exists=False)
        client.create_database('synapse', throw_on_exists=False)

cache = dict()
if client is not None:
    # Create the Classifier cache on startup
    cache["intents"] = Classifier("intents", client)
    cache["intents"].load()
    cache["entities@timetables"] = Classifier("entities@timetables", client)
    cache["entities@timetables"].load()
    cache["entities@meal"] = Classifier("entities@meal", client)
    cache["entities@meal"].load()

# On Bluemix, get the port number from the environment variable PORT.
# When running this app on the local machine, default the port to 8000.
port = int(os.getenv('PORT', 8000))


def removekey(d, key):
    r = dict(d)
    del r[key]
    return r
def run(opt):
    # Output dir
    if os.path.exists(opt.save_dir):
        shutil.rmtree(opt.save_dir)
    os.makedirs(opt.save_dir)

    # Load dataset
    dataset = Dataloader(source=opt.source, imgsz=opt.img_size).dataset
    # Load object detection model and weights
    detector = Detector(detector_type=opt.detector_type, cfg_file=opt.detector_cfg_file)
    detector.run_through_once(opt.img_size)  # dry run once to warm the model up
    # Load object tracking model
    tracker = Tracker(tracker_type=opt.tracker_type, cfg_file=opt.tracker_cfg_file)
    # Load pose detection model
    poser = Poser(poser_type=opt.poser_type, cfg_file=opt.poser_cfg_file)
    # Load classifier model
    classifier = Classifier(classifier_type=opt.classifier_type, cfg_file=opt.classifier_cfg_file)

    print(detector.device, detector.cfg)

    # Analyze the status annotation file (.txt) first and only detect frames
    # that appear in it; this speeds things up.
    filt_with_txt = False
    if filt_with_txt:
        from classifier.data_analyse import anaylise_label
        label_ret = anaylise_label()
        label_stems = [x[0] for x in label_ret]

    for img_idx, (path, img, im0s, vid_cap) in enumerate(dataset):
        # print(type(img), type(im0s))
        # print(type(im0s), im0s.shape)
        if dataset.is_camera:
            im0s = im0s[0]
            path = f'{path[0]}/{img_idx:0<6}.jpg'
        if filt_with_txt:
            fold_stem = path.split('/')[-2]
            idx = label_stems.index(fold_stem)
            # print(fold_stem, label_stems, idx)
            img_stem = Path(path).stem
            valid_stems = [Path(x).stem for x in label_ret[idx][-1]]
            in_it = f'track_{img_stem}' in valid_stems
            # print(path, in_it, label_ret[idx][-1][0])
            if not in_it:
                continue

        # Detect: img is [3, w, h]; preprocess, inference, NMS.
        # Result: nparray [num_obj, 6], 6: xyxy, conf, cls
        det_ret = detector.detect(path, img, im0s)
        # detector.imshow(im0s, det_ret)

        # Track. Result: list [num_obj, 7], 7: xyxy, cls, tid, trace
        tra_ret = tracker.track(det_ret, im0s)
        # print(tra_ret[:, 5])
        # tracker.imshow(im0s, tra_ret, path)

        # Pose detect. zzd format: np.array(object) [num_obj, 10],
        # 10: xyxy, cls, tid, trace, keypoints, kp_score, proposal_score
        pose_ret = poser.detect_pose(tra_ret, im0s, path, return_type='zzd')
        # print(pose_ret)
        poser.imshow(im0s, pose_ret, path, resize=(1280, 720))

        # Classifier
        if opt.feature_save_dir is not None:  # save features
            classifier.build_and_save_feature(pose_ret, path, save_dir=opt.feature_save_dir)
            print(f'\rsaving features: [{img_idx + 1:>3}/{len(dataset)}] ', end='')
            continue
        # status_ret = classifier.detect_status(pose_ret, path, is_camera=dataset.is_camera)
        # zzd format: np.array(object) [num_obj, 12], 12: the 10 above plus status_idx and status
        # classifier.imshow(im0s, status_ret, show_name='x', resize=(1280, 720))
        # print(status_ret)

        if img_idx == 10:
            if cv2.waitKeyEx(0) == ord('q'):
                raise StopIteration
@classmethod
def classify_pattern(cls, pattern):
    # Map the argmax of the prediction vector to a named pattern
    prediction = Classifier.classify(pattern)
    return cls.PATTERN_MAPPING[int(np.argmax(prediction))]
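# Hypothetical call site for classify_pattern above, assuming it lives on a
# class with a PATTERN_MAPPING dict keyed by the model's class indices:
#
#   label = SomePatternClass.classify_pattern(pattern_vector)
#
# np.argmax picks the highest-scoring class index, so PATTERN_MAPPING must
# cover every index the model can emit.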
from classifier.classifier import Classifier

classifier = Classifier()
classifier.train_model()
print(classifier.is_question('do you hold a credit card'))
def main():
    classifier = Classifier()
    classifier.build_model()
    classifier.add_smoothing()
    classifier.spam_vocabulary_probs, classifier.ham_vocabulary_probs = classifier.write_model_data(
        'model.txt', classifier.vocabulary)
    classifier.test_model('baseline-result.txt', classifier.spam_vocabulary_probs,
                          classifier.ham_vocabulary_probs)

    print("------Experiment 2, Stop Words Filtering------")
    classifier.experiment2_stop_words()

    print("------Experiment 3, Word Length Filtering------")
    classifier.experiment3_length_filtering()

    print("------Experiment 4, Frequency 1 Filtering------")
    classifier.experiment4_frequency_filtering(file_name='frequencyFiltered0',
                                               lower_cutoff_frequency=1,
                                               higher_cutoff_frequency=1)
    print("------Experiment 4, Frequency <=5 Filtering------")
    classifier.experiment4_frequency_filtering(file_name='frequencyFiltered1',
                                               lower_cutoff_frequency=0,
                                               higher_cutoff_frequency=5)
    print("------Experiment 4, Frequency <=10 Filtering------")
    classifier.experiment4_frequency_filtering(file_name='frequencyFiltered2',
                                               lower_cutoff_frequency=0,
                                               higher_cutoff_frequency=10)
    print("------Experiment 4, Frequency <=15 Filtering------")
    classifier.experiment4_frequency_filtering(file_name='frequencyFiltered3',
                                               lower_cutoff_frequency=0,
                                               higher_cutoff_frequency=15)
    print("------Experiment 4, Frequency <=20 Filtering------")
    classifier.experiment4_frequency_filtering(file_name='frequencyFiltered4',
                                               lower_cutoff_frequency=0,
                                               higher_cutoff_frequency=20)

    print("------Experiment 4, Top 10 percent Filtering------")
    classifier.experiment4_most_frequent_filtering('mostFrequencyFiltered0', 10)
    print("------Experiment 4, Top 15 percent Filtering------")
    classifier.experiment4_most_frequent_filtering('mostFrequencyFiltered1', 15)
    print("------Experiment 4, Top 20 percent Filtering------")
    classifier.experiment4_most_frequent_filtering('mostFrequencyFiltered2', 20)
    print("------Experiment 4, Top 25 percent Filtering------")
    classifier.experiment4_most_frequent_filtering('mostFrequencyFiltered3', 25)

    # Experiment 5: sweep the smoothing value from 0.0 to 1.0 in steps of 0.1
    experiment5_file_name = 'smoothing'
    for n in range(0, 11):
        smoothing_value = round(n * 0.1, 1)
        file_name = experiment5_file_name + str(smoothing_value)
        print("------Experiment 5, smoothing value %s------" % smoothing_value)
        classifier_5 = Classifier()
        classifier_5.build_model()
        classifier_5.add_smoothing(smoothing_value)
        classifier_5.spam_vocabulary_probs, classifier_5.ham_vocabulary_probs = classifier_5.write_model_data(
            file_name + 'model.txt', classifier_5.vocabulary, smoothing_value=smoothing_value)
        classifier_5.test_model(file_name + 'baseline-result.txt',
                                classifier_5.spam_vocabulary_probs,
                                classifier_5.ham_vocabulary_probs)
import sys
import os

from PIL import Image

from classifier.classifier import Classifier

cnn = Classifier(json_file='model.json', weights_file='model.h5')

exit_program = False
count_true = 0
count_false = 0

while not exit_program:
    type_input = input("Folder(F) or Single File(S)?: ")
    if type_input == "F" or type_input == "f":
        if not os.path.exists('animals_and_humans'):
            os.mkdir('animals_and_humans')
        if not os.path.exists('nothing'):
            os.mkdir('nothing')
        folder_name = input("Folder Name: ")
        if os.path.exists(folder_name):
            test_images = os.listdir(folder_name)
            if len(test_images) > 0:
                for image in test_images:
                    print(image)
                    # Skip hidden files such as .DS_Store
                    if image.startswith('.'):
                        print(image + " not read")
                    else:
                        path_image = "./" + folder_name + "/" + image
                        animal, accuracy = cnn.predict_animal(path_image)
                        if animal:
                            os.rename(path_image, "./animals_and_humans/" + image)
                            count_true += 1
def main(*args):
    """Predict the top K classes of an image.

    Args:
        *args: args to be parsed by the ArgumentParser

    Returns:
        None
    """
    # Instantiating with the formatter_class argument makes default values
    # print in the help message.
    parser = argparse.ArgumentParser(
        description='Process an image & report results.',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        'image_path', type=str,
        help=('path to the image to process or to a dataset '
              'directory with images to choose randomly from '
              'Ex: flowers/test/1/image_06743.jpg or '
              'flowers/test'))
    parser.add_argument('checkpoint', type=str,
                        help='path to the model checkpoint to load')
    parser.add_argument('--top_k', type=int, default=1,
                        help='Return top K most likely classes')
    parser.add_argument('--category_names', type=str,
                        help='use a mapping of categories to real names')
    parser.add_argument('--gpu', action='store_true',
                        help=('if available, use gpu to process the image '
                              'instead of the cpu'))
    args = parser.parse_args(args)

    if os.path.isdir(args.image_path):
        print(f'{args.image_path} is a directory.',
              'Choosing a random image to process.')
        image_path = get_random_image_from_dir(args.image_path)
        print(f'Using image: {image_path}')
    else:
        image_path = args.image_path

    if not os.path.isfile(args.checkpoint):
        print(f'ERROR: {args.checkpoint} is not a file.', file=sys.stderr)
        sys.exit(-1)

    if args.category_names:
        cat_to_name = load_json(args.category_names)
    else:
        cat_to_name = None

    if args.gpu:
        device = 'cuda'
        if not torch.cuda.is_available():
            print('ERROR: cuda is not available on this machine.',
                  'Use cpu for prediction instead.', file=sys.stderr)
            sys.exit(-1)
    else:
        device = 'cpu'

    classifier = Classifier(checkpoint=args.checkpoint)
    probs, classes = classifier.predict(image_path, topk=args.top_k,
                                        device=device)

    if cat_to_name is not None:
        classes = [cat_to_name[c] for c in classes]
        class_len = len(max(cat_to_name.values(), key=len))
    else:
        class_len = 10  # padding needed to space the 'Class' column title below

    print(f'{"Class":{class_len}}{"Probability"}')
    for prob, class_ in zip(probs, classes):
        print(f'{class_:{class_len}}{prob:4.2f}')