def download(query, limit=100, output_dir='dataset', adult_filter_off=True,
             force_replace=False, timeout=60, verbose=True):
    # engine = 'bing'
    if adult_filter_off:
        adult = 'off'
    else:
        adult = 'on'

    image_dir = Path(output_dir).joinpath(query).absolute()

    if force_replace:
        # pathlib has no Path.isdir(); use the instance method is_dir()
        if image_dir.is_dir():
            shutil.rmtree(image_dir)

    # check directory and create if necessary
    try:
        if not image_dir.is_dir():
            image_dir.mkdir(parents=True)
    except Exception as e:
        print('[Error] Failed to create directory.', e)
        sys.exit(1)

    print("[%] Downloading Images to {}".format(str(image_dir.absolute())))
    bing = Bing(query, limit, image_dir, adult, timeout, verbose)
    bing.run()
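# A minimal usage sketch for the pathlib-based downloader above, assuming it
# lives in a module named `downloader` alongside the `Bing` scraper class
# (the module name is an assumption, not confirmed by the snippet itself).
from downloader import download

download('red tree frog', limit=20, output_dir='dataset',
         adult_filter_off=True, force_replace=False, timeout=60)
# Images land in dataset/red tree frog/ relative to the working directory.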
def create_files_from_internet(target, query, file_types=[Bing.TXT_FILE_TYPE],
                               num_created=1, verbose=False):
    """
    Download files based on a query, to the target directory.
    :param target: target directory (created if it does not exist)
    :param query: query (for the Bing search engine)
    :param file_types: list of file types to search for
    :param num_created: number of files to download
    :param verbose: print to console messages for each file downloaded
    :return: None
    """
    count = int(num_created)
    if not num_created or count <= 0:
        return

    # create target directory if it does not exist
    if not os.path.exists(target):
        os.makedirs(target)
        if verbose:
            print 'created target directory'

    bing = Bing(KEY)

    # split the requested count approximately evenly between file types
    files_per_type = count / len(file_types)
    remainder = count - files_per_type * len(file_types)
    file_tuples = []
    for ft in file_types:
        num = files_per_type + 1 if remainder > 0 else files_per_type
        file_tuples.append((ft, num))
        remainder -= 1

    display = print_file_details if verbose else None
    total = bing.execute(target, query, file_tuples, display=display)
    if verbose:
        print 'created %d files' % total
def download(query, limit=100, output_dir='dataset', image_dir='sample',
             adult_filter_off=True, force_replace=False, timeout=60):
    # engine = 'bing'
    if adult_filter_off:
        adult = 'off'
    else:
        adult = 'on'

    cwd = os.getcwd()
    # image_dir = os.path.join(cwd, output_dir, query)

    if force_replace:
        if os.path.isdir(image_dir):
            shutil.rmtree(image_dir)

    # check directory and create if necessary
    try:
        if not os.path.isdir("{}/{}/".format(cwd, output_dir)):
            os.makedirs("{}/{}/".format(cwd, output_dir))
    except:
        pass
    if not os.path.isdir("{}/{}/{}".format(cwd, output_dir, query)):
        os.makedirs("{}/{}/{}".format(cwd, output_dir, query))

    bing = Bing(query, limit, output_dir, adult, timeout)
    bing.run()
def download(query, limit=100, output_dir='dataset', adult_filter_off=True,
             timeout=60, dedup=True):
    # engine = 'bing'
    if adult_filter_off:
        adult = 'off'
    else:
        adult = 'on'

    bing = Bing(query, limit, output_dir, adult, timeout)
    bing.run()
class SearchEngine:
    def __init__(self):
        self.searchType = {
            SEARCH_IMAGE: self.__searchImage,
            SEARCH_WEB: self.__searchWeb
        }
        self.bing = Bing()

    def __searchImage(self, query, options=None):
        imageFilter = ''
        if options and options['searchEntity']:
            options['filter'] = IMAGE_FILTER[options['searchEntity']]
        result = self.bing.searchImage(query, options)
        return {'url': result[u'd'][u'results']}

    def __searchWeb(self, query, options=None):
        result = self.bing.searchWeb(query, options)
        websites, allURL = [], defaultdict(list)
        for entry in result[u'd'][u'results']:
            entryURL = entry[u'Url']
            urlObj = tldextract.extract(entryURL)
            siteName = urlObj.domain
            if not siteName:
                continue  # ignoring anything that fails the parser
            else:
                allURL[siteName.lower()].append(entryURL)
                websites.append(entryURL)
        return {'ranking': allURL, 'url': websites}

    def search(self, query, options=None):
        '''returns a list of URLs from search engine result'''
        result = {SEARCH_IMAGE: None, SEARCH_WEB: None}
        query += " " + options['searchEntity']
        if options and 'type' in options:
            for searchType in options['type']:
                result[searchType] = self.searchType[searchType](query, options)
        else:
            for searchType in result:
                result[searchType] = self.searchType[searchType](query, options)
        return result
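# A hedged usage sketch for SearchEngine above. The SEARCH_IMAGE / SEARCH_WEB
# constants and IMAGE_FILTER come from the surrounding module; the
# 'searchEntity' value used here is a hypothetical label for illustration.
engine = SearchEngine()
results = engine.search('Tom Cruise', options={
    'searchEntity': 'person',  # hypothetical entity label
    'type': [SEARCH_WEB],      # restrict to web search only
})
print(results[SEARCH_WEB]['ranking'])  # domain -> list of URLs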
def bing_search(self):
    key = my_keys.MICROSOFT_API_KEY
    bing = Bing(key)
    items = bing.web_search(self.query, 50, ['Title', 'Url', 'Description'])
    pages = []
    for item in items:
        if type(item) == str:
            continue
        page = WebPage(item['Url'])
        page.query = self.query  # keep the same field layout as the Google results
        page.title = item['Title']
        page.snippet = item['Description']
        pages.append(page)
    return pages
def download(query, limit=100, adult_filter_off=True, force_replace=False):
    engine = 'bing'
    if adult_filter_off:
        adult = 'off'
    else:
        adult = 'on'

    cwd = os.getcwd()
    image_dir = os.path.join(cwd, 'dataset', engine, query)

    if force_replace:
        if os.path.isdir(image_dir):
            shutil.rmtree(image_dir)

    # check directory and create if necessary
    try:
        if not os.path.isdir("{}/dataset/".format(cwd)):
            os.makedirs("{}/dataset/".format(cwd))
    except:
        pass
    if not os.path.isdir("{}/dataset/{}/{}".format(cwd, engine, query)):
        os.makedirs("{}/dataset/{}/{}".format(cwd, engine, query))

    Bing().bing(query, limit, adult)
def download(query, limit=100, output_dir='dataset', adult_filter_off=True,
             force_replace=False, timeout=60):
    # engine = 'bing'
    if adult_filter_off:
        adult = 'off'
    else:
        adult = 'on'

    # resume from a previous run if a links.csv ledger exists
    try:
        li = pd.read_csv('links.csv')
        link = li['Links'].to_list()
        fname = li['Files'].to_list()
        queries = li['Queries'].to_list()
        # start = fname.split('.')[0] + 1
    except:
        link = []
        fname = []
        queries = []
        # start = '1'

    cwd = os.getcwd()
    image_dir = os.path.join(cwd, output_dir, query)

    if force_replace:
        if os.path.isdir(image_dir):
            shutil.rmtree(image_dir)

    # check directory and create if necessary
    try:
        if not os.path.isdir("{}/{}/".format(cwd, output_dir)):
            os.makedirs("{}/{}/".format(cwd, output_dir))
    except:
        pass
    if not os.path.isdir("{}/{}/{}".format(cwd, output_dir, query)):
        os.makedirs("{}/{}/{}".format(cwd, output_dir, query))

    bing = Bing(query, limit, output_dir, adult, timeout, link, fname, queries)
    links, files, queries = bing.run()

    d = {'Files': files, 'Queries': queries, 'Links': links}
    lin = pd.DataFrame(d)
    if not os.path.exists("{}/{}".format(cwd, "links.csv")):
        lin.to_csv("{}/{}".format(cwd, "links.csv"))
    else:
        os.remove("{}/{}".format(cwd, "links.csv"))
        lin.to_csv("{}/{}".format(cwd, "links.csv"))
def download(query, limit=100, output_dir='dataset', adult_filter_off=True,
             force_replace=False, timeout=60):
    try:
        # engine = 'bing'
        if adult_filter_off:
            adult = 'off'
        else:
            adult = 'on'

        cwd = os.getcwd()
        image_dir = os.path.join(cwd, output_dir, query)

        if force_replace:
            if os.path.isdir(image_dir):
                shutil.rmtree(image_dir)

        # check directory and create if necessary
        try:
            if not os.path.isdir("{}/{}/".format(cwd, output_dir)):
                os.makedirs("{}/{}/".format(cwd, output_dir))
        except:
            pass
        if not os.path.isdir("{}/{}/{}".format(cwd, output_dir, query)):
            os.makedirs("{}/{}/{}".format(cwd, output_dir, query))

        bing = Bing(query, limit, output_dir, adult, timeout)
        bing.run()
    except Exception as e:
        print('downloader.py')
        print(e)


if __name__ == '__main__':
    download('abitabh', limit=10, timeout=1)
class Main:
    def __init__(self):
        super().__init__()
        self.b = Bing()
        self.appData = AppData()
        self.icon = None

    def run(self):
        self.createIcon()
        self.icon.run()

    def createIcon(self):
        image = self.b.getNextImage()
        self.appData.saveImage(image)
        wall = WindowsWallpaper()
        wall.setWallpaper(self.appData.getImagePath())
        image = Image.open("icon.png")
        menu = Menu(MenuItem('Next Image', self.nextI),
                    MenuItem('Prev Image', self.prevI),
                    MenuItem('Quit', self.quitI))
        self.icon = pystray.Icon("pyBing v0.1", image, "pyBing v0.1", menu)

    def nextI(self):
        image = self.b.getNextImage()
        self.appData.saveImage(image)
        wall = WindowsWallpaper()
        wall.setWallpaper(self.appData.getImagePath())

    def prevI(self):
        image = self.b.getPreviousImage()
        self.appData.saveImage(image)
        wall = WindowsWallpaper()
        wall.setWallpaper(self.appData.getImagePath())

    def quitI(self):
        print('Exit')
        self.icon.stop()
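# A minimal entry-point sketch for the tray app above, assuming this class is
# the module's main object (the module name itself is not shown in the snippet).
if __name__ == '__main__':
    Main().run()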
def collect_data():
    key = "TIwk7p7nC7HlKijRb5Z42IHx0S2+MKHqAS0BNIOdKqM"
    name_list = ['Hillary', 'bill']
    bing = Bing(key)

    save_dir = './raw_image/'
    if not os.path.exists(save_dir):
        os.mkdir(save_dir)

    for name in name_list:
        save_dir = './raw_image/' + name + '/'
        if not os.path.exists(save_dir):
            os.mkdir(save_dir)
        results = bing.web_search(name, 3, ["MediaUrl"])
        for num, result in enumerate(results):
            try:
                scrape_image(result['MediaUrl'], save_dir + str(num) + '.jpg')
            except Exception as e:
                print(e)
                continue
def download(query, limit=100, output_dir='dataset', adult_filter_off=True,
             force_replace=False, timeout=60, no_directory=False):
    # engine = 'bing'
    if adult_filter_off:
        adult = 'off'
    else:
        adult = 'on'

    cwd = os.getcwd()
    image_dir = os.path.join(cwd, output_dir, query)

    if force_replace:
        if os.path.isdir(image_dir):
            shutil.rmtree(image_dir)

    # check output directory and create if necessary
    try:
        if not os.path.isdir("{}/{}/".format(cwd, output_dir)):
            os.makedirs("{}/{}/".format(cwd, output_dir))
    except:
        pass

    # create extra directories if they don't exist and if the no_directory
    # parameter is false
    if not no_directory:
        if not os.path.isdir("{}/{}/{}".format(cwd, output_dir, query)):
            # print("making dirs")
            os.makedirs("{}/{}/{}".format(cwd, output_dir, query))

    bing = Bing(query, limit, output_dir, adult, timeout, no_directory)
    bing.run()
def download(query, limit=100, output_dir='dataset', adult_filter_off=True,
             force_replace=False, timeout=60, visited_urls=None,
             return_visited_url=False):
    # engine = 'bing'
    if visited_urls is None:  # avoid a shared mutable default argument
        visited_urls = {}
    if adult_filter_off:
        adult = 'off'
    else:
        adult = 'on'

    cwd = os.getcwd()
    image_dir = os.path.join(cwd, output_dir, query)

    if force_replace:
        if os.path.isdir(image_dir):
            shutil.rmtree(image_dir)

    # check directory and create if necessary
    try:
        if not os.path.isdir("{}/{}/".format(cwd, output_dir)):
            os.makedirs("{}/{}/".format(cwd, output_dir))
    except:
        pass
    if not os.path.isdir("{}/{}/{}".format(cwd, output_dir, query)):
        os.makedirs("{}/{}/{}".format(cwd, output_dir, query))

    bing = Bing(query, limit, output_dir, adult, timeout, visited_urls)
    bing.run()

    # added: return the dict of visited urls if the caller wants it
    if return_visited_url:
        return bing.visited_urls
def extract_features(extractor, img_idx):
    bing_params = bing_param_setting(bing_param_file)
    bing_detector = Bing(bing_params['w_1st'], bing_params['sizes'],
                         bing_params['w_2nd'],
                         num_bbs_per_size_1st_stage=bing_params["num_win_psz"],
                         num_bbs_final=bing_params["num_bbs"])
    pca = joblib.load("data/learned_PCA.pkl")

    relations = {}
    formatted_proposals = []
    indexes = []

    list_f = open(img_list_file)
    img_lst = list_f.read().split()
    img_lst = img_lst[img_idx - 200:img_idx]
    for img_name in img_lst:
        img_name = img_name.strip()
        if img_name == "" or img_name[-3:] != "jpg":
            continue
        img = os.path.join(data_dir, img_name)
        # k: number of region proposals
        proposals, rels = get_proposals(extractor, bing_detector, img, k=30)
        for idx in range(len(proposals[1])):
            indexes.append((img_name, idx))
        formatted_proposals.append(proposals)
        relations[img_name] = reduce_rel(rels)

    features = extractor.extract_features(formatted_proposals, layer='fc6')
    features = post_process(features, pca)

    f = open("data/features/%d.pkl" % img_idx, "wb")
    pickle.dump(features, f)
    f.close()
    f = open("data/indexes/%d.pkl" % img_idx, "wb")
    pickle.dump(indexes, f)
    f.close()
    f = open("data/relations/%d.pkl" % img_idx, "wb")
    pickle.dump(relations, f)
    f.close()
def bing(location, key='', proxies='', timeout=5.0):
    """
    Retrieves geocoding data from Bing's REST location API.

    >>> key = 'XXXXX'
    >>> g = geocoder.bing('Medina, Washington', key=key)
    >>> g.latlng
    (47.615821838378906, -122.23892211914062)
    >>> g.country
    'United States'
    ...

    Official Docs
    -------------
    http://msdn.microsoft.com/en-us/library/ff701714.aspx
    """
    provider = Bing(location, key=key)
    return Geocoder(provider, proxies=proxies, timeout=timeout)
def extract_features(extractor):
    bing_params = bing_param_setting(bing_param_file)
    bing_detector = Bing(bing_params['w_1st'], bing_params['sizes'],
                         bing_params['w_2nd'],
                         num_bbs_per_size_1st_stage=bing_params["num_win_psz"],
                         num_bbs_final=bing_params["num_bbs"])
    pca = joblib.load("data/learned_PCA.pkl")

    query_list = open(query_list_file, "r")
    query_dict = {}
    for q in query_list:
        query_name = q.strip()
        # compare the stripped name; the raw line still carries its newline
        if query_name == "":
            continue
        print query_name
        query, crop = ox5k_get_query(gt_dir, query_name)
        img = os.path.join(data_dir, query)
        proposals, rels = get_proposals(extractor, bing_detector, img, k=30,
                                        crop=crop)
        formatted_proposals = [proposals]
        features = extractor.extract_features(formatted_proposals, layer='fc6')
        features = post_process(features, pca)
        query_dict[query_name] = {}
        query_dict[query_name]["feature"] = features
        query_dict[query_name]["relation"] = reduce_rel(rels)
    query_list.close()

    f = open("data/query.pkl", "wb")
    pickle.dump(query_dict, f)
    f.close()
class EvaluateRecall(object):
    def __init__(self, w_1st, sizes_idx, w_2nd,
                 num_bbs_per_size_1st_stage=130, num_bbs_final=1500):
        self.w_1st = w_1st
        self.sizes_idx = sizes_idx
        self.w_2nd = w_2nd
        self.bing = Bing(w_1st, sizes_idx, w_2nd,
                         num_bbs_per_size_1st_stage=num_bbs_per_size_1st_stage,
                         num_bbs_final=num_bbs_final)

    def evaluate_test_set(self, test_annotations):
        tot_num_gt_bbs = 0
        print "Getting ground truth and predicted bounding boxes from testing images."
        images_bbs_dict = dict()
        for key in test_annotations.keys():
            ann_dict = test_annotations[key]
            fn = os.path.join(ann_dict["path"], ann_dict["basename"])
            img = cv2.imread(fn)
            if img is None:
                warnings.warn("The image %s does not exist in the filesystem." % fn)
            # collecting ground truth and predicted bounding boxes for each image
            bbs = ann_dict["bbs"]
            predicted_bbs, _ = self.bing.predict(img)
            tot_num_gt_bbs = tot_num_gt_bbs + len(bbs)
            images_bbs_dict[ann_dict["basename"]] = (bbs, predicted_bbs)

        print "Calculating the recall of predicted bounding boxes that overlap at least 50% with ground truth bounding boxes."
        overlaps_array = np.zeros(tot_num_gt_bbs)
        gt_bbs_idx = 0
        for img_bn in images_bbs_dict.keys():
            gt_bbs, predicted_bbs = images_bbs_dict[img_bn]
            for i, gt_bb in enumerate(gt_bbs):
                overlaps_array[gt_bbs_idx + i] = bounding_box_overlap(
                    predicted_bbs, bb_query=gt_bb["bb"])
            gt_bbs_idx = gt_bbs_idx + len(gt_bbs)

        detected = (overlaps_array > 0.5).astype(float)
        recall = np.sum(detected) / len(detected)
        return recall
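# A hedged usage sketch for EvaluateRecall above. w_1st, sizes_idx, and w_2nd
# are the learned BING weights (loaded elsewhere); test_annotations maps keys
# to dicts with "path", "basename", and "bbs" entries, as the method reads.
evaluator = EvaluateRecall(w_1st, sizes_idx, w_2nd)
recall = evaluator.evaluate_test_set(test_annotations)
print "recall at IoU > 0.5: %.3f" % recall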
def download(query, limit=100, adult_filter_off=True, force_replace=False,
             output_dir=None, timeout=30, page_counter_limit=5):
    engine = 'bing'
    if adult_filter_off:
        adult = 'off'
    else:
        adult = 'on'

    if output_dir is None:
        output_dir = os.path.join(os.getcwd(), 'dataset')
    query_dir = os.path.join(output_dir, query)

    if force_replace:
        if os.path.isdir(query_dir):
            shutil.rmtree(query_dir)

    # check output directory and create if necessary
    try:
        if not os.path.isdir(output_dir):
            os.makedirs(output_dir)
    except:
        pass

    # check query directory and create if necessary
    print('Query dir: {}'.format(query_dir))
    if not os.path.isdir(query_dir):
        os.makedirs(query_dir)

    # pass the parameter through instead of the hardcoded literal 5
    Bing().bing(query=query, limit=limit, adlt=adult, output_dir=query_dir,
                timeout=timeout, page_counter_limit=page_counter_limit)
def test_bing_scrape_search_result(self):
    with open('ipbing') as fp:
        bing_search_result = Bing.scrape_search_result(BeautifulSoup(fp))
    self.assertEqual(BING_SEARCH_RESULT, bing_search_result)
def _search(self, page_num):
    key = my_keys.MICROSOFT_API_KEY_2
    bing = Bing(key)
    items = bing.web_search(self.query, page_num, ['Title', 'Url', 'Description'])
    return items
import sys
sys.path.append('lib')
from bing import Bing

appID = 'Hko5cXg5U8h/WIE46pYQjmo/MLXNNkXYr+VXx/a66Ig'
bing = Bing(appID)
print bing.search(query='mootools', sources='web')
class Detector(caffe.Net):
    """
    Detector extends Net for windowed detection by a list of crops or
    selective search proposals.
    """
    def __init__(self, model_file, pretrained_file, gpu=False, mean=None,
                 input_scale=None, raw_scale=None, channel_swap=None,
                 context_pad=None, weights_1st_stage_bing=None,
                 sizes_idx_bing=None, weights_2nd_stage_bing=None,
                 num_bbs_psz_bing=130, num_bbs_final_bing=1500):
        """
        Take
        gpu, mean, input_scale, raw_scale, channel_swap: params for
            preprocessing options.
        context_pad: amount of surrounding context to take s.t. a
            `context_pad` sized border of pixels in the network input image
            is context, as in R-CNN feature extraction.
        """
        caffe.Net.__init__(self, model_file, pretrained_file)
        self.set_phase_test()

        if gpu:
            self.set_mode_gpu()
        else:
            self.set_mode_cpu()
        if mean is not None:
            self.set_mean(self.inputs[0], mean)
        if input_scale is not None:
            self.set_input_scale(self.inputs[0], input_scale)
        if raw_scale is not None:
            self.set_raw_scale(self.inputs[0], raw_scale)
        if channel_swap is not None:
            self.set_channel_swap(self.inputs[0], channel_swap)
        self.configure_crop(context_pad)

        if (bing_flag and weights_1st_stage_bing is not None
                and sizes_idx_bing is not None
                and weights_2nd_stage_bing is not None):
            self.bing = Bing(weights_1st_stage=weights_1st_stage_bing,
                             sizes_idx=sizes_idx_bing,
                             weights_2nd_stage=weights_2nd_stage_bing,
                             num_bbs_per_size_1st_stage=num_bbs_psz_bing,
                             num_bbs_final=num_bbs_final_bing)
        else:
            self.bing = None

    def detect_bing(self, image):
        assert self.bing is not None
        if not bing_flag:
            print "Bing detection invoked but error while importing bing module!"
            sys.exit(1)
        t0 = time.time()
        bbs, scores = self.bing.predict(image)
        t1 = time.time()
        print "Bing prediction: {0:.2f}s.".format(t1 - t0)
        images_windows = self.detect_windows(image, bbs)
        return self.get_predictions_from_cropped_images(images_windows)

    def detect_windows(self, image, bbs):
        """
        Do windowed detection over given images and windows. Windows are
        extracted then warped to the input dimensions of the net.

        Take
        images_windows: (image filename, window list) iterable.
        context_crop: size of context border to crop in pixels.

        Give
        detections: list of {filename: image filename, window: crop
            coordinates, predictions: prediction vector} dicts.
        """
        images_windows = []
        image_fl = img_as_float(image)
        t0 = time.time()
        for bb in bbs:
            # reorder to (ymin, xmin, ymax, xmax), as expected by crop()
            bb = np.array((bb[1], bb[0], bb[3], bb[2]))
            images_windows.append((self.crop(image_fl, bb), bb))
        t1 = time.time()
        print "Bounding boxes cropping: {0:.2f}s.".format(t1 - t0)
        return images_windows

    def get_predictions_from_cropped_images(self, images_windows):
        # Run through the net (warping windows to input dimensions).
        caffe_in = np.zeros(
            (len(images_windows), images_windows[0][0].shape[2])
            + self.blobs[self.inputs[0]].data.shape[2:],
            dtype=np.float32)
        bbs = []
        for ix, (window_in, bb) in enumerate(images_windows):
            caffe_in[ix] = self.preprocess(self.inputs[0], window_in)
            bbs.append(bb)
        out = self.forward_all(**{self.inputs[0]: caffe_in})
        predictions = out[self.outputs[0]].squeeze(axis=(2, 3))

        # Package predictions with images and windows.
        detections = []
        ix = 0
        for bb in bbs:
            detections.append({
                'window': bb,
                'prediction': predictions[ix],
            })
            ix += 1
        return detections, predictions

    def crop(self, im, window):
        """
        Crop a window from the image for detection. Include surrounding
        context according to the `context_pad` configuration.

        Take
        im: H x W x K image ndarray to crop.
        window: bounding box coordinates as ymin, xmin, ymax, xmax.

        Give
        crop: cropped window.
        """
        # Crop window from the image.
        crop = im[window[0]:window[2], window[1]:window[3]]

        if self.context_pad:
            box = window.copy()
            crop_size = self.blobs[self.inputs[0]].width  # assumes square
            scale = crop_size / (1. * crop_size - self.context_pad * 2)
            # Crop a box + surrounding context.
            half_h = (box[2] - box[0] + 1) / 2.
            half_w = (box[3] - box[1] + 1) / 2.
            center = (box[0] + half_h, box[1] + half_w)
            scaled_dims = scale * np.array((-half_h, -half_w, half_h, half_w))
            box = np.round(np.tile(center, 2) + scaled_dims)
            full_h = box[2] - box[0] + 1
            full_w = box[3] - box[1] + 1
            scale_h = crop_size / full_h
            scale_w = crop_size / full_w
            pad_y = round(max(0, -box[0]) * scale_h)  # amount out-of-bounds
            pad_x = round(max(0, -box[1]) * scale_w)

            # Clip box to image dimensions.
            im_h, im_w = im.shape[:2]
            box = np.clip(box, 0., [im_h, im_w, im_h, im_w])
            clip_h = box[2] - box[0] + 1
            clip_w = box[3] - box[1] + 1
            assert clip_h > 0 and clip_w > 0
            crop_h = round(clip_h * scale_h)
            crop_w = round(clip_w * scale_w)
            if pad_y + crop_h > crop_size:
                crop_h = crop_size - pad_y
            if pad_x + crop_w > crop_size:
                crop_w = crop_size - pad_x

            # Collect with context padding and place in input
            # with mean padding.
            context_crop = im[box[0]:box[2], box[1]:box[3]]
            context_crop = caffe.io.resize_image(context_crop, (crop_h, crop_w))
            crop = self.crop_mean.copy()
            crop[pad_y:(pad_y + crop_h), pad_x:(pad_x + crop_w)] = context_crop

        return crop

    def configure_crop(self, context_pad):
        """
        Configure amount of context for cropping. If context is included,
        make the special input mean for context padding.

        Take
        context_pad: amount of context for cropping.
        """
        self.context_pad = context_pad
        if self.context_pad:
            raw_scale = self.raw_scale.get(self.inputs[0])
            channel_order = self.channel_swap.get(self.inputs[0])
            # Padding context crops needs the mean in unprocessed input space.
            mean = self.mean.get(self.inputs[0])
            if mean is not None:
                crop_mean = mean.copy().transpose((1, 2, 0))
                if channel_order is not None:
                    channel_order_inverse = [
                        channel_order.index(i)
                        for i in range(crop_mean.shape[2])
                    ]
                    crop_mean = crop_mean[:, :, channel_order_inverse]
                if raw_scale is not None:
                    crop_mean /= raw_scale
                self.crop_mean = crop_mean
            else:
                self.crop_mean = np.zeros(
                    self.blobs[self.inputs[0]].data.shape, dtype=np.float32)
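# A hedged usage sketch for Detector.detect_bing above (old caffe / Python 2
# API). The prototxt, caffemodel, and image paths are placeholders, and the
# BING weights (w_1st, sizes_idx, w_2nd) are assumed to be loaded elsewhere.
import cv2

detector = Detector('deploy.prototxt', 'model.caffemodel', gpu=True,
                    context_pad=16,
                    weights_1st_stage_bing=w_1st,
                    sizes_idx_bing=sizes_idx,
                    weights_2nd_stage_bing=w_2nd)
image = cv2.imread('example.jpg')
detections, predictions = detector.detect_bing(image)
print "top window:", detections[0]['window']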
def test_bing_scrape_search_result(self):
    with open('ipbing') as fp:
        bing_search_result = Bing.scrape_search_result(
            BeautifulSoup(fp, 'html.parser'))
    self.assertEqual(BING_SEARCH_RESULT, bing_search_result)
# Ask and Baidu are instantiated below but were not imported; the module
# names are assumed to follow the same pattern as the other scrapers.
from ask import Ask
from baidu import Baidu
from bing import Bing
from dailymotion import Dailymotion
from duckduckgo import Duckduckgo
from exalead import Exalead
from google import Google
from mojeek import Mojeek
from parsijoo import Parsijoo
from quora import Quora
from yahoo import Yahoo
from yandex import Yandex
from youtube import Youtube

scrapers = {
    'ask': Ask(),
    'baidu': Baidu(),
    'bing': Bing(),
    'dailymotion': Dailymotion(),
    'duckduckgo': Duckduckgo(),
    'exalead': Exalead(),
    'google': Google(),
    'mojeek': Mojeek(),
    'parsijoo': Parsijoo(),
    'quora': Quora(),
    'yahoo': Yahoo(),
    'yandex': Yandex(),
    'youtube': Youtube()
}


def small_test():
    assert isinstance(scrapers['google'].search('fossasia'), list)
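# A hedged usage sketch for the scraper registry above; it assumes each
# scraper's search() returns a list of results, as small_test implies.
def search_with(engine_name, query):
    scraper = scrapers[engine_name]
    return scraper.search(query)

results = search_with('bing', 'fossasia')
print(len(results), 'results')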
from flask import Flask, url_for, render_template, redirect, session, request
from app import app
from bing import Bing
import os

bing = Bing(None)
bing.toggle()


@app.route('/search', methods=["GET"])
def search():
    query = request.args.get("q")
    if not query:
        # the redirect response must be returned, not discarded
        return redirect('/index')
    bing.query = query
    images = bing.get_images()
    return render_template("search.html", images=images,
                           cache_bust=os.path.getmtime("app/static/style.css"))


@app.route('/index', methods=["GET"])
@app.route('/', methods=["GET"])
def index():
    return render_template("search.html", images=[],
                           cache_bust=os.path.getmtime("app/static/style.css"))
def run1(self):
    s = Bing.search(self.search_term)
    return s
def test_bing_scrape_news_result(self):
    with open('ipbingnews') as fp:
        bing_news_result = Bing.scrape_news_result(
            BeautifulSoup(fp, 'html.parser'))
    self.assertEqual(BING_NEWS_RESULT, bing_news_result)
def test_bing_scrape_news_result(self):
    with open('ipbingnews') as fp:
        bing_news_result = Bing.scrape_news_result(BeautifulSoup(fp))
    self.assertEqual(BING_NEWS_RESULT, bing_news_result)
class bing_cluster:
    def __init__(self, cluster_num=10, top_k=10, max_ratio=4, min_size=100):
        logging.info("Init the bing and cluster parameter")
        self.cluster_num = cluster_num
        self.top_k = top_k
        self.max_ratio = max_ratio
        self.min_size = min_size
        self.spectral = cluster.SpectralClustering(
            n_clusters=self.cluster_num, affinity='precomputed')

    def load_bing_model(self, model_file=DEFAULT_MODEL_FILE):
        logging.info("Load Bing Model ...")
        self.bing = Bing(2, 8, 2)
        self.bing.loadTrainModel(model_file)

    def get_bing_of_image(self, image_filename, numPerSz=130):
        boxes = self.bing.getBoxesOfOneImage(image_filename, numPerSz)
        ymins = [s for s in boxes.ymins()]
        ymaxs = [s for s in boxes.ymaxs()]
        xmins = [s for s in boxes.xmins()]
        xmaxs = [s for s in boxes.xmaxs()]
        bing_windows = pd.DataFrame({
            'ymin': ymins,
            'xmin': xmins,
            'ymax': ymaxs,
            'xmax': xmaxs
        })
        return bing_windows

    def get_iou_distance_matrix(self, bing_windows):
        window_size = bing_windows.shape[0]
        y1 = bing_windows["ymin"].values
        x1 = bing_windows["xmin"].values
        y2 = bing_windows["ymax"].values
        x2 = bing_windows["xmax"].values
        w = x2 - x1
        h = y2 - y1
        area = (w * h).astype(float)

        distances = np.zeros((window_size, window_size))
        for i in range(window_size):
            xx1 = np.maximum(x1[i], x1)
            yy1 = np.maximum(y1[i], y1)
            xx2 = np.minimum(x2[i], x2)
            yy2 = np.minimum(y2[i], y2)
            w = np.maximum(0., xx2 - xx1)
            h = np.maximum(0., yy2 - yy1)
            wh = w * h
            distances[i] = wh / (area[i] + area - wh)
        return distances

    def cluster_boxes(self, bing_windows):
        starttime = time.time()
        distance_matrix = self.get_iou_distance_matrix(bing_windows)
        self.spectral.fit(distance_matrix)

        # get top of each cluster
        window_size = bing_windows.shape[0]
        y1 = bing_windows["ymin"].values
        x1 = bing_windows["xmin"].values
        y2 = bing_windows["ymax"].values
        x2 = bing_windows["xmax"].values
        w = x2 - x1
        h = y2 - y1
        area = (w * h).astype(float)

        index_dictionary = {}
        for i in range(window_size):
            #if(area[i]<self.min_size):
            #    continue
            #if(w[i]*1.0/h[i]>self.max_ratio or h[i]*1.0/w[i]>self.max_ratio):
            #    continue
            label = self.spectral.labels_[i]
            if label not in index_dictionary:
                index_dictionary[label] = []
            if len(index_dictionary[label]) >= self.top_k:
                continue
            index_dictionary[label].append(i)

        # interleave the clusters: take one window from each in turn
        index_list = []
        #for key in index_dictionary:
        #    index_list.extend(index_dictionary[key])
        while True:
            empty = True
            for key in index_dictionary:
                one_list = index_dictionary[key]
                if len(one_list):
                    empty = False
                    index_list.append(one_list.pop(0))
            if empty:
                break

        bing_windows = pd.DataFrame({
            "ymin": y1[index_list],
            "xmin": x1[index_list],
            "ymax": y2[index_list],
            "xmax": x2[index_list]
        })
        endtime = time.time()
        #logging.info("Cluster spend {:.3f}".format(endtime-starttime))
        return bing_windows
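# A hedged usage sketch of the bing_cluster pipeline above: load the BING
# model, propose windows for one image, then spectrally cluster them by IoU.
# DEFAULT_MODEL_FILE comes from the module; the image path is a placeholder.
bc = bing_cluster(cluster_num=10, top_k=10)
bc.load_bing_model()
windows = bc.get_bing_of_image('example.jpg', numPerSz=130)
kept = bc.cluster_boxes(windows)
print(kept.head())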
from __future__ import print_function
import os, json, sys

from google import Google
from duckduckgo import Duckduckgo
from bing import Bing
from yahoo import Yahoo

scrapers = {
    'g': Google(),
    'b': Bing(),
    'y': Yahoo(),
    'd': Duckduckgo(),
}


def read_in():
    lines = sys.stdin.readlines()
    return json.loads(lines[0])


def small_test():
    # scrapers is a dict keyed by single letters, so index it rather than
    # using attribute access
    assert isinstance(scrapers['g'].results_search('fossasia'), list)


def feedgen(query, engine):
    urls = scrapers[engine].results_search(query)
    result = urls
    print(result)
    print(len(result))
    return result
def create_directory_with_files_from_internet(target, query,
                                              file_types=[Bing.TXT_FILE_TYPE],
                                              dir_prefix='folder', levels=1,
                                              num_dir_per_level_created=1,
                                              num_files_per_dir_created=1,
                                              verbose=False, delay=False):
    """
    Create a hierarchy of directories with files downloaded from the
    Internet, based on a query.
    :param target: target directory (root of the created hierarchy)
    :param query: query for retrieving the files
    :param file_types: a list of file types to download (splits the total
        number of files approx. equally between all file types)
    :param dir_prefix: prefix for created directory names
    :param levels: the depth of the directory hierarchy created
    :param num_dir_per_level_created: number of directories for each level in
        the hierarchy (same for all levels)
    :param num_files_per_dir_created: number of files in each directory
        created (same for all directories)
    :param verbose: print to console a message for each directory and file
        downloaded
    :param delay: not used
    :return: None
    """
    if not levels or int(levels) <= 0:
        return
    if not num_dir_per_level_created or int(num_dir_per_level_created) <= 0:
        return
    if not num_files_per_dir_created or int(num_files_per_dir_created) <= 0:
        return

    # create target directory if it does not exist
    if not os.path.exists(target):
        os.makedirs(target)

    # make the naming pattern for files
    _, files_count_per_dir, total_files = make_file_pattern(
        '', '', levels, num_dir_per_level_created, num_files_per_dir_created)
    # make the naming pattern for directories
    dir_name_pattern, dirs_count_per_level, total_dirs = make_dir_pattern(
        dir_prefix, levels, num_dir_per_level_created)

    if verbose:
        print 'creating directories (%d) and files (%d)' % (total_dirs, total_files)

    # recursive function to build each level of directories and files
    def create_level(current_target, urls, from_to_list, _num_levels,
                     _current_level, _num_dirs_per_level, _num_files_per_dir,
                     _current_dirs_count, show_details=None):
        _dirs_count = _current_dirs_count
        if _current_level <= 0:
            return _dirs_count
        for i in range(_num_dirs_per_level):
            dir_name = dir_name_pattern % (_num_levels - _current_level, i)
            dir_path = os.path.join(current_target, dir_name)
            # create directory
            try:
                os.makedirs(dir_path)
                _dirs_count += 1
                # num_operations += 1
                if show_details:
                    show_details(dir_path)
                # prepare the files download list (use a separate loop
                # variable so the directory index i is not shadowed)
                src_list = []
                for _ in range(_num_files_per_dir):
                    src_list.append(urls.pop(0))
                from_to_list.append({'src': src_list, 'dst': dir_path})
                # recurse into the next level
                _dirs_count = create_level(dir_path, urls, from_to_list,
                                           _num_levels, _current_level - 1,
                                           _num_dirs_per_level,
                                           _num_files_per_dir, _dirs_count,
                                           show_details=show_details)
            except:
                pass
        return _dirs_count

    bing = Bing(KEY)
    urls = []

    # split the total number of files approximately evenly between file types
    files_per_type = total_files / len(file_types)
    remainder = total_files - files_per_type * len(file_types)
    file_tuples = []
    for ft in file_types:
        num = files_per_type + 1 if remainder > 0 else files_per_type
        file_tuples.append((ft, num))
        remainder -= 1

    result_total = 0
    for file_tuple in file_tuples:
        result_per_file_type = 0
        for url in bing.get_files(query, file_tuple[0], file_tuple[1]):
            urls.append(url)
            result_per_file_type += 1
        print 'found %d (%d) results for %s type' % (
            result_per_file_type, file_tuple[1], file_tuple[0])
        result_total += result_per_file_type
    print 'found %d results for querying "%s"' % (result_total, query)

    show_details = print_file_details if verbose else None

    # create the directory structure and prepare the list of files to download
    from_to_list = []
    d = create_level(target, urls, from_to_list, levels, levels,
                     num_dir_per_level_created, num_files_per_dir_created, 0,
                     show_details=show_details)

    show_download_details = print_file_download_details if verbose else None
    # parallel downloading of files into their respective directories
    f = bing.execute2(from_to_list, display=show_download_details)

    global num_operations
    num_operations = d + f
    if verbose:
        print 'created %d directories with %d files' % (d, f)
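# A hedged usage sketch for the directory builder above, assuming KEY is
# configured in the module; the target path and query are illustrative only.
create_directory_with_files_from_internet(
    target='out', query='climate report',
    file_types=[Bing.TXT_FILE_TYPE], dir_prefix='folder',
    levels=2, num_dir_per_level_created=3, num_files_per_dir_created=2,
    verbose=True)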
from bs4 import BeautifulSoup as BS
import logging
import urllib, urllib2
import requests
import ssl
import chardet
import re

from geolcation import Geolocation
from YoutubeSearch import YtubeSearch
from bing import Bing

b = Bing()
print b.searchWeb('akb', {'location': 'JP'})

#geo = Geolocation()
#print geo.lookup('129.97.224.225')

'''
re.compile = r'/\[(.*?)\]/'
url1 = "http://en.wikipedia.org/wiki/Tom_cruise"
url2 = "http://en.wikipedia.org/wiki/Cat"
#result = requests.get(url2)
f = open('fake.html')
soup = BS(f)
cont = str(soup.p)
print type(cont)
print type(u'abc')
print type(u'abc'.encode('ascii'))
print type(u'abc'+'abc')
soup = BS(result.text)
ps = []
content = ''