def ofa_specialized(net_id, pretrained=True):
    url_base = 'https://hanlab.mit.edu/files/OnceForAll/ofa_specialized/'
    net_config = json.load(
        open(
            download_url(url_base + net_id + '/net.config',
                         model_dir='.torch/ofa_specialized/%s/' % net_id)))
    if net_config['name'] == ProxylessNASNets.__name__:
        net = ProxylessNASNets.build_from_config(net_config)
    elif net_config['name'] == MobileNetV3.__name__:
        net = MobileNetV3.build_from_config(net_config)
    else:
        raise ValueError('Not supported network type: %s' % net_config['name'])

    image_size = json.load(
        open(
            download_url(url_base + net_id + '/run.config',
                         model_dir='.torch/ofa_specialized/%s/' % net_id)))['image_size']

    if pretrained:
        init = torch.load(
            download_url(url_base + net_id + '/init',
                         model_dir='.torch/ofa_specialized/%s/' % net_id),
            map_location='cpu')['state_dict']
        net.load_state_dict(init)
    return net, image_size
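# Hedged usage sketch (not part of the original source): load a specialized
# subnet by ID and run a dummy forward pass at the expected resolution.
# The net_id string below is hypothetical; real IDs are published with the
# Once-for-All repository.
import torch

net, image_size = ofa_specialized('pixel1_lat@20ms_top1@71.4_finetune@25', pretrained=True)
net.eval()
with torch.no_grad():
    out = net(torch.randn(1, 3, image_size, image_size))
print(out.shape)  # (1, num_classes)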
def __init__(self, pkl_path=None, from_scratch=False, dim=(512, 512)):
    self.pkl_path = pkl_path
    self.dim = dim
    if self.pkl_path is None:
        ffhq_pkl = 'stylegan2-ffhq-config-f.pkl'
        ffhq_url = f'http://d36zk2xti64re0.cloudfront.net/stylegan2/networks/{ffhq_pkl}'
        empty_pkl = create_model(height=dim[0], width=dim[1])
        if from_scratch:
            self.pkl_path = empty_pkl
        else:
            if not os.path.exists(ffhq_pkl):
                download_url(ffhq_url, ffhq_pkl)
            self.pkl_path = 'surgery.pkl'
            copy_weights(ffhq_pkl, empty_pkl, self.pkl_path)

    dnnlib.tflib.init_tf()
    print('Loading networks from "%s"...' % self.pkl_path)
    with dnnlib.util.open_url(self.pkl_path) as fp:
        self._G, self._D, self.Gs = pickle.load(fp)
    self.noise_vars = [
        var for name, var in self.Gs.components.synthesis.vars.items()
        if name.startswith('noise')
    ]
def main():
    excluded_venues = get_excluded_venue_ids(download_url(VENUES_URL))
    for row in process(
            download_url(EVENTS_URL), excluded_venue_ids=excluded_venues):
        yield row
def get_data():
    if not os.path.isfile(os.path.join(data_dir, "PROTEINS.zip")):
        # needs download
        url = 'https://ls11-www.cs.tu-dortmund.de/people/morris/graphkerneldatasets/PROTEINS.zip'
        save_path = os.path.join(data_dir, 'PROTEINS.zip')
        utils.download_url(url, save_path)
        utils.unzip_file(os.path.join(data_dir, "PROTEINS.zip"))
def download(self):
    if self._check_datafile_exists():
        print('# Found cached data {}, {}'.format(self.images_file, self.idx_file))
        return

    if not self._check_downloaded():
        # download files
        url = self.urls[self.name][0]
        filename = self.urls[self.name][1]
        md5 = self.urls[self.name][2]
        fpath = os.path.join(self.root, filename)

        download_url(url, self.root, filename, md5)

        print('# Extracting data {}\n'.format(self.data_down))
        import zipfile
        with zipfile.ZipFile(fpath, 'r') as z:
            z.extractall(self.data_dir)
        os.unlink(fpath)

    # process and save as torch files
    print('# Caching data')
    images = read_image_file(self.data_dir, self.image_ext, self.lens[self.name])
    points = read_info_file(self.data_dir, self.info_file)
    # refImg = read_interest_file(self.data_dir, self.interest_file)

    print('# Formatting data')
    # print(images.shape, len(points))
    idx = []
    i = 0
    last = len(images)
    min_len = 100
    while i < last:
        point = points[i]
        # print(i, last, point, points[i])
        one_point = []
        while i < last and points[i] == point:
            one_point.append(i)
            i += 1
        # print(len(one_point))
        if min_len > len(one_point):
            min_len = len(one_point)
        idx.append(one_point)
    print("minimal number of patches:", min_len)

    print("Saving to file")
    with open(self.images_file, 'wb') as f:
        torch.save(images, f)
    # print("Idx length:", len(idx))
    with open(self.idx_file, 'wb') as f:
        pkl.dump(idx, f)
    print("Saved")
def download_unity_launcher(type):
    if not os.path.exists(".tmp"):
        os.mkdir(".tmp")
    download_url = artifactory_download_url
    if 'USE_UBERBUCKET' in os.environ:
        download_url = bokken_artifactory_cache_url
    utils.download_url("%s/tools/unity-launcher/UnityLauncher.%s.zip" % (download_url, type),
                       ".tmp/UnityLauncher.%s.zip" % type)
def __init__(self, model_path):
    self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
    if not os.path.exists(model_path):
        print('Downloading semantic segmentation model')
        download_url(MODEL_URL, model_path)
    self.model = torch.load(model_path).to(self.device)
def download(self):
    """Download the MNIST data if it doesn't exist in processed_folder already."""
    from six.moves import urllib
    import gzip

    if self._check_exists():
        return

    # download files
    try:
        os.makedirs(os.path.join(self.root, self.raw_folder))
        os.makedirs(os.path.join(self.root, self.processed_folder))
    except OSError as e:
        if e.errno == errno.EEXIST:
            pass
        else:
            raise

    for url in self.urls:
        filename = url.rpartition('/')[2]
        file_path = os.path.join(self.root, self.raw_folder, filename)
        download_url(url, root=os.path.join(self.root, self.raw_folder),
                     filename=filename, md5=None)
        with open(file_path.replace('.gz', ''), 'wb') as out_f, \
                gzip.GzipFile(file_path) as zip_f:
            out_f.write(zip_f.read())
        os.unlink(file_path)

    # process and save as torch files
    print('Processing...')

    training_set = (
        read_image_file(os.path.join(self.root, self.raw_folder, 'train-images-idx3-ubyte')),
        read_label_file(os.path.join(self.root, self.raw_folder, 'train-labels-idx1-ubyte'))
    )
    test_set = (
        read_image_file(os.path.join(self.root, self.raw_folder, 't10k-images-idx3-ubyte')),
        read_label_file(os.path.join(self.root, self.raw_folder, 't10k-labels-idx1-ubyte'))
    )

    indixes_train = np.argwhere(
        np.apply_along_axis(lambda x: x[0] in self.class_nums, 1,
                            np.array(training_set[1]).reshape(-1, 1)) == 1).reshape(-1)
    indixes_test = np.argwhere(
        np.apply_along_axis(lambda x: x[0] in self.class_nums, 1,
                            np.array(test_set[1]).reshape(-1, 1)) == 1).reshape(-1)

    if len(self.class_nums) == 2:
        nums = list(self.class_nums)
        training_set[1][indixes_train] = torch.LongTensor(
            np.where(training_set[1][indixes_train] == nums[0], -1, 1))
        test_set[1][indixes_test] = torch.LongTensor(
            np.where(test_set[1][indixes_test] == nums[0], -1, 1))

    training_set = (training_set[0][indixes_train], training_set[1][indixes_train])
    test_set = (test_set[0][indixes_test], test_set[1][indixes_test])

    with open(os.path.join(self.root, self.processed_folder, self.training_file), 'wb') as f:
        torch.save(training_set, f)
    with open(os.path.join(self.root, self.processed_folder, self.test_file), 'wb') as f:
        torch.save(test_set, f)

    print('Done!')
def load_broad_repurposing_hub(path='./data'):
    url = 'https://dataverse.harvard.edu/api/access/datafile/4159648'
    if not os.path.exists(path):
        os.makedirs(path)
    download_path = os.path.join(path, 'broad.tab')
    download_url(url, download_path)
    df = pd.read_csv(download_path, sep='\t')
    df = df.fillna('UNK')
    return df.smiles.values, df.title.values, df.cid.values.astype(str)
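# Hedged usage sketch (not part of the original source): the loader returns
# parallel arrays of SMILES strings, drug names, and PubChem CIDs.
smiles, names, cids = load_broad_repurposing_hub('./data')
print(len(smiles), smiles[0], names[0], cids[0])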
def download(self, url=None, dest=None):
    if url:
        if not dest:
            dest = os.path.basename(url)
        print "\nDownloading build...\n"
        download_url(url, dest)  # see utils module
        self.dest = dest
        return True
    else:
        return False
def load_IC50_1000_Samples(path='./data', n=100):
    print('Downloading...')
    url = 'https://dataverse.harvard.edu/api/access/datafile/4159681'
    if not os.path.exists(path):
        os.makedirs(path)
    download_path = os.path.join(path, 'IC50_samples.csv')
    download_url(url, download_path)
    df = pd.read_csv(download_path).sample(
        n=n, replace=False).reset_index(drop=True)
    return df['Target Sequence'].values, df['SMILES'].values
def get_covid_data(config, country_iso3, input_dir):
    # download covid data from HDX
    logger.info(f'Getting COVID data for {country_iso3}')
    download_dir = os.path.join(input_dir, COVID_DIR)
    Path(download_dir).mkdir(parents=True, exist_ok=True)
    covid_filename = os.path.join(download_dir, config['filename'])
    try:
        utils.download_url(config['url'], covid_filename)
    except Exception:
        logger.info(f'Cannot get COVID file for {country_iso3}')
def download(self):
    """Download the MNIST data if it doesn't exist in processed_folder already."""
    from six.moves import urllib
    import gzip

    if self._check_exists():
        return

    # download files
    try:
        os.makedirs(os.path.join(self.root, self.raw_folder))
        os.makedirs(os.path.join(self.root, self.processed_folder))
    except OSError as e:
        if e.errno == errno.EEXIST:
            pass
        else:
            raise

    for url in self.urls:
        filename = url.rpartition('/')[2]
        file_path = os.path.join(self.root, self.raw_folder, filename)
        download_url(url, root=os.path.join(self.root, self.raw_folder),
                     filename=filename, md5=None)
        with open(file_path.replace('.gz', ''), 'wb') as out_f, \
                gzip.GzipFile(file_path) as zip_f:
            out_f.write(zip_f.read())
        os.unlink(file_path)

    # process and save as torch files
    print('Processing...')

    training_set = (
        read_image_file(os.path.join(self.root, self.raw_folder, 'train-images-idx3-ubyte')),
        read_label_file(os.path.join(self.root, self.raw_folder, 'train-labels-idx1-ubyte')))
    test_set = (
        read_image_file(os.path.join(self.root, self.raw_folder, 't10k-images-idx3-ubyte')),
        read_label_file(os.path.join(self.root, self.raw_folder, 't10k-labels-idx1-ubyte')))

    with open(os.path.join(self.root, self.processed_folder, self.training_file), 'wb') as f:
        torch.save(training_set, f)
    with open(os.path.join(self.root, self.processed_folder, self.test_file), 'wb') as f:
        torch.save(test_set, f)

    print('Done!')
def download(self, date=datetime.date.today(), dest=None):
    url = self.getBuildUrl(date)
    if url:
        if not dest:
            dest = os.path.basename(url)
        print "\nDownloading nightly from " + str(date) + "\n"
        download_url(url, dest)
        self.dest = dest
        return True
    else:
        return False
def load_antiviral_drugs(path='./data', no_cid=False):
    url = 'https://dataverse.harvard.edu/api/access/datafile/4159652'
    if not os.path.exists(path):
        os.makedirs(path)
    download_path = os.path.join(path, 'antiviral_drugs.tab')
    download_url(url, download_path)
    df = pd.read_csv(download_path, sep='\t')
    if no_cid:
        return df.SMILES.values, df[' Name'].values
    else:
        return df.SMILES.values, df[' Name'].values, df['Pubchem CID'].values
def download(self, date=datetime.date.today(), dest=None):
    url = self.getBuildUrl(date)
    if url:
        if not dest:
            dest = os.path.basename(url)
        print "\nDownloading nightly...\n"  # TODO: doesn't belong here
        download_url(url, dest)
        self.dest = dest
        return True
    else:
        return False
def download(self, date=datetime.date.today(), dest=None):
    url = self.getBuildUrl(date)
    if url:
        if not dest:
            dest = os.path.basename(url)
        print "Downloading nightly from %s" % date
        self.remove_lastdest()
        download_url(url, dest)
        self.dest = self.lastdest = dest
        return True
    else:
        return False
def download(self):
    import tarfile

    if self._check_integrity():
        print('Files already downloaded and verified')
        return

    download_url(self.url, self.root, self.filename, self.tgz_md5)

    # extract file
    with tarfile.open(os.path.join(self.root, self.filename), "r:gz") as tar:
        tar.extractall(path=self.root)
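# Hedged sketch (an assumption, not the original class's helper): the
# _check_integrity call above typically boils down to an MD5 check of the
# cached archive, roughly like this standalone function.
import hashlib
import os

def check_integrity(fpath, md5):
    """Return True if fpath exists and matches the expected MD5 hex digest."""
    if not os.path.isfile(fpath):
        return False
    hasher = hashlib.md5()
    with open(fpath, 'rb') as f:
        for chunk in iter(lambda: f.read(1024 * 1024), b''):
            hasher.update(chunk)
    return hasher.hexdigest() == md5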
def test(self):
    url = "https://www.kaggle.com/account/login?ReturnUrl=/c/dstl-satellite-imagery-feature-detection/download/"
    filename = "sample_submission.csv.zip"
    expected_size = 15246  # bytes, as reported by os.path.getsize
    filepath = os.path.join(os.getcwd(), filename)
    if os.path.exists(filepath):
        os.remove(filepath)
    utils.download_url(url + filename)
    self.assertTrue(os.path.exists(filepath))
    self.assertEqual(expected_size, os.path.getsize(filepath))
def download(self, date=datetime.date.today(), dest=None):
    url = self.getBuildUrl(date)
    if url:
        if not dest:
            dest = os.path.basename(url)
        print "Downloading nightly from %s" % date
        if self.lastdest:
            os.remove(self.lastdest)
        download_url(url, dest)
        self.dest = self.lastdest = dest
        return True
    else:
        return False
def download(self, date=datetime.date.today(), dest=None):
    url = self.getBuildUrl(date)
    if url:
        if not dest:
            dest = self.get_destination(url, date)
        if not self.persist:
            self.remove_lastdest()
        self.dest = self.lastdest = dest
        download_url(url, dest)
        return True
    else:
        return False
def download(self):
    import zipfile

    if self._check_integrity():
        print('Files already downloaded and verified')
        return

    filename = self._get_target_folder()
    zip_filename = filename + '.zip'
    url = self.download_url_prefix + '/' + zip_filename
    download_url(url, self.root, zip_filename, self.zips_md5[filename])

    print('Extracting downloaded file: ' + join(self.root, zip_filename))
    with zipfile.ZipFile(join(self.root, zip_filename), 'r') as zip_file:
        zip_file.extractall(self.root)
def run(self):
    for ii, filename in enumerate(self.output()):
        filename.makedirs()
        url = f'https://www.futhead.com/18/nations/?page={ii+1}'
        page = utils.download_url(url)
        with open(filename.path, 'wb') as outfile:
            outfile.write(page)
def run(self):
    url = 'https://fixturedownload.com/download/fifa-world-cup-2018-RussianStandardTime.csv'
    page = utils.download_url(url)
    self.output().makedirs()
    filename = self.output().path
    with open(filename, 'wb') as outfile:
        outfile.write(page)
def run(self):
    url = f'http://www.football-data.co.uk/mmz4281/{self.season}/{self.league}.csv'
    page = utils.download_url(url)
    self.output().makedirs()
    filename = self.output().path
    with open(filename, 'wb') as outfile:
        outfile.write(page)
def run(self):
    base_url = 'https://www.fifaindex.com/teams/{}/?type=1'
    for ii, outpath in enumerate(self.output()):
        url = base_url.format(ii + 1)
        page = utils.download_url(url)
        outpath.makedirs()
        with open(outpath.path, 'wb') as f:
            f.write(page)
def run(self):
    url = 'https://www.oddschecker.com/football/world-cup#outrights'
    page = utils.download_url(url)
    self.output().makedirs()
    filename = self.output().path
    with open(filename, 'wb') as outfile:
        outfile.write(page)
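# The luigi-style tasks above treat utils.download_url as returning the raw
# response body rather than writing to disk. A minimal sketch of such a
# helper, assuming the requests library is available (this is an illustration,
# not the project's actual utils module):
import requests

def download_url(url, timeout=30):
    """Fetch a URL and return the response body as bytes."""
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.content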
def download(self):
    import tarfile

    if self._check_integrity():
        print('Files already downloaded and verified')
        return

    root = self.root
    download_url(self.url, root, self.filename, self.tgz_md5)

    # extract file
    cwd = os.getcwd()
    tar = tarfile.open(os.path.join(root, self.filename), "r:gz")
    os.chdir(root)
    tar.extractall()
    tar.close()
    os.chdir(cwd)
def get_input_file_path(self):
    if not self.input_file_path:
        remote_path = get_signed_url(self.resource_path, self.bucket)
        self.input_file_path = download_url(
            remote_path,
            app.config['UPLOAD_FOLDER'],
            target_filename=os.path.basename(self.resource_path),
            timestamp=True)
    return self.input_file_path
def locate_links(self):
    """download all torrents and get xpaths of torrent links"""
    links = {}
    loop = 0
    for url in self.urls_torrent_page:
        self.logger.info("Buscando enlaces en %s [%d/%d]" % (url, loop, len(self.urls_torrent_page)))
        loop += 1
        xpath = XPath(url)
        if xpath:
            data = {}
            # extract data that also comes inside the torrent file itself
            for k in ['size', 'infohash', 'title']:
                if k in self.metas:
                    xps = self.metas[k]['all']
                    sorted_xp = reversed(sorted(xps.iteritems(), key=operator.itemgetter(1)))
                    for xp_tuple in sorted_xp:
                        xp = xp_tuple[0]
                        extract = xpath.extract(xp)
                        rt = is_valid_meta(extract, k)
                        if rt:
                            data[k] = extract if k == "title" else rt
                            break
            # and make sure they match
            for url_torrent, xp in xpath.get_xpath_torrents().items():
                url_torrent = urljoin(self.base_url, url_torrent)
                tr = download_url(url_torrent, force=True)
                if tr:
                    try:
                        info = torrent_info(tr)
                    except:
                        continue
                    if is_same_torrent(data, info):
                        if not xp in links:
                            links[xp] = 0
                        links[xp] += 1
            data[url] = links
    self.metas['links'] = {}
    self.metas['links']['all'] = {k: v for k, v in links.items()
                                  if v > (max(v for v in links.values()) / 3)}
def locate_torrent_pages(self):
    # walk the site looking for the torrent detail pages
    while len(self.urls_torrent_page) < self.num_candidates and self.urls_to_see:
        url = self.urls_to_see.pop()
        if not url.startswith("http"):
            url = "%s%s%s" % (self.base_url, "" if url.startswith("/") else "/", url)
        if url in self.urls_visited:
            continue
        self.logger.info(u"Recorriendo %s" % url)
        html = download_url(url)
        self.add_urls(html)
        if self.is_torrent_page(html, url):
            self.urls_torrent_page[url] = 1
            self.logger.info("torrent page localizada (%d)" % len(self.urls_torrent_page))
        self.urls_visited.append(url)
        if len(self.urls_visited) % 100 == 0:
            self.logger.info("%d pages visited, %d torrents page located"
                             % (len(self.urls_visited), len(self.urls_torrent_page)))
            if len(self.urls_visited) > 20000 or (
                    len(self.urls_visited) > 1000
                    and len(self.urls_torrent_page) < (len(self.urls_visited) / 1000)):
                return False
        # go slowly to avoid bans
        time.sleep(3)
    if len(self.urls_torrent_page) < self.num_candidates:
        self.logger.info(len(self.urls_torrent_page))
        self.logger.info(self.urls_to_see)
        return False
    if not self.all_torrent_pages_ok():
        self.locate_torrent_pages()
    # save to db
    self.db_conn.torrents.domain.update(
        {"_id": self.get_id()},
        {"$set": {"tp": self.urls_torrent_page.keys()}},
        upsert=True)
    return self.urls_torrent_page
def download(self):
    """Download the MNIST data if it doesn't exist in processed_folder already."""
    if self._check_exists():
        return

    makedir_exist_ok(self.raw_folder)
    makedir_exist_ok(self.processed_folder)

    # download files
    for url in self.urls:
        filename = url.rpartition('/')[2]
        file_path = os.path.join(self.raw_folder, filename)
        download_url(url, root=self.raw_folder, filename=filename, md5=None)
        self.extract_gzip(gzip_path=file_path, remove_finished=True)
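# Most of the dataset snippets above call a torchvision-style
# download_url(url, root, filename, md5) helper that writes the file to disk
# and optionally verifies a checksum. A minimal sketch of such a helper,
# under the assumption of a plain urllib download (not the actual
# torchvision implementation):
import hashlib
import os
import urllib.request

def download_url(url, root, filename=None, md5=None):
    """Download url into root/filename and optionally verify its MD5."""
    os.makedirs(root, exist_ok=True)
    filename = filename or os.path.basename(url)
    fpath = os.path.join(root, filename)
    if not os.path.isfile(fpath):
        urllib.request.urlretrieve(url, fpath)
    if md5 is not None:
        with open(fpath, 'rb') as f:
            if hashlib.md5(f.read()).hexdigest() != md5:
                raise RuntimeError('MD5 mismatch for ' + fpath)
    return fpath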
def run(self):
    fifa_season = utils.translate_season_to_fifa(self.season)
    league_int = utils.translate_league(self.league)
    url = f'https://www.fifaindex.com/teams/{fifa_season}_{self.match_day}/?league={league_int}'
    outpath = self.output()
    page = utils.download_url(url)
    outpath.makedirs()
    with open(outpath.path, 'wb') as f:
        f.write(page)
def __init__(self, pkl_path=None, from_scratch=False, dim=(512, 512),
             from_dir=None, cond=False, label_size=0):
    self.pkl_path = pkl_path
    self.dim = dim
    if self.pkl_path is None and from_dir is None:
        ffhq_pkl = 'stylegan2-ffhq-config-f.pkl'
        ffhq_url = f'http://d36zk2xti64re0.cloudfront.net/stylegan2/networks/{ffhq_pkl}'
        empty_pkl = create_model(height=dim[0], width=dim[1], cond=cond, label_size=label_size)
        if from_scratch:
            self.pkl_path = empty_pkl
        else:
            if not os.path.exists(ffhq_pkl):
                download_url(ffhq_url, ffhq_pkl)
            self.pkl_path = 'surgery.pkl'
            copy_weights(ffhq_pkl, empty_pkl, self.pkl_path)

    if from_dir:
        curr_best = 0
        for pkl_file in glob.glob(f'{from_dir}/*.pkl'):
            ckpt_number = int(pkl_file.split('-')[-1][:-4])
            if curr_best < ckpt_number:
                curr_best = ckpt_number
                self.pkl_path = pkl_file

    dnnlib.tflib.init_tf()
    print('Loading networks from "%s"...' % self.pkl_path)
    with dnnlib.util.open_url(self.pkl_path) as fp:
        self._G, self._D, self.Gs = pickle.load(fp)
    self.noise_vars = [
        var for name, var in self.Gs.components.synthesis.vars.items()
        if name.startswith('noise')
    ]
def download(self) -> None:
    """Download the QMNIST data if it doesn't exist in processed_folder already.

    Note that we only download what has been asked for (argument 'what').
    """
    if self._check_exists():
        return

    os.makedirs(self.raw_folder, exist_ok=True)
    os.makedirs(self.processed_folder, exist_ok=True)
    split = self.resources[self.subsets[self.what]]

    files = []
    # download data files if not already there
    for url, md5 in split:
        filename = url.rpartition('/')[2]
        file_path = os.path.join(self.raw_folder, filename)
        if not os.path.isfile(file_path):
            download_url(url, root=self.raw_folder, filename=filename, md5=md5)
        files.append(file_path)

    # process and save as torch files
    print('Processing...')
    data = read_sn3_pascalvincent_tensor(files[0])
    assert data.dtype == torch.uint8
    assert data.ndimension() == 3
    targets = read_sn3_pascalvincent_tensor(files[1]).long()
    assert targets.ndimension() == 2

    if self.what == 'test10k':
        data = data[0:10000, :, :].clone()
        targets = targets[0:10000, :].clone()
    if self.what == 'test50k':
        data = data[10000:, :, :].clone()
        targets = targets[10000:, :].clone()

    with open(os.path.join(self.processed_folder, self.data_file), 'wb') as f:
        torch.save((data, targets), f)
def process(html_fobj):
    for row in get_all_listings(html_fobj):
        event_page = download_url(row['url'])
        if not is_single_event(event_page):
            L.info("Ignoring repeated event '{}'".format(row['headline']))
            continue
        event_page.seek(0)
        if is_child_event(event_page):
            L.info("Ignoring children's film '{}'".format(row['headline']))
            continue
        # TODO: row['description'] = parse_description(event_page)
        yield row
def download_file(self, mongodb_version, mongodb_edition, destination=None):
    destination = destination or os.getcwd()
    url = self.get_download_url(mongodb_version, mongodb_edition)

    response = urllib.urlopen(url)

    if response.code == 404:
        raise FileNotInRepoError("File not found in repo")

    if response.getcode() != 200:
        msg = ("Unable to download from url '%s' (response code '%s'). "
               "It could be that version '%s' you specified does not exist."
               " Please double check the version you provide" %
               (url, response.getcode(), mongodb_version))
        raise MongoctlException(msg)

    return download_url(url, destination,
                        show_errors=not is_interactive_mode())
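# Hedged usage sketch (not from the original source): the repo object and the
# version string below are hypothetical; FileNotInRepoError comes from the
# surrounding module.
try:
    path = repo.download_file('4.0.5', 'community', destination='/tmp')
except FileNotInRepoError:
    print('Requested build is not available in this repo')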
def setupTests(self):
    zippedTests = download_url(getTestUrl(),
                               dest=str(os.path.join(self.shellCacheDir, "tests.zip")))
    unzip(self.testDir, zippedTests)
def get_metas(self):
    self.logger.info("Busqueda de metadatos")
    data = {}
    l = 0
    if not self.urls_torrent_page:
        return False
    for url in self.urls_torrent_page:
        self.logger.info("[%d/%d]Extrayendo cadenas de %s" % (l, len(self.urls_torrent_page), url))
        l += 1
        try:
            html = download_url(url)
            doc = BeautifulSoup(html)
        except TypeError:
            self.logger.warning("No se ha podido cargar %s" % url)
            continue
        if not self.is_torrent_page(html, url):
            del self.urls_torrent_page[url]
            self.logger.warning("No es torrent page")
            return None
        if doc is None or doc.body is None:
            del self.urls_torrent_page[url]
            self.logger.warning("El doc obtenido no es valido")
            return None
        strings = []
        for string in doc.body.stripped_strings:
            strings.append(string)
        data[url] = strings

    map_str = []
    for pos in xrange(0, len(strings)):
        self.logger.info("Analizando [%d/%d]" % (pos, len(strings)))
        equal = True
        first = True
        previous = None
        for url in data:
            if previous is None:
                previous = url
            else:
                try:
                    if len(data[url]) < pos or len(data[previous]) < pos or (data[url][pos] != data[previous][pos]):
                        if first:
                            first = False
                            continue
                        equal = False
                        break
                except IndexError:
                    if first:
                        first = False
                        continue
                    equal = False
                    break
        map_str.append(equal)

    rt = {}
    duplicated = []
    self.logger.info("Busqueda atributos")
    # search for attributes on each url
    for url in self.urls_torrent_page:
        pos = 0
        last_equal = True
        metadata = {"infohash": None, "size": None, "description": None,
                    "title": None, "category": None, "tags": None,
                    "quality": None, "genre": None,
                    "season": None, "episode": None, "language": None}
        pos = 0
        last_equal = True
        next_is_description = False
        doc = BeautifulSoup(download_url(url))
        try:
            h1 = doc("h1")[0].stripped_strings.next()
            if len(doc("h1")) > 0 and is_title(h1, url, full=True):
                metadata['title'] = h1
            else:
                h2 = doc("h2")[0].stripped_strings.next()
                if doc("h2") and is_title(h2, url, full=True):
                    metadata['title'] = h2
        except IndexError:
            pass

        meta = {}
        for equal in map_str:
            # what looks like metadata offered by the page itself
            if not equal and last_equal:
                if pos > 0:
                    prev = data[url][pos - 1]
                    token = data[url][pos]
                    if len(prev) < 20 and not prev.isnumeric() and len(prev) > 2 and prev.count(" ") < 2:
                        meta[prev.replace(":", "").lower()] = token
            pos += 1
            last_equal = equal
        for m in metadata:
            if m in meta:
                metadata[m] = meta[m]

        pos = 0
        last_equal = True
        # raw search
        already_title = False
        search_tags = True
        for equal in map_str:
            if pos < len(data[url]):
                token = data[url][pos]
                # do not start searching until the title has appeared
                if not already_title:
                    if not (len(token) > 5 and is_title(token.lower(), url, full=True)):
                        pos += 1
                        continue
                    already_title = True
                # the description is usually near the end
                ending = ["comment", "related", "similar"]
                if any([w in token.lower() for w in ending]) and pos > len(data[url]) * 0.4:
                    # nothing interesting after the comments or related files
                    break
                if search_tags:
                    if any([w in token.lower() for w in ending]):
                        search_tags = False
                if pos > (len(map_str) * 0.75):
                    # this is the end of the page and nothing interesting is left
                    break
                if is_script(token):
                    pos += 1
                    last_equal = equal
                    continue
                if not equal:
                    if search_tags:
                        tag = is_tag(token)
                        if tag:
                            if metadata['tags'] is None or isinstance(metadata['tags'], basestring):
                                metadata['tags'] = {}
                            if not tag in metadata['tags']:
                                try:
                                    metadata['tags'][tag] = []
                                except:
                                    print metadata
                                    print metadata['tags']
                                    print tag
                                    raise
                            if not token in metadata['tags'][tag]:
                                metadata['tags'][tag].append(token)
                    if metadata['description'] is None or len(metadata['description']) < len(token):
                        # prefer short descriptions unless longer ones exist
                        if len(token) > 100 or next_is_description:
                            if not equal and not is_script(token) and is_description(token):
                                metadata['description'] = token
                                next_is_description = False
                    if "description" in token:
                        next_is_description = True
                    if metadata['title'] is None:
                        if len(token) > 5 and is_title(token, url):
                            metadata['title'] = token
                    if metadata['season'] is None or metadata['episode'] is None:
                        if len(token) > 3:
                            se = is_season_episode(token)
                            if se:
                                metadata['season'] = {"token": token, "value": se['s']}
                                metadata['episode'] = {"token": token, "value": se['e']}
                    if metadata['size'] is None:
                        if len(token) > 2:
                            z = is_size(token)
                            if z:
                                metadata['size'] = {"token": token, "value": z}
                    if metadata['infohash'] is None:
                        if "hash" in token or len(token) == 40:
                            metadata['infohash'] = extract_infohash(token)
                    if metadata['language'] is None:
                        if is_language(token):
                            metadata['language'] = token
            pos += 1
            last_equal = equal

        xpath = XPath(url)
        if not xpath:
            del self.urls_torrent_page[xpath]
            self.logger.warning("No se pueden xpathear %s" % url)
            return None
        _metadata = {}
        for m, v in metadata.items():
            if v:
                if type(v) == type({}):
                    if not "value" in v:
                        value = ",".join([",".join(keywords) for keywords in v.values()])
                        token = value.split(",")
                    else:
                        value = v['value']
                        token = v['token']
                else:
                    value = v
                    token = v

                def extract_token(token, value):
                    try:
                        xp = xpath.get_xpath(u(token))
                    except UnicodeDecodeError:
                        return False
                    if xp is None:
                        return False
                    extract = xpath.extract(xp)
                    if not extract:
                        return False
                    if len(extract) > 0 or not token.strip() == extract.strip():
                        ok = True
                        if xpath.last_expansive:
                            ok = False
                            try:
                                if token.strip() in element_2_str(extract):
                                    # correct, probably the description
                                    ok = True
                                else:
                                    return False
                            except:
                                return False
                        if not ok:
                            if not token.strip() in extract.strip():
                                print("No coincide %s[%s] para %s" % (xp, extract, token))
                                self.logger.error("No coincide %s[%s] para %s" % (xp, extract, token))
                                raise Exception("Incoherencia xpath")
                            return False
                    id_m = m
                    if m == "tags":
                        tg = is_tag(value)
                        if tg:
                            id_m = tg.split("_")[0]
                    if id_m in _metadata and _metadata[id_m]['xpath'] != xp and id_m != "category":
                        duplicated.append(id_m)
                    # for the "language" metadata do not keep link xpaths,
                    # since they are usually the page's own language selectors
                    if id_m == "language" and "/a/" in xp:
                        return False
                    # h1 only for the title
                    if id_m != "title" and "/h1" in xp:
                        return False
                    if "'tab-main'" in xp and id_m == "subcategory":
                        print
                        print
                        print
                        print url
                        print id_m
                        print _metadata
                        print value
                        print xp
                        exit()
                    # keep nothing that hangs from comment, script or style nodes
                    invalid = ["comment", "script", "style", "select"]
                    if not any([w in xp for w in invalid]):
                        _metadata[id_m] = {"value": value, "xpath": xp}

                if type(token) == type([]):
                    for t in token:
                        extract_token(t, t)
                else:
                    extract_token(token, value)

        # avoid confusing description with title
        if "description" in _metadata and "title" in _metadata:
            if _metadata['title']['xpath'] == _metadata['description']['xpath']:
                del _metadata['description']
        for d in duplicated:
            if d in _metadata:
                del _metadata[d]
        duplicated = []
        rt[url] = {"metadata": _metadata, "meta": meta}

    metas_ocurrences = {}
    for url, d in rt.items():
        for k, v in d['metadata'].items():
            if not k in metas_ocurrences:
                metas_ocurrences[k] = {}
            value = v['xpath']
            if not value in metas_ocurrences[k]:
                metas_ocurrences[k][value] = 0
            metas_ocurrences[k][value] += 1

    metas = {}
    for m, xpaths in metas_ocurrences.items():
        for xpath, count in xpaths.items():
            if not m in metas:
                metas[m] = {}
                metas[m]["all"] = {}
            if not xpath in metas[m]["all"]:
                metas[m]["all"][xpath] = count

    # check the "weak" metas and drop them unless one xpath clearly dominates
    for weak in self.weak_metas:
        if weak in metas:
            ok = False
            sum_counts = sum(metas[weak]['all'].values())
            for count in metas[weak]['all'].values():
                if count > (sum_counts / 3):
                    ok = True
            if not ok:
                del metas[weak]

    self.metas = metas
    return True
def upload():
    # Get priority
    priority = int(request.form.get('priority', PRIORITY.medium))
    if priority not in PRIORITY.get_values():
        priority = PRIORITY.medium

    # Get output formats
    output_formats = request.form.get('output-formats', '')
    output_formats = list(set(
        filter(
            lambda format: format in app.config['ALLOWED_EXTENSIONS'],
            output_formats.split(';')
        )
    ))
    if not output_formats:
        return jsonify({'Error': 'Must provide valid output formats'}), 400

    # Get file (either directly or via URL)
    file = request.files.get('file')
    allowed_extensions = app.config['ALLOWED_EXTENSIONS']
    if file:
        if allowed_filename(file.filename, allowed_extensions):
            filename = secure_filename(file.filename).strip()[-FILE_NAME_LIMIT:]
            local_path = os.path.join(app.config['UPLOAD_FOLDER'],
                                      timestamp_filename(filename))
            file.save(local_path)
        else:
            return jsonify({'Error': 'File format not allowed'}), 400
    else:
        fileURL = request.form.get('fileURL')
        if fileURL:
            filename = get_filename_from_url(fileURL)
            try:
                local_path = download_url(
                    fileURL, app.config['UPLOAD_FOLDER'], timestamp=True)
            except FileAccessDenied as fad:
                return jsonify({
                    'status': 'error',
                    'code': fad.status_code,
                    'message': fad.message
                }), 500
        else:
            return jsonify({'status': 'error',
                            'message': 'Unable to decode uploaded file'}), 500

    # Upload to remote and remove file from local
    remote_destination = os.path.join(app.config['REMOTE_INPUT_FOLDER'], get_uuid(), filename)
    upload_to_remote(remote_destination, local_path)
    os.remove(local_path)

    # Register the file for conversions and return docIds
    docIds = Conversion.register_file(filename, remote_destination, g.user,
                                      output_formats, priority)

    # Call request fetcher
    request_fetcher.delay()

    return jsonify({'status': STATUS.introduced, 'doc_ids': docIds})
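# Hedged client-side sketch (not part of the original source) showing the form
# fields the upload endpoint above expects; the host, endpoint path, and the
# file name are assumptions for illustration only.
import requests

resp = requests.post(
    'http://localhost:5000/upload',
    data={'priority': 2, 'output-formats': 'pdf;txt'},
    files={'file': open('report.docx', 'rb')},
)
print(resp.json())  # e.g. {'status': ..., 'doc_ids': [...]}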
def get_image(self, mode=0):
    # -----------------
    # Values for mode; used to decide whether an image is a candidate based on its size:
    # 0 -> width and height > 100
    # 1 -> width and height >= 100
    # 2 -> width or height > 100
    # 3 -> width or height >= 100
    if mode > 3:
        return None
    data = {}
    loop = 0
    blacklist = ["avatar", "promo", "category", "categories", "user", "ads"]
    domain = self.get_id()
    images = {}
    for url in self.urls_torrent_page:
        self.logger.info("Analizando %s en busca de imagenes [%d/%d]" % (url, loop, len(self.urls_torrent_page)))
        loop += 1
        doc = BeautifulSoup(download_url(url))
        imgs = []
        xpath = XPath(url)
        title = None
        if xpath:
            if 'title' in self.metas:
                for xp in self.metas['title']['all']:
                    extract = xpath.extract(xp)
                    rt = is_title(extract, url)
                    if rt:
                        title = extract
        for img in doc("img"):
            if img.get("alt") and title and title in img.get("alt"):
                images[url] = {"img": img.get("src"),
                               "xpath": xpath.get_xpath_img(clean_url_img(img.get("src"), domain))}
                self.logger.info("Imagen localizada %s" % images[url])
                # the one
                imgs = [img.get("src")]
                break
            imgs.append(img.get("src"))
        data[url] = imgs

    commons = []
    for url in data:
        for img in data[url]:
            if not img in images:
                images[img] = 0
            images[img] += 1
    for img, count in images.items():
        if count > (len(self.urls_torrent_page) / 2):
            commons.append(img)

    images = {}
    loop = 0
    for url in self.urls_torrent_page:
        self.logger.info("Buscando la principal en %s [%d/%d]" % (url, loop, len(self.urls_torrent_page)))
        loop += 1
        img_candidates = {}
        for img in data[url]:
            if not img in commons and not ".." in img and not any([w in img.lower() for w in blacklist]):
                try:
                    self.logger.info("Salvando temporal de %s" % img)
                    im = Image.open(save_tmp(img, domain))
                    width, height = im.size
                    print img, width, height
                    # see the comment at the top of the function for how mode behaves
                    if mode == 0:
                        if width > 100 and height > 100:
                            img_candidates[img] = "%sx%s" % (width, height)
                    if mode == 1:
                        if width >= 100 and height >= 100:
                            img_candidates[img] = "%sx%s" % (width, height)
                    if mode == 2:
                        if width > 100 or height > 100:
                            img_candidates[img] = "%sx%s" % (width, height)
                    if mode == 3:
                        if width >= 100 or height >= 100:
                            img_candidates[img] = "%sx%s" % (width, height)
                except IOError as e:
                    self.logger.error("IOError %s: %s " % (e, img))
                    pass

        # if sizes repeat, those candidates cancel each other out
        def size_equal(s1, s2):
            ss1 = s1.split("x")
            ss2 = s2.split("x")
            return abs(int(ss1[0]) - int(ss2[0])) < 3 and abs(int(ss1[1]) - int(ss2[1])) < 3

        no_candidates = []
        for img in img_candidates:
            size = img_candidates[img]
            for img2 in img_candidates:
                if img != img2 and size_equal(size, img_candidates[img2]):
                    no_candidates.append(img)
        for no in no_candidates:
            if no in img_candidates:
                del img_candidates[no]

        no_candidates = []
        if len(img_candidates) > 1:
            # try to exclude images not hosted on the domain itself
            for img in img_candidates:
                if not domain in img:
                    no_candidates.append(img)
            for no in no_candidates:
                if no in img_candidates:
                    del img_candidates[no]

        images[url] = None
        if len(img_candidates) == 1:
            # image found
            xpath = XPath(url)
            img = clean_url_img(img_candidates.keys()[0], domain)
            images[url] = {"img": img, "xpath": xpath.get_xpath_img(img)}
            self.logger.info("Imagen localizada %s" % images[url])

    xpaths = {}
    # count every xpath that appeared
    for img, v in images.items():
        if v is None:
            continue
        xpath = v['xpath']
        if xpath:
            # ignore logos
            if 'logo' in xpath:
                continue
            if not xpath in xpaths:
                xpaths[xpath] = 0
            xpaths[xpath] += 1

    # keep the most common one if it appears a minimum number of times
    current_xpath = None
    max_count = 0
    for xpath, count in xpaths.items():
        if count > max_count:
            max_count = count
            current_xpath = xpath
    if max_count >= 1:
        # a single xpath is enough
        self.metas['image'] = {"candidate": current_xpath, "all": xpaths}
        return current_xpath
    return self.get_image(mode + 1)
def get_category(self):
    if ('category' in self.metas and 'all' in self.metas['category'] and
            sum(v for v in self.metas['category']['all'].values()) > (len(self.urls_torrent_page) / 3)):
        print "****************************"
        print self.metas['category']
        print "ya tiene"
        return True
    if not self.urls_torrent_page:
        return False
    blacklist = ["user", "download", ".torrent", "magnet", "api", "about", "privacy",
                 "register", "contact", "recover", "latest", "popular", "request", "rss", "faq"]
    data = {}
    for url in self.urls_torrent_page:
        doc = BeautifulSoup(download_url(url))
        links = []
        # look for links that look like categories
        for link in doc("a"):
            href = link.get("href")
            if href is None:
                continue
            if href.startswith("/") and not href.startswith("//"):
                href = "/".join(url.split("/")[:3]) + href
            if (not any([w in href.lower() for w in blacklist])
                    and href.startswith("/".join(url.split("/")[:3]))
                    and href != "/".join(url.split("/")[:3])):
                if not href in links and is_category(link.string):
                    xp = get_xpath_from_soup_object(link)
                    links.append((href, link.string, xp))
        data[url] = links

    map_links = {}
    for url, links in data.items():
        pos = 0
        for link in links:
            if not pos in map_links.keys():
                map_links[pos] = []
            _id = "%s|||%s|||%s" % link
            if not _id in map_links[pos]:
                map_links[pos].append(_id)
            pos += 1

    xp = None
    for pos, xpaths in map_links.items():
        if len(xpaths) > 1:
            xp = xpaths[0].split("|||")[-1]
            break

    if not xp:
        xpath_cat = {}
        for url in self.urls_torrent_page:
            # try to find it via a breadcrumb
            _doc = BeautifulSoup(download_url(url))
            _xpath = XPath(url)
            next_cat = False
            for string in _doc.stripped_strings:
                if next_cat:
                    if is_category(string):
                        _xp = _xpath.get_xpath(string)
                        if not _xp in xpath_cat:
                            xpath_cat[_xp] = 0
                        xpath_cat[_xp] += 1
                    next_cat = False
                if u">" in string and not u"<" in string or u"»" in string and not is_script(string):
                    next_cat = True
        for _xp in xpath_cat:
            if xpath_cat[_xp] > (len(self.urls_torrent_page) * 0.75):
                xp = _xp

    if not xp:
        # try the URL itself
        pos = 0
        for url_part in url.split("/"):
            if is_category(url_part):
                xp = "@url[%d]" % pos
            pos += 1

    _all = {}
    for url in self.urls_torrent_page:
        if not xp is None:
            if "@url" in xp:
                if not xp in _all:
                    _all[xp] = 0
                _all[xp] += 1
            try:
                extract = XPath(url).extract(xp)
            except:
                continue
            if len(extract) > 0 and is_category(extract):
                if not xp in _all:
                    _all[xp] = 0
                _all[xp] += 1

    if not "category" in self.metas:
        self.metas['category'] = {}
    if not "all" in self.metas['category']:
        self.metas['category']['all'] = {}
    for a in _all:
        # if this method found a clear majority, discard the rest
        if _all[a] > (len(self.urls_torrent_page) * 0.6):
            self.metas['category']['all'] = {}
        if not a in self.metas['category']['all']:
            self.metas['category']['all'][a] = 0
        self.metas['category']['all'][a] += _all[a]

    # if any entry uses @url, only that one counts
    for a in self.metas['category']['all']:
        if "@url" in a:
            count = self.metas['category']['all'][a]
            self.metas['category']['all'] = {}
            self.metas['category']['all'][a] = count
    return _all
def main():
    for row in process(download_url(URL)):
        yield row
def bisectRecurse(self, testcondition=None, args_for_condition=[]):
    # Recursively build, run, and prompt
    verdict = ""
    current_revision = captureStdout(self.hgPrefix + ["id", "-i"])

    if self.remote:
        print "on current revision " + current_revision
        print "This would ask for a remote changeset, but it's not implemented yet."
        # TODO:
        # Remote bisection!
        # Step 1. Check if revision is in the archive
        # Step 2. If revision is not in the archive, set remote=False and continue
        #         (it will build and bisect that revision)
        # if not check_archived:
        #     set remote false and continue
        # else:
        # Step 3. If the revision is in the archive, download it and its corresponding tests
        # STEP 3
        # 1. Extract tests into some directory
        # 2. Extract Nightly.app into "tests"
        #    MozInstaller(src=, dest="", dest_app="Nightly.app")
        # 3. run the following:
        #    test_command = ['python', 'mochitest/runtests.py', '--appname=./Nightly.app/Contents/MacOS/firefox-bin', '--utility-path=bin', '--extra-profile-file=bin/plugins', '--certificate-path=certs', '--autorun', '--close-when-done', '--console-level=INFO', '--test-path=test_name']
        #    output = captureStdout(test_command, ignoreStderr=True)
        #    set verdict based on output
        # python mochitest/runtests.py --appname=./Nightly.app/Contents/MacOS/firefox-bin --utility-path=bin --extra-profile-file=bin/plugins --certificate-path=certs --autorun --close-when-done --console-level=INFO --test-path=test_name
        # example test name: Harness_sanity/test_sanityException.html
        # Step 4. Run and run test to get verdict
        # Step 5. Set verdict

    elif self.tryPusher:
        try:
            caller = BuildCaller(host=self.tryhost, port=int(self.tryport), data=current_revision)
            print "Getting revision " + current_revision + "..."
        except:
            print "Failed to connect to trypusher. Make sure your settings are correct and that the trypusher server was started."
            exit()
        response = caller.getChangeset()
        print "Waiting on Mozilla Pulse for revision " + response + "..."
        url = caller.getURLResponse(response)
        print "the base is " + url_base(url)
        # Download it here
        # 1. Download from url, extract to same place as tests
        # 2. Run test or start browser.
        binary_path = os.path.join(self.binaryDir, url_base(url))
        downloaded_binary = download_url(url, dest=str(binary_path))
        MozInstaller(src=str(binary_path), dest=str(self.testDir), dest_app="Nightly.app")
        # now nightly is installed in
        if sys.platform == "darwin":
            binary_path = os.path.join(self.testDir, "Nightly.app")
            runner = FirefoxRunner(binary=os.path.join(binary_path, "Contents", "MacOS") + "/firefox-bin")
        elif sys.platform == "linux2":
            binary_path = os.path.join(self.testDir, "firefox")
            runner = FirefoxRunner(binary=binary_path)
        elif sys.platform == "win32" or sys.platform == "cygwin":
            binary_path = os.path.join(self.testDir, "firefox.exe")
            runner = FirefoxRunner(binary=binary_path)
        else:
            print "Your platform is not currently supported."
            quit()
        dest = runner.start()
        if not dest:
            print "Failed to start the downloaded binary"
            verdict = "skip"
        runner.wait()
        if verdict == "skip":
            pass
        elif testcondition != None:
            # Support condition scripts where arg0 is the directory with the binary and tests
            args_to_pass = [self.testDir] + args_for_condition
            if hasattr(testcondition, "init"):
                testcondition.init(args_to_pass)
            # TODO: refactor to use directories with revision numbers
            # 8.2.11 - revision number can now be found in current_revision variable
            tmpdir = tempfile.mkdtemp()
            verdict = testcondition.interesting(args_to_pass, tmpdir)
            # Allow user to return true/false or bad/good
            if verdict != "bad" and verdict != "good":
                verdict = "bad" if verdict else "good"

    else:
        try:
            self.build()
        except Exception:
            print "This build failed!"
            verdict = "skip"
        if verdict == "skip":
            pass
        elif testcondition == None:
            # Not using a test, interactive bisect begin!
            self.run()
        else:
            # Using Jesse's idea: import any testing script and run it as the truth condition
            args_to_pass = [self.objdir] + args_for_condition
            if hasattr(testcondition, "init"):
                testcondition.init(args_to_pass)
            # TODO: refactor to use directories with revision numbers
            # 8.2.11 - revision number can now be found in current_revision variable
            tmpdir = tempfile.mkdtemp()
            verdict = testcondition.interesting(args_to_pass, tmpdir)
            # Allow user to return true/false or bad/good
            if verdict != "bad" and verdict != "good":
                verdict = "bad" if verdict else "good"

    while verdict not in ["good", "bad", "skip"]:
        verdict = raw_input("Was this commit good or bad? (type 'good', 'bad', or 'skip'): ")
        if verdict == 'g':
            verdict = "good"
        if verdict == 'b':
            verdict = "bad"
        if verdict == 's':
            verdict = "skip"

    # do hg bisect --good, --bad, or --skip
    verdictCommand = self.hgPrefix + ["bisect", "--" + verdict]
    print " ".join(verdictCommand)
    retval = captureStdout(verdictCommand)

    string_to_parse = str(retval)
    print string_to_parse
    self.check_done(string_to_parse)

    if retval.startswith("Testing changeset"):
        print "\n"
        self.bisectRecurse(testcondition=testcondition, args_for_condition=args_for_condition)