Example #1
def ofa_specialized(net_id, pretrained=True):
    url_base = 'https://hanlab.mit.edu/files/OnceForAll/ofa_specialized/'
    net_config = json.load(
        open(
            download_url(url_base + net_id + '/net.config',
                         model_dir='.torch/ofa_specialized/%s/' % net_id)))
    if net_config['name'] == ProxylessNASNets.__name__:
        net = ProxylessNASNets.build_from_config(net_config)
    elif net_config['name'] == MobileNetV3.__name__:
        net = MobileNetV3.build_from_config(net_config)
    else:
        raise ValueError('Not supported network type: %s' % net_config['name'])

    image_size = json.load(
        open(
            download_url(url_base + net_id + '/run.config',
                         model_dir='.torch/ofa_specialized/%s/' %
                         net_id)))['image_size']

    if pretrained:
        init = torch.load(download_url(url_base + net_id + '/init',
                                       model_dir='.torch/ofa_specialized/%s/' %
                                       net_id),
                          map_location='cpu')['state_dict']
        net.load_state_dict(init)
    return net, image_size
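All of these snippets lean on a project-specific download_url helper. In Example #1 the helper is called with a model_dir keyword and its return value is passed straight to open(), so it evidently downloads into a cache directory and returns the local path. A minimal sketch of a compatible helper (the signature and caching behaviour are assumptions inferred from the call sites, not the OFA project's actual implementation):

import os
import urllib.request

def download_url(url, model_dir='~/.torch/downloads', overwrite=False):
    # Hypothetical helper: fetch `url` into `model_dir` (if not cached) and return the local path.
    model_dir = os.path.expanduser(model_dir)
    os.makedirs(model_dir, exist_ok=True)
    target = os.path.join(model_dir, os.path.basename(url))
    if overwrite or not os.path.exists(target):
        urllib.request.urlretrieve(url, target)
    return target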
Example #2
    def __init__(self, pkl_path=None, from_scratch=False, dim=(512, 512)):
        self.pkl_path = pkl_path
        self.dim = dim

        if self.pkl_path is None:
            ffhq_pkl = 'stylegan2-ffhq-config-f.pkl'
            ffhq_url = f'http://d36zk2xti64re0.cloudfront.net/stylegan2/networks/{ffhq_pkl}'

            empty_pkl = create_model(height=dim[0], width=dim[1])

            if from_scratch:
                self.pkl_path = empty_pkl
            else:
                if not os.path.exists(ffhq_pkl):
                    download_url(ffhq_url, ffhq_pkl)
                self.pkl_path = 'surgery.pkl'
                copy_weights(ffhq_pkl, empty_pkl, self.pkl_path)

        dnnlib.tflib.init_tf()
        print('Loading networks from "%s"...' % self.pkl_path)
        with dnnlib.util.open_url(self.pkl_path) as fp:
            self._G, self._D, self.Gs = pickle.load(fp)
        self.noise_vars = [
            var for name, var in self.Gs.components.synthesis.vars.items()
            if name.startswith('noise')
        ]
Example #3
def main():
    excluded_venues = get_excluded_venue_ids(download_url(VENUES_URL))

    for row in process(
            download_url(EVENTS_URL),
            excluded_venue_ids=excluded_venues):
        yield row
Example #4
def get_data():
    if not os.path.isfile(os.path.join(data_dir, "PROTEINS.zip")):
        #Needs Download
        url = 'https://ls11-www.cs.tu-dortmund.de/people/morris/graphkerneldatasets/PROTEINS.zip'
        save_path = os.path.join(data_dir, 'PROTEINS.zip')
        utils.download_url(url, save_path)
    utils.unzip_file(os.path.join(data_dir, "PROTEINS.zip"))
Example #5
    def download(self):
        if self._check_datafile_exists():
            print('# Found cached data {}, {}'.format(self.images_file,
                                                      self.idx_file))
            return

        if not self._check_downloaded():
            # download files
            url = self.urls[self.name][0]
            filename = self.urls[self.name][1]
            md5 = self.urls[self.name][2]
            fpath = os.path.join(self.root, filename)

            download_url(url, self.root, filename, md5)

            print('# Extracting data {}\n'.format(self.data_down))

            import zipfile
            with zipfile.ZipFile(fpath, 'r') as z:
                z.extractall(self.data_dir)

            os.unlink(fpath)

        # process and save as torch files
        print('# Caching data')

        images = read_image_file(self.data_dir, self.image_ext,
                                 self.lens[self.name])
        points = read_info_file(self.data_dir, self.info_file)
        #refImg = read_interest_file(self.data_dir, self.interest_file)

        print('# Formatting data')
        #print(images.shape, len(points))
        idx = []
        i = 0
        last = len(images)
        min_len = 100

        while i < last:
            point = points[i]
            #print(i, last, point, points[i])
            one_point = []
            while i < last and points[i] == point:
                one_point.append(i)
                i += 1
            #print(len(one_point))
            if min_len > len(one_point):
                min_len = len(one_point)
            idx.append(one_point)

        print("minimal number of patches:", min_len)
        print("Saving to file")

        with open(self.images_file, 'wb') as f:
            torch.save(images, f)
        #print("Idx length:", len(idx))
        with open(self.idx_file, 'wb') as f:
            pkl.dump(idx, f)
        print("Saved")
Example #6
def download_unity_launcher(type):
    if not os.path.exists(".tmp"):
        os.mkdir(".tmp")
    download_url = artifactory_download_url
    if 'USE_UBERBUCKET' in os.environ:
        download_url = bokken_artifactory_cache_url
    utils.download_url("%s/tools/unity-launcher/UnityLauncher.%s.zip" % (download_url, type),
                       ".tmp/UnityLauncher.%s.zip" % type)
Example #7
    def __init__(self, model_path):
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'

        if not os.path.exists(model_path):
            print('Downloading semantic segmentation model')
            download_url(MODEL_URL, model_path)

        self.model = torch.load(model_path).to(self.device)
Example #8
    def download(self):
        """Download the MNIST data if it doesn't exist in processed_folder already."""
        from six.moves import urllib
        import gzip

        if self._check_exists():
            return

        # download files
        try:
            os.makedirs(os.path.join(self.root, self.raw_folder))
            os.makedirs(os.path.join(self.root, self.processed_folder))
        except OSError as e:
            if e.errno == errno.EEXIST:
                pass
            else:
                raise

        for url in self.urls:
            filename = url.rpartition('/')[2]
            file_path = os.path.join(self.root, self.raw_folder, filename)
            download_url(url, root=os.path.join(self.root, self.raw_folder),
                         filename=filename, md5=None)
            with open(file_path.replace('.gz', ''), 'wb') as out_f, \
                    gzip.GzipFile(file_path) as zip_f:
                out_f.write(zip_f.read())
            os.unlink(file_path)

        # process and save as torch files
        print('Processing...')

        training_set = (
            read_image_file(os.path.join(self.root, self.raw_folder, 'train-images-idx3-ubyte')),
            read_label_file(os.path.join(self.root, self.raw_folder, 'train-labels-idx1-ubyte'))
        )
        test_set = (
            read_image_file(os.path.join(self.root, self.raw_folder, 't10k-images-idx3-ubyte')),
            read_label_file(os.path.join(self.root, self.raw_folder, 't10k-labels-idx1-ubyte'))
        )
        
        indixes_train = np.argwhere(np.apply_along_axis(lambda x : x[0] in self.class_nums, 1, np.array(training_set[1]).reshape(-1, 1)) == 1).reshape(-1)
        indixes_test = np.argwhere(np.apply_along_axis(lambda x : x[0] in self.class_nums, 1, np.array(test_set[1]).reshape(-1, 1)) == 1).reshape(-1)

        if len(self.class_nums) == 2:
            nums = list(self.class_nums)
            training_set[1][indixes_train] = torch.LongTensor(np.where(training_set[1][indixes_train] == nums[0], -1, 1))
            test_set[1][indixes_test] = torch.LongTensor(np.where(test_set[1][indixes_test] == nums[0], -1, 1))
        
        training_set = (training_set[0][indixes_train], training_set[1][indixes_train])
        test_set = (test_set[0][indixes_test], test_set[1][indixes_test])
        
        with open(os.path.join(self.root, self.processed_folder, self.training_file), 'wb') as f:
            torch.save(training_set, f)
        with open(os.path.join(self.root, self.processed_folder, self.test_file), 'wb') as f:
            torch.save(test_set, f)

        print('Done!')
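Examples #8, #13, #35, and #38 use the torchvision-style signature download_url(url, root=..., filename=..., md5=...). A minimal sketch of a compatible helper with optional MD5 verification (a simplified assumption; torchvision's real implementation adds progress reporting, redirect handling, and more robust checks):

import hashlib
import os
import urllib.request

def download_url(url, root, filename=None, md5=None):
    # Simplified stand-in for a torchvision-style download helper.
    os.makedirs(root, exist_ok=True)
    filename = filename or os.path.basename(url)
    fpath = os.path.join(root, filename)
    if not os.path.isfile(fpath):
        urllib.request.urlretrieve(url, fpath)
    if md5 is not None:
        with open(fpath, 'rb') as f:
            if hashlib.md5(f.read()).hexdigest() != md5:
                raise RuntimeError('MD5 check failed for ' + fpath)
    return fpath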
Example #9
def load_broad_repurposing_hub(path='./data'):
    url = 'https://dataverse.harvard.edu/api/access/datafile/4159648'
    if not os.path.exists(path):
        os.makedirs(path)
    download_path = os.path.join(path, 'broad.tab')
    download_url(url, download_path)
    df = pd.read_csv(download_path, sep='\t')
    df = df.fillna('UNK')
    return df.smiles.values, df.title.values, df.cid.values.astype(str)
Example #10
 def download(self, url=None, dest=None):
     if url:
         if not dest:
             dest = os.path.basename(url)
         print "\nDownloading build...\n"
         download_url(url, dest) #see utils module
         self.dest = dest
         return True
     else:
         return False
Example #11
def load_IC50_1000_Samples(path='./data', n=100):
    print('Downloading...')
    url = 'https://dataverse.harvard.edu/api/access/datafile/4159681'
    if not os.path.exists(path):
        os.makedirs(path)
    download_path = os.path.join(path, 'IC50_samples.csv')
    download_url(url, download_path)
    df = pd.read_csv(download_path).sample(
        n=n, replace=False).reset_index(drop=True)
    return df['Target Sequence'].values, df['SMILES'].values
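Examples #9, #11, and #15 call download_url(url, download_path) with an explicit destination file and then read it back with pandas. A brief usage sketch for the loader above (it assumes the Dataverse file is reachable and uses the return values shown in the example itself):

targets, smiles = load_IC50_1000_Samples(path='./data', n=50)
print(len(targets), 'target/SMILES pairs sampled')
print('first target sequence:', targets[0][:30], '...')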
Example #12
def get_covid_data(config, country_iso3, input_dir):
    # download covid data from HDX
    logger.info(f'Getting COVID data for {country_iso3}')
    download_dir = os.path.join(input_dir, COVID_DIR)
    Path(download_dir).mkdir(parents=True, exist_ok=True)
    covid_filename = os.path.join(download_dir,config['filename'])
    try:
        utils.download_url(config['url'], covid_filename)
    except Exception:
        logger.info(f'Cannot get COVID file for {country_iso3}')
Example #13
    def download(self):
        """Download the MNIST data if it doesn't exist in processed_folder already."""
        from six.moves import urllib
        import gzip

        if self._check_exists():
            return

        # download files
        try:
            os.makedirs(os.path.join(self.root, self.raw_folder))
            os.makedirs(os.path.join(self.root, self.processed_folder))
        except OSError as e:
            if e.errno == errno.EEXIST:
                pass
            else:
                raise

        for url in self.urls:
            filename = url.rpartition('/')[2]
            file_path = os.path.join(self.root, self.raw_folder, filename)
            download_url(url,
                         root=os.path.join(self.root, self.raw_folder),
                         filename=filename,
                         md5=None)
            with open(file_path.replace('.gz', ''), 'wb') as out_f, \
                    gzip.GzipFile(file_path) as zip_f:
                out_f.write(zip_f.read())
            os.unlink(file_path)

        # process and save as torch files
        print('Processing...')

        training_set = (read_image_file(
            os.path.join(self.root, self.raw_folder,
                         'train-images-idx3-ubyte')),
                        read_label_file(
                            os.path.join(self.root, self.raw_folder,
                                         'train-labels-idx1-ubyte')))
        test_set = (read_image_file(
            os.path.join(self.root, self.raw_folder,
                         't10k-images-idx3-ubyte')),
                    read_label_file(
                        os.path.join(self.root, self.raw_folder,
                                     't10k-labels-idx1-ubyte')))
        with open(
                os.path.join(self.root, self.processed_folder,
                             self.training_file), 'wb') as f:
            torch.save(training_set, f)
        with open(
                os.path.join(self.root, self.processed_folder, self.test_file),
                'wb') as f:
            torch.save(test_set, f)

        print('Done!')
Example #14
 def download(self, date=datetime.date.today(), dest=None):
     url = self.getBuildUrl(date)
     if url:
         if not dest:
             dest = os.path.basename(url)
         print "\nDownloading nightly from " + str(date) + "\n"
         download_url(url, dest)
         self.dest = dest
         return True
     else:
         return False
Example #15
def load_antiviral_drugs(path='./data', no_cid=False):
    url = 'https://dataverse.harvard.edu/api/access/datafile/4159652'
    if not os.path.exists(path):
        os.makedirs(path)
    download_path = os.path.join(path, 'antiviral_drugs.tab')
    download_url(url, download_path)
    df = pd.read_csv(download_path, sep='\t')
    if no_cid:
        return df.SMILES.values, df[' Name'].values
    else:
        return df.SMILES.values, df[' Name'].values, df['Pubchem CID'].values
Example #16
 def download(self, date=datetime.date.today(), dest=None):
     url = self.getBuildUrl(date)
     if url:
         if not dest:
             dest = os.path.basename(url)
         print "\nDownloading nightly...\n"  #TODO: doesn't belong here
         download_url(url, dest)
         self.dest = dest
         return True
     else:
         return False
Example #17
 def download(self, date=datetime.date.today(), dest=None):
     url = self.getBuildUrl(date)
     if url:
         if not dest:
             dest = os.path.basename(url)
         print "\nDownloading nightly...\n"  #TODO: doesn't belong here
         download_url(url, dest)
         self.dest = dest
         return True
     else:
         return False
Example #18
 def download(self, date=datetime.date.today(), dest=None):
     url = self.getBuildUrl(date)
     if url:
         if not dest:
             dest = os.path.basename(url)
         print "Downloading nightly from %s" % date
         self.remove_lastdest()
         download_url(url, dest)
         self.dest = self.lastdest = dest
         return True
     else:
         return False
Example #19
    def download(self):
        import tarfile

        if self._check_integrity():
            print('Files already downloaded and verified')
            return

        download_url(self.url, self.root, self.filename, self.tgz_md5)

        # extract file
        with tarfile.open(os.path.join(self.root, self.filename), "r:gz") as tar:
            tar.extractall(path=self.root)
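Example #19 follows the common download-then-extract pattern for tar archives. A hedged standalone sketch of the same pattern (the wrapper name is illustrative; it reuses the download_url(url, root, filename, md5) signature sketched after Example #8, and extractall writes whatever paths the archive contains, so only use it on trusted archives):

import os
import tarfile

def download_and_extract_tar(url, root, filename, md5=None):
    # Illustrative wrapper: fetch the archive, then unpack it next to itself.
    download_url(url, root, filename, md5)
    with tarfile.open(os.path.join(root, filename), 'r:gz') as tar:
        tar.extractall(path=root)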
Example #20
    def test(self):
        url = "https://www.kaggle.com/account/login?ReturnUrl=/c/dstl-satellite-imagery-feature-detection/download/"
        filename = "sample_submission.csv.zip"
        expected_size = 15246  # 15246 kb
        filepath = os.path.join(os.getcwd(), filename)

        if os.path.exists(filepath):
            os.remove(filepath)

        utils.download_url(url + filename)
        self.assertTrue(os.path.exists(filepath))
        self.assertEqual(expected_size, os.path.getsize(filepath))
Example #21
 def download(self, date=datetime.date.today(), dest=None):
     url = self.getBuildUrl(date)
     if url:
         if not dest:
             dest = os.path.basename(url)
         print "Downloading nightly from %s" % date
         if self.lastdest:
             os.remove(self.lastdest)
         download_url(url, dest)
         self.dest = self.lastdest = dest
         return True
     else:
         return False
Example #22
    def download(self, date=datetime.date.today(), dest=None):
        url = self.getBuildUrl(date)
        if url:
            if not dest:
                dest = self.get_destination(url, date)
            if not self.persist:
                self.remove_lastdest()

            self.dest = self.lastdest = dest
            download_url(url, dest)
            return True
        else:
            return False
Example #23
    def download(self, date=datetime.date.today(), dest=None):
        url = self.getBuildUrl(date)
        if url:
            if not dest:
                dest = self.get_destination(url, date)
            if not self.persist:
                self.remove_lastdest()

            self.dest = self.lastdest = dest
            download_url(url, dest)
            return True
        else:
            return False
Example #24
    def download(self):
        import zipfile

        if self._check_integrity():
            print('Files already downloaded and verified')
            return

        filename = self._get_target_folder()
        zip_filename = filename + '.zip'
        url = self.download_url_prefix + '/' + zip_filename
        download_url(url, self.root, zip_filename, self.zips_md5[filename])
        print('Extracting downloaded file: ' + join(self.root, zip_filename))
        with zipfile.ZipFile(join(self.root, zip_filename), 'r') as zip_file:
            zip_file.extractall(self.root)
Example #25
 def run(self):
     for ii, filename in enumerate(self.output()):
         filename.makedirs()
         url = f'https://www.futhead.com/18/nations/?page={ii+1}'
         page = utils.download_url(url)
         with open(filename.path, 'wb') as outfile:
             outfile.write(page)
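Examples #25 through #29 and #36 call utils.download_url(url) with no destination argument and write the return value straight to disk, so in those projects the helper evidently returns the raw response body as bytes. A minimal sketch of such a variant (built on requests here, which is an assumption; the projects' own helper may use urllib and differ in details):

import requests

def download_url(url, timeout=30):
    # Hypothetical bytes-returning variant: fetch `url` and return the response body.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.content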
Example #26
 def run(self):
     url = 'https://fixturedownload.com/download/fifa-world-cup-2018-RussianStandardTime.csv'
     page = utils.download_url(url)
     self.output().makedirs()
     filename = self.output().path
     with open(filename, 'wb') as outfile:
         outfile.write(page)
Example #27
 def run(self):
     url = f'http://www.football-data.co.uk/mmz4281/{self.season}/{self.league}.csv'
     page = utils.download_url(url)
     self.output().makedirs()
     filename = self.output().path
     with open(filename, 'wb') as outfile:
         outfile.write(page)
Example #28
 def run(self):
     base_url = 'https://www.fifaindex.com/teams/{}/?type=1'
     for ii, outpath in enumerate(self.output()):
         url = base_url.format(ii + 1)
         page = utils.download_url(url)
         outpath.makedirs()
         with open(outpath.path, 'wb') as f:
             f.write(page)
Example #29
 def run(self):
     self.output().makedirs()
     url = 'https://www.oddschecker.com/football/world-cup#outrights'
     page = utils.download_url(url)
     self.output().makedirs()
     filename = self.output().path
     with open(filename, 'wb') as outfile:
         outfile.write(page)
Example #30
    def download(self):
        import tarfile

        if self._check_integrity():
            print('Files already downloaded and verified')
            return

        root = self.root
        download_url(self.url, root, self.filename, self.tgz_md5)

        # extract file
        cwd = os.getcwd()
        tar = tarfile.open(os.path.join(root, self.filename), "r:gz")
        os.chdir(root)
        tar.extractall()
        tar.close()
        os.chdir(cwd)
Example #31
 def get_input_file_path(self):
     if not self.input_file_path:
         remote_path = get_signed_url(self.resource_path, self.bucket)
         self.input_file_path = download_url(
             remote_path, app.config['UPLOAD_FOLDER'],
             target_filename=os.path.basename(self.resource_path),
             timestamp=True)
     return self.input_file_path
Example #32
 def locate_links(self):
     """download all torrents and get xpaths of torrent links"""
     links = {}
     loop = 0
     for url in self.urls_torrent_page:
         
         self.logger.info("Buscando enlaces en %s [%d/%d]"%(url, loop, len(self.urls_torrent_page)))
         loop += 1
         
         xpath = XPath(url)
         
         if xpath:
         
             data = {}
             # extract data that also comes inside the torrent itself
             for k in ['size', 'infohash', 'title']:
                 if k in self.metas:
                     xps = self.metas[k]['all']
                     sorted_xp = reversed(sorted(xps.iteritems(), key=operator.itemgetter(1)))
                     for xp_tuple in sorted_xp:
                         xp = xp_tuple[0]
                         extract = xpath.extract(xp)
                         #~ print "\t", k, extract, xp
                         rt = is_valid_meta(extract, k)
                         if rt:
                             data[k] = extract if k == "title" else rt
                             break
             
             
             
             
             # and make sure they match
             for url_torrent, xp in xpath.get_xpath_torrents().items():
                 
                 
                 url_torrent = urljoin(self.base_url, url_torrent)
                 
                 #~ print "@"*22
                 #~ print url_torrent, xp
                 
                 tr = download_url(url_torrent, force = True) #, verbose = True)
                 if tr:
                     #~ print tr
                     #~ print len(tr)
                     try:
                         info = torrent_info(tr)
                     except:
                         continue
                     if is_same_torrent(data, info): #, verbose = True):
                         if not xp in links:
                             links[xp] = 0
                         #~ print xp
                         links[xp] += 1
             
             data[url] = links
         self.metas['links'] = {}        
         self.metas['links']['all'] = {k:v for k,v in links.items() if v>(max(v for v in links.values()) / 3)}
Example #33
    def locate_torrent_pages(self):
        # Crawl the site looking for the torrent detail pages

        while len(self.urls_torrent_page
                  ) < self.num_candidates and self.urls_to_see:

            url = self.urls_to_see.pop()

            if not url.startswith("http"):
                url = "%s%s%s" % (self.base_url,
                                  "" if url.startswith("/") else "/", url)

            if url in self.urls_visited:
                continue

            self.logger.info(u"Recorriendo %s" % url)
            html = download_url(url)

            self.add_urls(html)
            if self.is_torrent_page(html, url):

                self.urls_torrent_page[url] = 1
                self.logger.info("torrent page localizada (%d)" %
                                 len(self.urls_torrent_page))

            self.urls_visited.append(url)
            if len(self.urls_visited) % 100 == 0:
                self.logger.info(
                    "%d pages visited, %d torrents page located" %
                    (len(self.urls_visited), len(self.urls_torrent_page)))

            if len(self.urls_visited) > 20000 or (
                    len(self.urls_visited) > 1000
                    and len(self.urls_torrent_page) <
                (len(self.urls_visited) / 1000)):
                return False
            # Take it slowly to avoid bans
            time.sleep(3)

        if len(self.urls_torrent_page) < self.num_candidates:
            self.logger.info(len(self.urls_torrent_page))
            self.logger.info(self.urls_to_see)
            return False
            #~ raise Exception("no se han encontrado candidatas")

        if not self.all_torrent_pages_ok():
            self.locate_torrent_pages()

        # Save to the db
        self.db_conn.torrents.domain.update(
            {"_id": self.get_id()},
            {"$set": {
                "tp": self.urls_torrent_page.keys()
            }},
            upsert=True)

        return self.urls_torrent_page
Example #34
 def get_input_file_path(self):
     if not self.input_file_path:
         remote_path = get_signed_url(self.resource_path, self.bucket)
         self.input_file_path = download_url(
             remote_path,
             app.config['UPLOAD_FOLDER'],
             target_filename=os.path.basename(self.resource_path),
             timestamp=True)
     return self.input_file_path
Example #35
    def download(self):
        """Download the MNIST data if it doesn't exist in processed_folder already."""

        if self._check_exists():
            return

        makedir_exist_ok(self.raw_folder)
        makedir_exist_ok(self.processed_folder)

        # download files
        for url in self.urls:
            filename = url.rpartition('/')[2]
            file_path = os.path.join(self.raw_folder, filename)
            download_url(url,
                         root=self.raw_folder,
                         filename=filename,
                         md5=None)
            self.extract_gzip(gzip_path=file_path, remove_finished=True)
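Example #35 also relies on an extract_gzip helper. A hedged sketch of what such a helper presumably does, mirroring the inline gunzip code spelled out in Examples #8 and #13 (the standalone function below is an assumption based on the call site, not the class's actual method):

import gzip
import os

def extract_gzip(gzip_path, remove_finished=False):
    # Decompress 'foo.gz' to 'foo', optionally deleting the archive afterwards.
    with open(gzip_path.replace('.gz', ''), 'wb') as out_f, \
            gzip.GzipFile(gzip_path) as zip_f:
        out_f.write(zip_f.read())
    if remove_finished:
        os.unlink(gzip_path)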
Example #36
 def run(self):
     fifa_season = utils.translate_season_to_fifa(self.season)
     league_int = utils.translate_league(self.league)
     url = f'https://www.fifaindex.com/teams/{fifa_season}_{self.match_day}/?league={league_int}'
     outpath = self.output()
     page = utils.download_url(url)
     outpath.makedirs()
     with open(outpath.path, 'wb') as f:
         f.write(page)
Example #37
File: sgan.py Project: fayiz7/sgan
    def __init__(self,
                 pkl_path=None,
                 from_scratch=False,
                 dim=(512, 512),
                 from_dir=None,
                 cond=False,
                 label_size=0):
        self.pkl_path = pkl_path
        self.dim = dim

        if self.pkl_path is None and from_dir is None:
            ffhq_pkl = 'stylegan2-ffhq-config-f.pkl'
            ffhq_url = f'http://d36zk2xti64re0.cloudfront.net/stylegan2/networks/{ffhq_pkl}'

            empty_pkl = create_model(height=dim[0],
                                     width=dim[1],
                                     cond=cond,
                                     label_size=label_size)

            if from_scratch:
                self.pkl_path = empty_pkl
            else:
                if not os.path.exists(ffhq_pkl):
                    download_url(ffhq_url, ffhq_pkl)
                self.pkl_path = 'surgery.pkl'
                copy_weights(ffhq_pkl, empty_pkl, self.pkl_path)

        if from_dir:
            curr_best = 0
            for pkl_file in glob.glob(f'{from_dir}/*.pkl'):
                ckpt_number = int(pkl_file.split('-')[-1][:-4])
                if curr_best < ckpt_number:
                    curr_best = ckpt_number
                    self.pkl_path = pkl_file

        dnnlib.tflib.init_tf()
        print('Loading networks from "%s"...' % self.pkl_path)
        with dnnlib.util.open_url(self.pkl_path) as fp:
            self._G, self._D, self.Gs = pickle.load(fp)
        self.noise_vars = [
            var for name, var in self.Gs.components.synthesis.vars.items()
            if name.startswith('noise')
        ]
Example #38
    def download(self) -> None:
        """Download the QMNIST data if it doesn't exist in processed_folder already.
           Note that we only download what has been asked for (argument 'what').
        """
        if self._check_exists():
            return
        os.makedirs(self.raw_folder, exist_ok=True)
        os.makedirs(self.processed_folder, exist_ok=True)
        split = self.resources[self.subsets[self.what]]
        files = []

        # download data files if not already there
        for url, md5 in split:
            filename = url.rpartition('/')[2]
            file_path = os.path.join(self.raw_folder, filename)
            if not os.path.isfile(file_path):
                download_url(url,
                             root=self.raw_folder,
                             filename=filename,
                             md5=md5)
            files.append(file_path)

        # process and save as torch files
        print('Processing...')
        data = read_sn3_pascalvincent_tensor(files[0])
        assert (data.dtype == torch.uint8)
        assert (data.ndimension() == 3)
        targets = read_sn3_pascalvincent_tensor(files[1]).long()
        assert (targets.ndimension() == 2)
        if self.what == 'test10k':
            data = data[0:10000, :, :].clone()
            targets = targets[0:10000, :].clone()
        if self.what == 'test50k':
            data = data[10000:, :, :].clone()
            targets = targets[10000:, :].clone()
        with open(os.path.join(self.processed_folder, self.data_file),
                  'wb') as f:
            torch.save((data, targets), f)
Example #39
def process(html_fobj):
    for row in get_all_listings(html_fobj):
        event_page = download_url(row['url'])

        if not is_single_event(event_page):
            L.info("Ignoring repeated event '{}'".format(row['headline']))
            continue

        event_page.seek(0)
        if is_child_event(event_page):
            L.info("Ignoring children's film '{}'".format(row['headline']))
            continue

        # TODO: row['description'] = parse_description(event_page)
        yield row
Example #40
 def locate_torrent_pages(self):
     # Crawl the site looking for the torrent detail pages
     
     while len(self.urls_torrent_page) < self.num_candidates and self.urls_to_see:
         
         url = self.urls_to_see.pop()
         
         if not url.startswith("http"):
             url = "%s%s%s"%(self.base_url, "" if url.startswith("/") else "/", url)
         
         if url in self.urls_visited:
             continue
         
         
         self.logger.info(u"Recorriendo %s"%url)
         html = download_url(url)
         
         
         self.add_urls(html)
         if self.is_torrent_page(html, url):
             
             self.urls_torrent_page[url] = 1
             self.logger.info("torrent page localizada (%d)"%len(self.urls_torrent_page))
             
         self.urls_visited.append(url)
         if len(self.urls_visited) % 100 == 0:
             self.logger.info("%d pages visited, %d torrents page located" % (len(self.urls_visited), len(self.urls_torrent_page)))
             
         if len(self.urls_visited) > 20000 or (len(self.urls_visited) > 1000 and len(self.urls_torrent_page) < (len(self.urls_visited)/1000)): 
             return False
         # Take it slowly to avoid bans
         time.sleep(3)
         
         
     if len(self.urls_torrent_page) < self.num_candidates:
         self.logger.info(len(self.urls_torrent_page))
         self.logger.info(self.urls_to_see)
         return False
         #~ raise Exception("no se han encontrado candidatas")
         
     if not self.all_torrent_pages_ok():
         self.locate_torrent_pages()
     
     
     # Save to the db
     self.db_conn.torrents.domain.update({"_id":self.get_id()},{"$set":{"tp":self.urls_torrent_page.keys()}},upsert=True)
     
     return self.urls_torrent_page
Example #41
    def download_file(self, mongodb_version, mongodb_edition,
                      destination=None):

        destination = destination or os.getcwd()

        url = self.get_download_url(mongodb_version, mongodb_edition)

        response = urllib.urlopen(url)

        if response.code == 404:
            raise FileNotInRepoError("File not found in repo")
        if response.getcode() != 200:
            msg = ("Unable to download from url '%s' (response code '%s'). "
                   "It could be that version '%s' you specified does not exist."
                   " Please double check the version you provide" %
                   (url, response.getcode(), mongodb_version))
            raise MongoctlException(msg)

        return download_url(url, destination,
                            show_errors=not is_interactive_mode())
Example #42
 def setupTests(self):
     zippedTests = download_url(getTestUrl(),dest=str(os.path.join(self.shellCacheDir,"tests.zip")))
     unzip(self.testDir,zippedTests)
Example #43
    def get_metas(self):
        self.logger.info("Busqueda de metadatos")
        
        data = {}
        l = 0
        
        if not self.urls_torrent_page:
            return False
        
        for url in self.urls_torrent_page:
            
            self.logger.info("[%d/%d]Extrayendo cadenas de %s"%( l, len(self.urls_torrent_page), url))
            l += 1
            try:
                html = download_url(url)
                doc = BeautifulSoup(html)
            except TypeError:
                self.logger.warning("No se ha podido cargar %s"%url)
                continue
                
            if not self.is_torrent_page(html, url):
                del self.urls_torrent_page[url]
                self.logger.warning("No es torrent page")
                return None

            if doc is None or doc.body is None:
                del self.urls_torrent_page[url]
                self.logger.warning("El doc obtenido no es valido")
                return None
                    
            strings = []
            for string in doc.body.stripped_strings:
                strings.append(string)
            data[url] = strings
            
        map_str = []
        for pos in xrange(0, len(strings)): 
            self.logger.info("Analizando [%d/%d]"%(pos, len(strings)))
            equal = True
            first = True
            previous = None
            for url in data:
                if previous is None:
                    previous = url
                else:
                    try:
                        if len(data[url]) < pos or len(data[previous]) < pos  or (data[url][pos] != data[previous][pos]):
                            if first:
                                first = False
                                continue
                            equal = False
                            break;
                    except IndexError:
                        if first:
                            first = False
                            continue
                        equal = False
                        break; 
            
                
            map_str.append(equal)
        
        
        #~ data = {}
        rt = {}
        duplicated = []
        
        self.logger.info("Busqueda atributos")
        
        # search for attributes for each url
        for url in self.urls_torrent_page:
            
            #~ self.logger.info("Procesando cadenas %s"%url)
            
            pos = 0
            last_equal = True
            
            metadata = {"infohash" : None,
                        "size" : None,
                        "description" : None,
                        "title" : None, 
                        "category" : None,
                        "tags" : None,
                        "quality" : None,
            
                        "genre" : None,

                        #~ "series" : None,
                        "season" : None,
                        "episode" : None,
            
                        "language" : None}
            
            
            pos = 0
            last_equal = True
            
            next_is_description = False
            
            doc = BeautifulSoup(download_url(url))
            try:
                h1 = doc("h1")[0].stripped_strings.next()
                if len(doc("h1"))> 0 and is_title(h1, url, full = True):
                    metadata['title'] = h1
                else:
                    h2 = doc("h2")[0].stripped_strings.next()
                    if doc("h2") and is_title(h2, url, full  = True):
                        metadata['title'] = h2
            except IndexError:
                pass
                    
            
            #~ print metadata['title']
            
            meta = {}
            for equal in map_str:
                # What look like metadata fields offered by the page
                if not equal and last_equal:
                    #~ print pos, "[%s]"%data[self.urls_torrent_page[0]][pos-1], data[self.urls_torrent_page[0]][pos]
                    if pos>0:
                        prev = data[url][pos-1]
                        token = data[url][pos]
                        if len(prev) < 20 and not prev.isnumeric() and len(prev) > 2 and prev.count(" ")<2:
                            meta[prev.replace(":","").lower()]  = token
                pos += 1
                last_equal = equal
                
            for m in metadata:
                if m in meta:
                    metadata[m] = meta[m]
            
            pos = 0
            last_equal = True
            
            
            #~ print url
            #~ print metadata
            #~ print "*"*44
            # Brute-force search
            already_title = False
            
            search_tags = True
            
            for equal in map_str:
                
                if pos < len(data[url]):
                    token = data[url][pos]
                
                #~ print token
                # Don't start searching until the title appears
                if not already_title:
                    if not(len(token) > 5 and is_title(token.lower(), url, full = True)):
                        pos += 1
                        continue
                    already_title = True
                
                #~ print "*****************"
                
                # The description is usually near the end
                ending = ["comment", "related", "similar"]

                if any([w in token.lower() for w in ending]) and pos > len(data[url]) * 0.4:
                    #~ print pos, len(data[url])
                    # nothing interesting after the comments or related files
                    #~ exit()
                    #~ print token
                    #~ print "*********ENDING***************"
                    break
                    
                if search_tags:
                    if any([w in token.lower() for w in ending]):
                        search_tags = False
                
                if pos > (len(map_str) * 0.75):
                    
                    # This is the end of the page and there is nothing interesting left
                    #~ print token
                    #~ print "*********ENDING LARGE***************"
                    break
                
                if is_script(token):
                    pos += 1
                    last_equal = equal
                    continue
                if not equal:
                    if search_tags:
                        tag = is_tag(token)
                        if tag:
                            if metadata['tags'] is None or isinstance(metadata['tags'], basestring):
                                metadata['tags'] = {}
                            
                            if not tag in metadata['tags']:
                                try:
                                    metadata['tags'][tag] = []
                                except:
                                    print metadata
                                    print metadata['tags']
                                    print tag
                                    raise
                            if not token in metadata['tags'][tag]:
                                metadata['tags'][tag].append(token)
                    
                    
                    if metadata['description'] is None or len(metadata['description']) < len(token):
                        # look for short descriptions unless longer ones exist
                        if len(token) > 100 or next_is_description:
                            #~ print url
                            #~ print "--------------"
                            #~ print "token",token
                            #~ print "desc_candidate", desc_candidate
                            if not equal and not is_script(token) and is_description(token):
                                metadata['description'] = token
                                #~ print url
                                #~ print "metadata['description']", metadata['description']
                            next_is_description = False
                            
                    
                    if "description" in token:
                        next_is_description = True
                        
                    if metadata['title'] is None:
                        if len(token) > 5 and is_title(token, url):
                            metadata['title'] = token

                    if metadata['season'] is None or metadata['episode'] is None:
                        if len(token) > 3:
                            se = is_season_episode(token)
                            if se:
                                metadata['season'] = {"token" : token, "value" : se['s']}
                                metadata['episode'] = {"token" : token, "value" : se['e']}
                                

                    if metadata['size'] is None:
                        if len(token) > 2:
                            z = is_size(token)
                            if z:
                                metadata['size'] = {"token" : token, "value":z}

                    if metadata['infohash'] is None:
                        if "hash" in token or len(token) == 40:
                            metadata['infohash'] = extract_infohash(token)
                    
                    
                    #~ if metadata['category'] is None:
                        #~ if is_category(token):
                            #~ print url
                            #~ print "\t\t\t", token
                            #~ print get_xpath_from_soup_object(token)
                            #~ metadata['category'] = token
                    
                    if metadata['language'] is None:
                        if is_language(token):
                            metadata['language'] = token
                
                pos += 1
                last_equal = equal
            
            #~ print
            #~ print
            #~ print url
            #~ print metadata
            #~ print "--"
            #~ print url
            #~ if not metadata['tags'] is None:
                #~ print metadata['tags']
            
            #~ self.logger.info("Extrayendo xpaths %s"%url)
            xpath = XPath(url)
            if not xpath:
                del self.urls_torrent_page[url]
                self.logger.warning("No se pueden xpathear %s" % url)
                return None
            
            
            _metadata = {}
            
            
            for m, v in metadata.items():
                
                #~ print m, v
                
                if v:
                        
                    if type(v) == type({}):
                        if not "value" in v:
                            value = ",".join([",".join(keywords) for keywords in v.values()])
                            token = value.split(",")
                            #~ print token
                        else:
                            value = v['value']
                            token = v['token']
                    else:
                        value = v
                        token = v
                    
                    def extract_token(token, value):
                        #~ print "extrayendo %s - %s"%(token, value)
                        try:
                            xp = xpath.get_xpath(u(token)) 
                        except UnicodeDecodeError:
                            #~ print "unicode"
                            return False
                        
                        if xp is None:
                            #~ print "xp"
                            return False
                        #~ print "XP", xp
                        extract = xpath.extract(xp)
                        if not extract:
                            #~ print "no extract", token, xp 
                            return False
                        
                        #~ print ".."
                        #~ print element_2_str(extract)
                        
                        #~ print "TOKEN-EXTRACT", token.strip(), extract.strip()
                        
                        if len(extract) > 0 or not token.strip() == extract.strip():
                            #~ print "[%s][%s]"%(token, extract[0])
                            ok = True
                            if xpath.last_expansive:
                                ok = False
                                try:
                                    if token.strip() in element_2_str(extract):
                                        # it is correct, probably the description
                                        ok = True
                                    else:
                                        #~ self.logger.warning("else last_expansive")
                                        return False
                                except:
                                    return False

                            if not ok:
                                if not token.strip() in extract.strip():
                                    print("No coincide %s[%s] para %s"%(xp, extract, token))
                                    self.logger.error("No coincide %s[%s] para %s"%(xp, extract, token))
                                    raise Exception("Incoherencia xpath")
                                
                                #~ self.logger.warning("No se puede extrar el xpath de %s en %s"%(token, url)    )
                                return False
                            
                        
                        
                        id_m = m
                        if m == "tags":
                            tg = is_tag(value)
                            if tg:
                                id_m = tg.split("_")[0]
                        
                        if id_m in _metadata and _metadata[id_m]['xpath'] != xp and id_m != "category":
                            duplicated.append(id_m)
                        
                        # For the "language" metadata, no link xpaths are stored, since those are usually the page's language selectors
                        if id_m == "language" and "/a/" in xp:
                            return False 
                        
                        # h1 only for the title
                        if id_m != "title" and "/h1" in xp:
                            return False
                        
                        if "'tab-main'" in xp and id_m == "subcategory":
                            print
                            print
                            print
                            print url
                            print id_m
                            print _metadata
                            print value
                            print xp
                            exit()
                            
                        
                        # Nothing hanging from comments, script or style tags is stored
                        invalid = ["comment", "script", "style", "select"]
                        
                        
                        if not any([w in xp for w in invalid]) :
                            _metadata[id_m] = {"value" : value, "xpath" : xp}
                            
                        #~ print "_METADATA", _metadata
                    
                    if type(token) == type([]):
                        for t in token:
                            #~ print "\t"+t
                            extract_token(t, t)
                    else:
                        extract_token(token, value)
                
            # avoid confusing description with title
            if "description" in _metadata and "title" in _metadata:
                if _metadata['title']['xpath'] == _metadata['description']['xpath']:
                    del _metadata['description']
            
            #~ print "%s:%s"%(url, _metadata)
            #~ exit()
            for d in duplicated:
                if d in _metadata:
                    del _metadata[d]
            
            duplicated = []
            
            rt[url] = { "metadata" : _metadata, "meta" : meta}
            
            #~ pprint(rt[url]['metadata'])
            #~ exit()
            #~ print
            #~ print
            #~ print
            #~ print "**********************"
            #~ print url
            #~ print rt[url]['metadata']
            #~ print "**********************"
            

        #~ for url, d in rt.items():
            #~ print url
            #~ for k,v in d['metadata'].items():
                #~ if not v is None:
                    #~ print "%s: {%s:%s}"%(k, v['value'], v['xpath'])
                #~ 
            #~ print
            #~ for k, v in d['meta'].items():
                #~ if not v is None:
                    #~ print "[%s] "%(k)
            #~ print 
            #~ print "**************"
        
        
        metas_ocurrences = {}
        for url, d in rt.items():
            #~ print url
            for k,v in d['metadata'].items():
                #~ print k, v
                if not k in metas_ocurrences:
                    metas_ocurrences[k] = {}
                value = v['xpath']
                if not value in metas_ocurrences[k]:
                    metas_ocurrences[k][value] = 0
                metas_ocurrences[k][value] += 1
        
        #~ print "ocurrences"
        #~ print metas_ocurrences
        
        
        metas = {}
        for m, xpaths in metas_ocurrences.items():
            for xpath, count in xpaths.items():
                #~ if count > (self.num_candidates / 10):
                #~ if count > 0:
                if not m in metas:
                    metas[m] = {}
                    metas[m]["all"] = {}
                if not xpath in metas[m]["all"]:
                    metas[m]["all"][xpath] = count
                #~ metas[m]["all"] = "(%d)%s"%(count, xpath)
                
        # check the "weak" metas and drop them unless we are confident they are correct
        
        for weak in self.weak_metas:
            if weak in metas:
                ok = False
                sum_counts = sum(metas[weak]['all'].values())
                for count in metas[weak]['all'].values():
                    if count > (sum_counts / 3):
                        ok = True
                if not ok:
                    del metas[weak]
        
                    
        
        
        self.metas = metas
                  
        return True
Example #44
def upload():
    # Get priority
    priority = int(request.form.get('priority', PRIORITY.medium))
    if priority not in PRIORITY.get_values():
        priority = PRIORITY.medium

    # Get output formats
    output_formats = request.form.get('output-formats', '')
    output_formats = list(set(
        filter(
            lambda format: format in app.config['ALLOWED_EXTENSIONS'],
            output_formats.split(';')
        )
    ))
    if not output_formats:
        return jsonify({'Error': 'Must provide valid output formats'}), 400

    # Get file (either directly or via URL)
    file = request.files.get('file')
    allowed_extensions = app.config['ALLOWED_EXTENSIONS']

    if file:
        if allowed_filename(file.filename, allowed_extensions):
            filename = secure_filename(file.filename).strip()[-FILE_NAME_LIMIT:]
            local_path = os.path.join(app.config['UPLOAD_FOLDER'],
                                      timestamp_filename(filename))
            file.save(local_path)
        else:
            return jsonify({'Error': 'File format not allowed'}), 400
    else:
        fileURL = request.form.get('fileURL')
        if fileURL:
            filename = get_filename_from_url(fileURL)

            try:
                local_path = download_url(
                    fileURL, app.config['UPLOAD_FOLDER'], timestamp=True)
            except FileAccessDenied as fad:
                return jsonify({
                    'status': 'error',
                    'code': fad.status_code,
                    'message': fad.message
                }), 500

        else:
            return jsonify({'status': 'error',
                            'message': 'Unable to decode uploaded file'}), 500

    # Upload to remote and remove file from local
    remote_destination = os.path.join(app.config['REMOTE_INPUT_FOLDER'],
                                      get_uuid(), filename)
    upload_to_remote(remote_destination, local_path)
    os.remove(local_path)

    # Register the file for conversions and return docIds
    docIds = Conversion.register_file(filename, remote_destination,
                                      g.user, output_formats, priority)

    # Call request fetcher
    request_fetcher.delay()

    return jsonify({'status': STATUS.introduced, 'doc_ids': docIds})
Example #45
    def get_image(self, mode = 0):
        #-----------------
        # Values for mode. Used to decide whether an image qualifies as a candidate based on its size:
        # 0 -> width and height > 100
        # 1 -> width and height >= 100
        # 2 -> width or height > 100
        # 3 -> width or height >= 100
        
        if mode>3: return None
        
        
        data = {}
        loop = 0
        
        blacklist = ["avatar", "promo", "category", "categories", "user", "ads"]
        
        domain = self.get_id()
        images = {}
        
        for url in self.urls_torrent_page:
            self.logger.info("Analizando %s en busca de imagenes [%d/%d]"%(url, loop, len(self.urls_torrent_page)))
            loop += 1
            #~ if loop > 10:
                #~ break
            doc = BeautifulSoup(download_url(url))
            imgs = []

            xpath = XPath(url)
            title = None
            
            if xpath:
            
                if 'title' in self.metas:
                    for xp in self.metas['title']['all']:
                        extract = xpath.extract(xp)
                        rt = is_title(extract, url)
                        if rt:
                            title = extract
                                
                                
            for img in doc("img"):
                
                #~ print title, img.get("alt"), img.get("src")
                
                
                if img.get("alt") and title and title in img.get("alt"):
                    images[url] = {"img":img.get("src"), "xpath":xpath.get_xpath_img(clean_url_img(img.get("src"), domain))}
                    self.logger.info("Imagen localizada %s"%images[url])
                    #the one
                    imgs = [img.get("src")]
                    break
                
                imgs.append(img.get("src"))
                
            
            
            data[url] = imgs
                
        
        commons = []
        
        
        
        
        for url in data:
            for img in data[url]:
                if not img in images:
                    images[img] = 0
                images[img] += 1
        
        for img, count in images.items():
            if count>(len(self.urls_torrent_page) / 2):
                commons.append(img)
        
        
        images = {}
        loop = 0
        for url in self.urls_torrent_page:
            self.logger.info("Buscando la principal en %s [%d/%d]"%(url, loop, len(self.urls_torrent_page)))
            loop += 1
            
            #~ if loop > 10:
                #~ break
            img_candidates = {}
            
            for img in data[url]:
                if not img in commons and not ".." in img and not any([w in img.lower() for w in blacklist]):
                    try:
                        self.logger.info("Salvando temporal de %s" % img)
                        im = Image.open(save_tmp(img, domain))
                        width, height = im.size
                        print img, width, height
                        # see the comment at the top of the function for how mode behaves
                        if mode == 0:
                            if width > 100 and height > 100:
                                img_candidates[img] = "%sx%s"%(width, height)
                        if mode == 1:
                            if width >= 100 and height >= 100:
                                img_candidates[img] = "%sx%s"%(width, height)    
                        if mode == 2:
                            if width > 100 or height > 100:
                                img_candidates[img] = "%sx%s"%(width, height)
                        if mode == 3:
                            if width >= 100 or height >= 100:
                                img_candidates[img] = "%sx%s"%(width, height)    
                        
                       

                    except IOError as e:
                        self.logger.error("IOError %s: %s "%(e, img))
                        pass
                        
            
            #~ print img_candidates
             # If sizes repeat, the images cancel each other out
            def size_equal(s1, s2):
                ss1 = s1.split("x")
                ss2 = s2.split("x")
                return abs(int(ss1[0])-int(ss2[0])) < 3 and abs(int(ss1[1])-int(ss2[1])) < 3
            
            
            #~ print "*********"
            #~ pprint(img_candidates)
            
            no_candidates = []
            for img in img_candidates:
                size = img_candidates[img]
                for img2 in img_candidates:
                    if img != img2 and size_equal( size, img_candidates[img2]):
                        no_candidates.append(img)
            
            
            for no in no_candidates:
                if no in img_candidates:
                    del img_candidates[no]
            
            
            #~ pprint.pprint(img_candidates)
            no_candidates = []
            if len(img_candidates) > 1:
                #Try to discard images not hosted on the site's own domain
                for img in img_candidates:
                    if not domain in img:
                        no_candidates.append(img)

            for no in no_candidates:
                if no in img_candidates:
                    del img_candidates[no]

            images[url] = None
            if len(img_candidates) == 1:
                #Image found
                xpath = XPath(url)
                img = clean_url_img(img_candidates.keys()[0], domain)
                images[url] = {"img":img, "xpath":xpath.get_xpath_img(img)}
                self.logger.info("Image located %s" % images[url])
                
            
        xpaths = {}
        #Tally every xpath that appeared
        for img, v in images.items():
            if v is None:
                continue
            xpath = v['xpath']
            
            if xpath:
                #ignore logos
                if 'logo' in xpath:
                    continue
                
                if not xpath in xpaths:
                    xpaths[xpath] = 0
                xpaths[xpath] += 1
        
        
        #Keep the most common xpath if it appears a minimum number of times
        current_xpath = None
        max_count = 0
        for xpath, count in xpaths.items():
            #~ print count, max_count
            if count > max_count:
                max_count = count
                current_xpath = xpath
        
        
        #~ print current_xpath
        #~ print max_count , (len(self.urls_torrent_page) / 10)
        #~ if max_count >= (len(self.urls_torrent_page) / 10):
        if max_count >= 1:
            #A single xpath is enough
            self.metas['image'] = {"candidate":current_xpath, "all":xpaths}
            return current_xpath
        
        
        #Nothing conclusive with this mode; retry with the next, more permissive one
        return self.get_image(mode + 1)
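A minimal standalone sketch of the "common image" heuristic used above: every image src found on the sampled torrent pages is counted, and any src that appears on more than half of the pages is assumed to be site chrome (logo, buttons) rather than the torrent's own artwork. The page_images argument is illustrative and corresponds to the per-URL data dict built in the method.

def find_common_images(page_images):
    """page_images: dict mapping page URL -> list of image src strings."""
    counts = {}
    for srcs in page_images.values():
        for src in set(srcs):  # count each src at most once per page
            counts[src] = counts.get(src, 0) + 1
    threshold = len(page_images) / 2.0
    return [src for src, count in counts.items() if count > threshold]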
Example #46
0
    def get_category(self):
        
        
        
        if 'category' in self.metas and 'all' in self.metas['category'] and sum(v for v in self.metas['category']['all'].values()) > (len(self.urls_torrent_page)/3):
            print "****************************"
            print self.metas['category']
            print "ya tiene"
            return True
            
        if not self.urls_torrent_page:
            return False
        
        blacklist = ["user", "download", ".torrent","magnet", "api", "about", "privacy", "register", "contact", "recover"
                , "latest", "popular", "request", "rss", "faq"]
        
        data = {}
        for url in self.urls_torrent_page:
            #~ print url
            doc = BeautifulSoup(download_url(url))
            #~ print "cargando doc", url
            links = []
            
            #Look for links that look like category links
            for link in doc("a"):
                href = link.get("href")
                if href is None:
                    continue
                if href.startswith("/") and not href.startswith("//"): href = "/".join(url.split("/")[:3]) + href
                if not any([w in href.lower() for w in blacklist]) and href.startswith("/".join(url.split("/")[:3])) and href != "/".join(url.split("/")[:3]):
                    if not href in links and is_category(link.string):
                        xp = get_xpath_from_soup_object(link)
                        links.append((href, link.string, xp))
                        
            data[url] = links   

        map_links = {}
        for url, links in data.items():
            
            #~ print url, links
            pos = 0
             
            for link in links:
                #~ print link
                if not pos in map_links.keys():
                    map_links[pos] = []
                    
                _id = "%s|||%s|||%s" % link 
                if not _id in map_links[pos]:
                    map_links[pos].append(_id)
                    
                
                pos += 1
                #~ print "\t:%s, %s, %s"%link
        
        
        
        xp = None
        for pos, xpaths in map_links.items():
            #~ print pos, len(xpaths), xpaths
            if len(xpaths) > 1:
                xp = xpaths[0].split("|||")[-1]
                break
        
        
        if not xp:
            xpath_cat = {}
            
            for url in self.urls_torrent_page:
                #Try to find it via a breadcrumb trail
                _doc = BeautifulSoup(download_url(url))
                _xpath = XPath(url)
                
                next_cat = False
                for string in _doc.stripped_strings:
                    
                    if next_cat:
                        if is_category(string):
                            _xp =  _xpath.get_xpath(string)
                            if not _xp in xpath_cat:
                                 xpath_cat[_xp] = 0
                            xpath_cat[_xp] += 1
                            
                            
                        next_cat = False
                    if u">" in string and not u"<" in string or u"»" in string and not is_script(string):
                        next_cat = True
                
            for _xp in xpath_cat:
                if xpath_cat[_xp] > (len(self.urls_torrent_page) * 0.75):
                    xp = _xp
            
            
            
        if not xp:
            #Fall back to looking in the URL itself
            pos = 0
            for url_part in url.split("/"):
                if (is_category(url_part)):
                    #~ print "category(url)", url_part
                    xp = "@url[%d]"%pos
                pos += 1
        
        
        
        #~ print "..........."
        #~ print url.split("/")[2]
        #~ print url
        
        
        #~ print xp
        _all = {} 
        for url in self.urls_torrent_page:
            if xp is not None:
                if "@url" in xp:
                    if not xp in _all:
                        _all[xp] = 0
                    _all[xp] += 1
                    #~ self.metas['category'] = xp
                    #~ return xp
                try:
                    extract = XPath(url).extract(xp)
                except Exception:
                    continue
                
                
                if len(extract) > 0 and is_category(extract):
                    if not xp in _all:
                        _all[xp] = 0
                    _all[xp] += 1
                    #~ self.metas['category'] = xp
                    #~ print self.metas['category']
                    #~ exit()
                    #~ return xp
        if not "category" in self.metas:
            self.metas['category'] = {}
        if not "all" in self.metas['category']:
            self.metas['category']['all'] = {}
            
        for a in _all:
            #If this method accounts for the majority, drop everything else
            if _all[a] > (len(self.urls_torrent_page) * 0.6):
                self.metas['category']['all'] = {}
                
            if not a in self.metas['category']['all']:
                self.metas['category']['all'][a] = 0
            
            self.metas['category']['all'][a] += _all[a]
        #If any entry comes from @url, only that one counts
        for a in self.metas['category']['all']:
            if "@url" in a:
                count = self.metas['category']['all'][a]
                self.metas['category']['all'] = {}
                self.metas['category']['all'][a] = count
        
        return _all
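
# A rough standalone sketch of the breadcrumb fallback used in get_category above: scan the
# page's visible strings and treat whatever follows a ">" or "»" separator as a category
# candidate. BeautifulSoup 4 and the simple is_category default below are stand-ins for the
# project's own helpers, not its actual implementations.
from bs4 import BeautifulSoup

def breadcrumb_categories(html, is_category=lambda s: 0 < len(s) < 30):
    candidates = []
    expect_category = False
    for text in BeautifulSoup(html, "html.parser").stripped_strings:
        if expect_category:
            if is_category(text):
                candidates.append(text)
            expect_category = False
        if u">" in text or u"»" in text:  # breadcrumb separators
            expect_category = True
    return candidates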
def main():
    for row in process(download_url(URL)):
        yield row
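Every example on this page relies on some project-local download_url helper, and its exact behaviour differs from project to project (some variants return the response body, others save the file to disk and return a local path). A minimal stand-in for the body-returning variant, using requests, not taken from any of these projects:

import requests

def download_url(url, timeout=30):
    """Illustrative stand-in: fetch url and return the response body as text."""
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.text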
Example #48
0
    def bisectRecurse(self, testcondition=None, args_for_condition=[]):
        #Recursively build, run, and prompt
        verdict = ""
        current_revision = captureStdout(self.hgPrefix+["id","-i"])

        if self.remote:
            print "on current revision "+current_revision
            print "This would ask for a remote changeset, but it's not implemented yet."
            #TODO:
            #Remote bisection!
            #Step 1. Check if revision is in the archive
            #Step 2. If revision is not in the archive, set remote=False and continue (it will build and bisect that revision)
            #if not check_archived:
            #    set remote false and continue
            #else:
            #Step 3. If the revision is in the archive, download it and its corresponding tests
                #STEP3
                #1. Extract tests into some directory
                #2. Extract Nightly.app into "tests"
                #MozInstaller(src=, dest="", dest_app="Nightly.app")
                #3. run the following:
                #test_command = ['python', 'mochitest/runtests.py', '--appname=./Nightly.app/Contents/MacOS/firefox-bin', '--utility-path=bin', '--extra-profile-file=bin/plugins', '--certificate-path=certs', '--autorun', '--close-when-done', '--console-level=INFO', '--test-path=test_name']
                #output = captureStdout(test_command, ignoreStderr=True)
                #set verdict based on output
                #python mochitest/runtests.py --appname=./Nightly.app/Contents/MacOS/firefox-bin --utility-path=bin --extra-profile-file=bin/plugins --certificate-path=certs --autorun --close-when-done --console-level=INFO --test-path=test_name

                #example test name: Harness_sanity/test_sanityException.html
                #Step 4. Run and run test to get verdict
                #Step 5. Set verdict

        elif self.tryPusher:
            try:
                caller = BuildCaller(host=self.tryhost, port=int(self.tryport), data=current_revision)
                print "Getting revision "+current_revision+"..."
            except Exception:
                print "Failed to connect to trypusher. Make sure your settings are correct and that the trypusher server was started."
                exit()
            response = caller.getChangeset()
            print "Waiting on Mozilla Pulse for revision " + response + "..."
            url = caller.getURLResponse(response)
            print "the base is " +url_base(url)
            #Download it here
            #1. Download from url, extract to same place as tests
            #2. Run test or start browser.
            binary_path =  os.path.join(self.binaryDir,url_base(url))
            downloaded_binary = download_url(url, dest=str(binary_path))
            MozInstaller(src=str(binary_path), dest=str(self.testDir), dest_app="Nightly.app")
            #now nightly is installed in
            if sys.platform == "darwin":
                binary_path = os.path.join(self.testDir,"Nightly.app")
                runner = FirefoxRunner(binary=os.path.join(binary_path,"Contents","MacOS")+"/firefox-bin")
            elif sys.platform == "linux2":
                binary_path = os.path.join(self.testDir,"firefox")
                runner = FirefoxRunner(binary=binary_path)
            elif sys.platform == "win32" or sys.platform == "cygwin":
                binary_path = os.path.join(self.testDir,"firefox.exe")
                runner = FirefoxRunner(binary=binary_path)
            else:
                print "Your platform is not currently supported."
                quit()

            dest = runner.start()
            if not dest:
                print "Failed to start the downloaded binary"
                verdict == "skip"
            runner.wait()
            if verdict == "skip":
                pass
            elif testcondition is not None:
                #Support condition scripts where arg0 is the directory with the binary and tests
                args_to_pass = [self.testDir] + args_for_condition

                if hasattr(testcondition, "init"):
                    testcondition.init(args_to_pass)

                #TODO: refactor to use directories with revision numbers
                #8.2.11 - revision number can now be found in current_revision variable
                tmpdir = tempfile.mkdtemp()
                verdict = testcondition.interesting(args_to_pass,tmpdir)

                #Allow user to return true/false or bad/good
                if verdict != "bad" and verdict != "good":
                    verdict = "bad" if verdict else "good"
        else:
            try:
                self.build()
            except Exception:
                print "This build failed!"
                verdict = "skip"

            if verdict == "skip":
                pass
            elif testcondition is None:
                #Not using a test, interactive bisect begin!
                self.run()
            else:
                #Using Jesse's idea: import any testing script and run it as the truth condition
                args_to_pass = [self.objdir] + args_for_condition

                if hasattr(testcondition, "init"):
                    testcondition.init(args_to_pass)

                #TODO: refactor to use directories with revision numbers
                #8.2.11 - revision number can now be found in current_revision variable
                tmpdir = tempfile.mkdtemp()
                verdict = testcondition.interesting(args_to_pass,tmpdir)

                #Allow user to return true/false or bad/good
                if verdict != "bad" and verdict != "good":
                    verdict = "bad" if verdict else "good"

        while verdict not in ["good", "bad", "skip"]:
            verdict = raw_input("Was this commit good or bad? (type 'good', 'bad', or 'skip'): ")
            if verdict == 'g':
                verdict = "good"
            if verdict == 'b':
                verdict = "bad"
            if verdict == 's':
                verdict = "skip"

        # do hg bisect --good, --bad, or --skip
        verdictCommand = self.hgPrefix+["bisect","--"+verdict]
        print " ".join(verdictCommand)
        retval = captureStdout(verdictCommand)

        string_to_parse = str(retval)
        print string_to_parse

        self.check_done(string_to_parse)

        if retval.startswith("Testing changeset"):
            print "\n"

        self.bisectRecurse(testcondition=testcondition, args_for_condition=args_for_condition)
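The tail of bisectRecurse reduces to normalising the user's answer and passing it to hg bisect. A rough standalone equivalent using subprocess, where hg_prefix stands in for self.hgPrefix (for example ['hg', '-R', repo_dir]):

import subprocess

def apply_bisect_verdict(hg_prefix, verdict):
    """Normalise a good/bad/skip verdict and run the matching 'hg bisect' command."""
    aliases = {"g": "good", "b": "bad", "s": "skip"}
    verdict = aliases.get(verdict, verdict)
    if verdict not in ("good", "bad", "skip"):
        raise ValueError("verdict must be 'good', 'bad' or 'skip', got %r" % verdict)
    command = hg_prefix + ["bisect", "--" + verdict]
    output = subprocess.check_output(command)
    return output.decode("utf-8", "replace")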