Example #1
    def request(self, method, url, params=None, headers=None, to_json=True, data=None, **kwargs):
        """ Make request to TC API. """

        url, params, headers, data = self.prepare(url, params, headers, data)

        if self.options['cache']:
            rc.install_cache(self.options['cache'])

        elif type(self).cache_installed:
            rc.uninstall_cache()

        type(self).cache_installed = bool(self.options['cache'])

        try:
            response = rs.api.request(
                method, url, params=params, headers=headers, data=data, **kwargs)
            logger.debug(response.content)
            response.raise_for_status()
            if to_json:
                response = response.json()

        except (ValueError, rs.HTTPError):
            if locals().get('response') is not None:
                message = "%s: %s" % (response.status_code, response.content)
                raise TCException(message)
            raise

        return response
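
The cache handling above can be read in isolation: install the global cache when the client is configured with one, and uninstall it when a previously installed cache should no longer be used. A minimal sketch of that toggle, assuming `options` is the same configuration dict and `rc` is the `requests_cache` module:

import requests_cache as rc

def sync_cache(options, cache_installed):
    """Install or remove the global requests cache to match options['cache']."""
    if options.get('cache'):
        rc.install_cache(options['cache'])  # cache name, e.g. a sqlite file prefix
    elif cache_installed:
        rc.uninstall_cache()
    return bool(options.get('cache'))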
Example #2
def get_api_keyboards(verbose=False):
	"""
	Get Keyboards data from web api.

	Args:
		verbose(bool, default False): verbose output
	Returns:
		dict: Keyboard data
		None: if http request not successful
	"""
	api_url = "https://api.keyman.com/cloud/4.0/keyboards?version=10.0"
	headers = {'Content-Type': 'application/json',
		'Accept-Encoding': 'gzip, deflate, br'}
	home = str(Path.home())
	cache_dir = keyman_cache_dir()
	current_dir = os.getcwd()
	expire_after = datetime.timedelta(days=1)
	if not os.path.isdir(cache_dir):
		os.makedirs(cache_dir)
	os.chdir(cache_dir)
	requests_cache.install_cache(cache_name='keyman_cache', backend='sqlite', expire_after=expire_after)
	now = time.ctime(int(time.time()))
	response = requests.get(api_url, headers=headers)
	if verbose:
		print("Time: {0} / Used Cache: {1}".format(now, response.from_cache))
	os.chdir(current_dir)
	if response.status_code == 200:
#		return json.loads(response.content.decode('utf-8'))
		return response.json()
	else:
		return None
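
A minimal usage sketch for the helper above, assuming it is imported from its module; the structure of the returned dict is whatever the Keyman cloud API provides:

keyboards = get_api_keyboards(verbose=True)
if keyboards is None:
    print("Keyman API request failed")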
Example #3
    def setup(self):
        defaults = dict(name='Matkailu- ja kongressitoimisto')
        self.data_source, _ = DataSource.objects.get_or_create(id=self.name, defaults=defaults)
        self.tprek_data_source = DataSource.objects.get(id='tprek')

        ytj_ds, _ = DataSource.objects.get_or_create(defaults={'name': 'YTJ'}, id='ytj')

        org_args = dict(origin_id='0586977-6', data_source=ytj_ds)
        defaults = dict(name='Helsingin Markkinointi Oy')

        self.organization, _ = Organization.objects.get_or_create(
            defaults=defaults, **org_args)

        place_list = Place.objects.filter(data_source=self.tprek_data_source, deleted=False)
        deleted_place_list = Place.objects.filter(data_source=self.tprek_data_source,
                                                  deleted=True)
        # Get only places that have unique names
        place_list = place_list.annotate(count=Count('name_fi')).filter(count=1).values('id', 'origin_id', 'name_fi')
        deleted_place_list = deleted_place_list.annotate(count=Count('name_fi')).\
            filter(count=1).values('id', 'origin_id', 'name_fi', 'replaced_by_id')
        self.tprek_by_name = {p['name_fi'].lower(): (p['id'], p['origin_id']) for p in place_list}
        self.deleted_tprek_by_name = {
            p['name_fi'].lower(): (p['id'], p['origin_id'], p['replaced_by_id'])
            for p in deleted_place_list}

        if self.options['cached']:
            requests_cache.install_cache('matko')
Example #4
    def reset_cache(self, cache_duration=None):
        """Remove any cached singles or albums charts

        Because the UK Top40 charts only change once per week, :py:class:`Top40` will cache the results of singles and
        albums. This means that during the execution of a program, repeated calls to retrieve singles and albums chart
        information will only actually call the remote API once. If, for whatever reason, you need to ensure that an
        attempt to access single or album information actually results in a call to the remote API, then calling the
        :py:meth:`Top40.reset_cache` method will do this, by clearing down any existing cached chart information.

        If a cache is in place, then the results will also be cached across python runtime executions.

        Params:
            cache_duration (:py:class:`int`): If ``None`` we will uninstall the requests cache and the next
                read from the API will cause a remote call to be executed. Otherwise it specifies the number of
                seconds before the persistent cache will expire.
        """

        if cache_duration is None:
            # We are disabling the existing persistent_cache
            requests_cache.uninstall_cache()
        else:
            # We are setting a persistent cache so insert the duration into our cache config
            self.cache_config['expire_after'] = cache_duration

            # and then install the cache with this configuration
            requests_cache.install_cache(**self.cache_config)

        # Remember the new duration
        self.cache_duration = cache_duration

        # Reset the in-memory caches to force a read from the remote site
        self._albums_chart = None
        self._singles_chart = None
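
A short usage sketch of the behaviour described in the docstring; the no-argument `Top40()` constructor shown here is an assumption:

top40 = Top40()                          # constructor signature assumed
top40.reset_cache(cache_duration=3600)   # reinstall the persistent cache with a one-hour expiry
top40.reset_cache()                      # uninstall the requests cache; the next read hits the remote API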
Example #5
def __get_session() -> requests.Session:
    """
    Get or create a requests session for MTGStocks.
    :return: Session data
    """
    if mtgjson4.USE_CACHE.get():
        requests_cache.install_cache(
            "stocks_cache",
            backend="sqlite",
            expire_after=mtgjson4.SESSION_CACHE_EXPIRE_STOCKS,
        )

    session: Optional[requests.Session] = SESSION.get(None)
    if session is None:
        session = requests.Session()

        if mtgjson4.CONFIG_PATH.is_file():
            # Open and read MTGJSON secret properties
            config = configparser.RawConfigParser()
            config.read(mtgjson4.CONFIG_PATH)
            SESSION_TOKEN.set(config.get("MTGStocks", "token"))

        session = util.retryable_session(session)
        SESSION.set(session)
    return session
Example #6
    def handle(self, *args, **options):
        self.logger = logging.getLogger(__name__)
        if options['cached']:
            import requests_cache
            requests_cache.install_cache("update-social")
        self.updater = FeedUpdater(self.logger)
        self.updater.update_feeds()
Example #7
    def __init__(self, acs, tz):
        self.acs = acs
        self.tz = tz
        # Install sqlite cache for celestrak with a 24 hour duration
        # Good enough for celestrak and other data. Cache disabled when appropriate
        requests_cache.install_cache('teeminus10_cache', expire_after=24*60*60)
        requests_cache.clear()
Example #8
def main():
    global args
    parser = argparse.ArgumentParser(description='Download the latest administrative divisions from the National Bureau of Statistics website')
    parser.add_argument('input', const="", default="", type=str, nargs="?")
    parser.add_argument("--sqlite3", type=str, help='SQLite file path')
    parser.add_argument("--mysql", type=str, help='mysql dsn')
    parser.add_argument('--mysql-host', type=str, help='mysql host')
    parser.add_argument('--mysql-port', type=str, help='mysql port')
    parser.add_argument('--mysql-user', type=str, help='mysql user')
    parser.add_argument('--mysql-password', type=str, help='mysql password')
    parser.add_argument('--mysql-database', type=str, help='mysql database')
    parser.add_argument('--skip-province', type=int, help='skip the x-th province')
    parser.add_argument('--verbose', '-v', action='count', help='print log output')
    parser.add_argument('--dump', action='store', default='txt', \
        help='output format: csv txt xml json jsonp')
    parser.add_argument('--dump-children', action='store_true', \
        help='print child regions')
    parser.add_argument('--region-type', action='store', default='province', \
        help='')
    parser.add_argument('--requests-cache', action='store', \
        default='/tmp/cnregion_requests_cache.sqlite')

    args = parser.parse_args(sys.argv[1:])
    requests_cache.install_cache(args.requests_cache)
    fetch.VERBOSE_LEVEL = args.verbose

    printer = Printer(args.dump)

    if args.region_type == "city":
    for province in fetch_provinces():
        print printer.province(province)

if "__main__" == __name__:
    main()
Example #9
def pytest_configure(config):
    if config.getoption('--use-cache'):
        import requests_cache
        requests_cache.install_cache('test_cache')
    api = Api()
    pytest.game_ids = api.GetSeasonGameIDs('2009-10', 'Regular Season')[:2]  # Hack to carry the gameids to tests
    pytest.game_ids = ['0020900292']
Example #10
    def __init__(self, cache, http_cfg):
        default_cfg = dict(stream=True, timeout=30.1)
        for it in default_cfg.items():
            http_cfg.setdefault(*it)
        self.config = DictLike(http_cfg)
        if cache:
            requests_cache.install_cache(**cache)
Example #11
def get_vhosts(ip, first=1, no_cache=False):
    """Returns a list of webs hosted on IP (checks bing.com)
    >>> 'www.bing.com' in vhosts(204.79.197.200)
    True
    """

    if not no_cache:
        homedir = pwd.getpwuid(os.getuid()).pw_dir
        requests_cache.install_cache(homedir + '/.habu_requests_cache')

    url = "http://www.bing.com/search?q=ip:{ip}&first={first}".format(ip=ip, first=first)

    response = requests.get(url)

    soup = BeautifulSoup(response.text, "html.parser")

    vhosts = set()

    for h2 in soup.find_all('h2'):
        for link in h2.find_all('a'):
            href = link.get('href')

            if href.startswith('http://') or href.startswith('https://'):
                vhost = href.split('/')[2]
                vhosts.add(vhost)

    return list(vhosts)
Example #12
    def setUp(self):
        requests_cache.install_cache(
            cache_name=os.path.join(os.path.dirname(__file__), "test"),
            allowable_methods=('GET', 'POST')
        )
        self.ts_beg = datetime.datetime(2015, 3, 5, 0)
        self.ts_end = datetime.datetime(2015, 3, 5, 3)
Example #13
    def command(self):
        self._load_config()

        import ckan.model as model

        # Cache all HTTP requests for 24 hours
        requests_cache.install_cache('opennames_cache', expire_after=86400)

        rev = model.repo.new_revision()

        print "Processing Organisations"
        for entities in self.opennames_entity_generator():
            for entity in self.closed_generator(entities):
                group = model.Group.get(entity)
                if not group:
                    print "Group {} does not exist".format(entity)
                    continue

                print "Updating {}".format(entity)
                group.extras['closed'] = True
                model.Session.add(group)
                model.Session.commit()

        print "Processing PCTs"
        for trust in self.pcts():
            print "Updating {}".format(trust.name)
            trust.extras['closed'] = True
            trust.extras['replaced_by'] = "national-health-service"
            model.Session.add(trust)
            model.Session.commit()

        model.repo.commit_and_remove()
Example #14
def enable_cache(fileprefix, cachetype, expiry):
    """
    If the requests_cache package is available, install a cache and
    begin using it globally. Returns True if caching was successfully
    enabled, and False otherwise (failed to enable, or enabled
    already)
    """

    global _CACHE_INSTALLED

    if _CACHE_INSTALLED:
        return False

    try:
        from requests_cache import install_cache
        from requests_cache.core import remove_expired_responses

        install_cache(fileprefix, cachetype, expire_after=expiry)
        remove_expired_responses()

    except ImportError:
        return False

    else:
        _CACHE_INSTALLED = True
        return True
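
A brief usage sketch, assuming the function above is importable; the cache prefix, backend, and expiry values are arbitrary:

if enable_cache("myapp_http", "sqlite", 12 * 3600):
    print("requests_cache installed; responses cached for 12 hours")
else:
    print("caching unavailable or already enabled")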
Example #15
def cmd_crtsh(domain, no_cache, no_validate, verbose):
    """Downloads the certificate transparency logs for a domain
    and checks with DNS queries whether each subdomain exists.

    Uses multithreading to improve the performance of the DNS queries.

    Example:

    \b
    $ sudo habu.crtsh securetia.com
    [
        "karma.securetia.com.",
        "www.securetia.com."
    ]
    """

    if verbose:
        logging.basicConfig(level=logging.INFO, format='%(message)s')

    if not no_cache:
        homedir = pwd.getpwuid(os.getuid()).pw_dir
        requests_cache.install_cache(homedir + '/.habu_requests_cache', expire_after=3600)

    subdomains = set()

    if verbose:
        print("Downloading subdomain list from https://crt.sh ...", file=sys.stderr)

    req = requests.get("https://crt.sh/?q=%.{d}&output=json".format(d=domain))

    if req.status_code != 200:
        print("[X] Information not available!")
        exit(1)

    json_data = json.loads(req.text)

    for data in json_data:
        name = data['name_value'].lower()
        if '*' not in name:
            subdomains.add(name)

    subdomains = list(subdomains)

    if no_validate:
        print(json.dumps(sorted(subdomains), indent=4))
        return True

    if verbose:
        print("Validating subdomains against DNS servers ...", file=sys.stderr)

    answers = query_bulk(subdomains)

    validated = []

    for answer in answers:
        if answer:
            validated.append(str(answer.qname))

    print(json.dumps(sorted(validated), indent=4))
    return True
Example #16
def crawl_command(args):
    requests_cache.install_cache('builder_stats')

    CBE_BASE = 'https://chrome-build-extract.appspot.com'
    MASTERS_URL = 'https://chrome-infra-stats.appspot.com/_ah/api/stats/v1/masters'
    master_names = requests.get(MASTERS_URL).json()['masters']

    builder_stats = []

    for master_name in master_names:
        cbe_master_url = '%s/get_master/%s' % (CBE_BASE, master_name)
        master_json = requests.get(cbe_master_url).json()
        # print master_json['slaves'].keys()
        for builder_name, builder_json in master_json['builders'].items():
            cbe_builds_url = '%s/get_builds' % CBE_BASE
            params = { 'master': master_name, 'builder': builder_name }
            response_json = requests.get(cbe_builds_url, params=params).json()
            builds = response_json['builds']
            if builds:
                finished_build = next(b for b in builds if b['eta'] is None)
                first_step_name = finished_build['steps'][0]['name']
            else:
                first_step_name = None
            builder_tuple = (master_name, builder_name, first_step_name, builder_json['slaves'])
            print builder_tuple
            builder_stats.append(builder_tuple)

    with open('builder_stats.json', 'w') as stats_file:
        json.dump(builder_stats, stats_file)
Example #17
def test_fred():

    filename = "fred"

    if expire_after>=0:
        requests_cache.install_cache(filename, backend='sqlite', expire_after=expire_after) # expiration seconds
        logging.info("Installing cache '%s.sqlite' with expire_after=%d (seconds)" % (filename, expire_after))
    if expire_after==0:
        logging.warning("expire_after==0 no cache expiration!")

    start = datetime.datetime(2010, 1, 1)
    end = datetime.datetime(2013, 1, 27)

    #name = "GDP"
    #name = "CPIAUCSL"
    #name = "CPILFESL"
    name = ["CPIAUCSL", "CPILFESL"]
    #name = ["CPIAUCSL", "CPILFESL", "ERROR"]


    data = MyDataReader("FRED").get(name, start, end)
    print(data)

    gdp = web.DataReader(name, "fred", start, end)

    print(gdp)
    print(type(gdp))
    print(gdp.ix['2013-01-01'])
    print(gdp.dtypes)

    diff = gdp - data
    assert(diff.sum().sum()==0)
Example #18
def cmd_cymon_ip_timeline(ip, no_cache, verbose, output, pretty):
    """Simple cymon API client.

    Prints the JSON result of a cymon IP timeline query.

    Example:

    \b
    $ habu.cymon.ip.timeline 8.8.8.8
    {
        "timeline": [
            {
                "time_label": "Aug. 18, 2018",
                "events": [
                    {
                        "description": "Posted: 2018-08-18 23:37:39 CEST IDS Alerts: 0 URLQuery Alerts: 1 ...",
                        "created": "2018-08-18T21:39:07Z",
                        "title": "Malicious activity reported by urlquery.net",
                        "details_url": "http://urlquery.net/report/b1393866-9b1f-4a8e-b02b-9636989050f3",
                        "tag": "malicious activity"
                    }
                ]
            },
            ...
    """

    habucfg = loadcfg()

    if 'CYMON_APIKEY' not in habucfg:
        print('You must provide a cymon apikey. Use the ~/.habu.json file (variable CYMON_APIKEY), or export the variable HABU_CYMON_APIKEY')
        print('Get your API key from https://www.cymon.io/')
        sys.exit(1)

    if verbose:
        logging.basicConfig(level=logging.INFO, format='%(message)s')

    if not no_cache:
        homedir = pwd.getpwuid(os.getuid()).pw_dir
        requests_cache.install_cache(homedir + '/.habu_requests_cache')

    url = 'https://www.cymon.io:443/api/nexus/v1/ip/{}/timeline/'.format(ip)
    headers = { 'Authorization': 'Token {}'.format(habucfg['CYMON_APIKEY']) }

    r = requests.get(url, headers=headers)

    if r.status_code not in [200, 404]:
        print('ERROR', r)
        return False

    if r.status_code == 404:
        print("Not Found")
        return False

    data = r.json()

    if pretty:
        output.write(pretty_print(data))
    else:
        output.write(json.dumps(data, indent=4))
        output.write('\n')
Example #19
    def _query_api(self):
        """
        Get data from MTA Service api endpoint.
        :return:
        """
        endpoint = "http://web.mta.info/status/serviceStatus.txt"
        requests_cache.install_cache('transit-cache', backend='sqlite', expire_after=180)
        raw_xml_data = requests.get(endpoint).text
        data = xmltodict.parse(raw_xml_data, dict_constructor=dict)
        response_code = data['service']['responsecode']

        if int(response_code) == 0:
            payload = {
                'Subway': {"name": "Subway", "status": self._parse_transit(data['service']['subway'])},
                'MTA Buses': {"name": "MTA Buses", "status": self._parse_transit(data['service']['bus'])},
                'Bridges & Tunnels': {"name": 'Bridges & Tunnels', "status": self._parse_transit(data['service']['BT'])},
                'LIRR': {"name": 'LIRR', "status": self._parse_transit(data['service']['LIRR'])},
                'Metro North': {"name": 'Metro North', "status": self._parse_transit(data['service']['MetroNorth'])},
            }

        else:
            payload = None
            #TODO: Raise a warning that response code was non-zero

        return payload
Example #20
    def test_expire_after_installed(self):
        requests_cache.install_cache(name=CACHE_NAME, backend=CACHE_BACKEND)
        requests_cache.expire_after('http://httpbin.org/get', 2)
        r = requests.get('http://httpbin.org/get')
        self.assertFalse(r.from_cache)
        r = requests.get('http://httpbin.org/get')
        self.assertTrue(r.from_cache)
Example #21
    def run(self, cache=True):
        """Run application."""

        self._query()

        # configure `requests` cache
        if cache:
            cache_dir = appdirs.user_cache_dir('craigslist')
            os.makedirs(cache_dir, exist_ok=True)
            requests_cache.install_cache(
                cache_name=os.path.join(cache_dir, 'craigslist'),
                expire_after=timedelta(hours=0.5))

        print('Running query...\n')

        # record the start time
        start = time.time()

        self.prices = self._getprices()

        # determine elapsed time of queries
        self.duration = time.time() - start

        # remove expired cache entries
        if cache:
            requests_cache.core.remove_expired_responses()

        # print statistics (if any price data exists)
        if self.prices:
            self._print()
        else:
            print('Nothing found for that search.')
Example #22
def perform_command(command, service, instance, cluster, verbose, soa_dir, app_id=None, delta=None):
    """Performs a start/stop/restart/status/scale on an instance
    :param command: String of start, stop, restart, status or scale
    :param service: service name
    :param instance: instance name, like "main" or "canary"
    :param cluster: cluster name
    :param verbose: bool if the output should be verbose or not
    :returns: A unix-style return code
    """
    marathon_config = marathon_tools.load_marathon_config()
    job_config = marathon_tools.load_marathon_service_config(service, instance, cluster, soa_dir=soa_dir)
    if not app_id:
        try:
            app_id = marathon_tools.create_complete_config(service, instance, marathon_config, soa_dir=soa_dir)['id']
        except NoDockerImageError:
            job_id = compose_job_id(service, instance)
            print "Docker image for %s not in deployments.json. Exiting. Has Jenkins deployed it?" % job_id
            return 1

    normal_instance_count = job_config.get_instances()
    normal_smartstack_count = marathon_tools.get_expected_instance_count_for_namespace(service, instance)
    proxy_port = marathon_tools.get_proxy_port_for_instance(service, instance, soa_dir=soa_dir)

    client = marathon_tools.get_marathon_client(marathon_config.get_url(), marathon_config.get_username(),
                                                marathon_config.get_password())
    if command == 'start':
        start_marathon_job(service, instance, app_id, normal_instance_count, client, cluster)
    elif command == 'stop':
        stop_marathon_job(service, instance, app_id, client, cluster)
    elif command == 'restart':
        restart_marathon_job(service, instance, app_id, normal_instance_count, client, cluster)
    elif command == 'status':
        # Setting up transparent cache for http API calls
        requests_cache.install_cache('paasta_serviceinit', backend='memory')

        print status_desired_state(service, instance, client, job_config)
        print status_marathon_job(service, instance, app_id, normal_instance_count, client)
        tasks, out = status_marathon_job_verbose(service, instance, client)
        if verbose:
            print out
        print status_mesos_tasks(service, instance, normal_instance_count)
        if verbose:
            print status_mesos_tasks_verbose(app_id, get_short_task_id)
        if proxy_port is not None:
            print status_smartstack_backends(
                service=service,
                instance=instance,
                cluster=cluster,
                job_config=job_config,
                tasks=tasks,
                expected_count=normal_smartstack_count,
                soa_dir=soa_dir,
                verbose=verbose,
            )
    elif command == 'scale':
        scale_marathon_job(service, instance, app_id, delta, client, cluster)
    else:
        # The command parser shouldn't have let us get this far...
        raise NotImplementedError("Command %s is not implemented!" % command)
    return 0
Example #23
def install_cache(expire_after=12 * 3600):
    """
    Patches the requests library with requests_cache.
    """
    requests_cache.install_cache(
        expire_after=expire_after,
        allowable_methods=('GET',))
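
Once the wrapper above has run, every `requests.get()` in the process is served through the cache; a sketch of how that can be checked (the URL is arbitrary, and `from_cache` is only present on responses returned while a cache is installed):

import requests

install_cache(expire_after=3600)
requests.get("https://httpbin.org/get")            # first call goes to the network
second = requests.get("https://httpbin.org/get")   # repeat call can be served from the cache
print(getattr(second, "from_cache", False))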
Example #24
def main():
    requests_cache.install_cache("british-library-catalog")
    session = requests_cache.CachedSession()
    session.hooks = {'response': make_throttle_hook(0.5)} # Be polite - less than 2 req/sec

    columns = '\t'.join(['Print ID', 'Scan ID', 'DOM IDs'])
    if FETCH_ARKS:
        columns += '\tARKs'
    print(columns)

    with open('metadata/booklist.tsv') as infile:
        for line in infile:
            if line.startswith('Aleph'): #skip header
                continue
            line = line.rstrip('\n')
            digitalId = line.split('\t')[0]
            originalId = getPrintId(session, digitalId)

            # This (disabled) section of code will translate lsid to ARK
            # in case we ever need it.  Right now the BL Viewer accepts raw
            # lsids, so it's unnecessary
            lsids = line.split('\t')[-1].split(' -- ')
            arks = []
            if FETCH_ARKS:
                for lsid in lsids:
                    ark = getARK(session, lsid)
                    arks.append(ark)

            print('\t'.join([str(originalId), digitalId,','.join(lsids),','.join(arks)]))
Example #25
def install_cache_requests():
    requests_cache.install_cache(**{
        'allowable_methods': ('GET', 'HEAD'),
        'cache_name': conf.REQUESTS_CACHE,
        'backend': 'sqlite',
        'fast_save': conf.ASYNC_CACHE_WRITES,
        'extension': '.sqlite3'})
Example #26
  def __init__(self, api_key, response_format='json'):
    super(OMIM, self).__init__()
    self.base_url = 'http://api.omim.org/api'
    self.format = response_format
    self.api_key = api_key

    requests_cache.install_cache('omim_cache', backend='sqlite', expire_after=8460000)
Example #27
def get_keyboard_data(keyboardID, weekCache=False):
	"""
	Get Keyboard or package data from web api.

	Args:
		keyboardID (str): Keyboard or package ID
		weekCache (bool) : cache data for 1 week, default is 1 day
	Returns:
		dict: Keyboard data
	"""
	logging.info("Getting data for keyboard %s", keyboardID)
	api_url = "https://api.keyman.com/keyboard/" + keyboardID
	logging.debug("At URL %s", api_url)
	home = str(Path.home())
	cache_dir = keyman_cache_dir()
	current_dir = os.getcwd()
	if weekCache:
		expire_after = datetime.timedelta(days=7)
	else:
		expire_after = datetime.timedelta(days=1)
	os.chdir(cache_dir)
	requests_cache.install_cache(cache_name='keyman_cache', backend='sqlite', expire_after=expire_after)
	now = time.ctime(int(time.time()))
	response = requests.get(api_url)
	logging.debug("Time: {0} / Used Cache: {1}".format(now, response.from_cache))
	os.chdir(current_dir)
	requests_cache.core.uninstall_cache()
	if response.status_code == 200:
		return response.json()
	else:
		return None
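
A brief usage sketch for the helper above; the keyboard ID is only an example value:

data = get_keyboard_data("sil_euro_latin", weekCache=True)
if data is None:
    print("Keyboard not found or the request failed")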
Example #28
def main(argv=None):
    args = parse_paasta_api_args()
    if args.debug:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.WARNING)

    if args.soa_dir:
        settings.soa_dir = args.soa_dir

    # Exit on exceptions while loading settings
    settings.cluster = load_system_paasta_config().get_cluster()

    marathon_config = marathon_tools.load_marathon_config()
    settings.marathon_client = marathon_tools.get_marathon_client(
        marathon_config.get_url(),
        marathon_config.get_username(),
        marathon_config.get_password()
    )

    # Set up transparent cache for http API calls. With expire_after, responses
    # are removed only when the same request is made. Expired storage is not a
    # concern here. Thus remove_expired_responses is not needed.
    requests_cache.install_cache("paasta-api", backend="memory", expire_after=30)

    server = WSGIServer(('', int(args.port)), make_app())
    log.info("paasta-api started on port %d with soa_dir %s" % (args.port, settings.soa_dir))

    try:
        server.serve_forever()
    except KeyboardInterrupt:
        sys.exit(0)
Example #29
    def handle(self, *args, **options):
        if options['cached']:
            requests_cache.install_cache('resources_import')

        importers = get_importers()
        imp_list = ', '.join(sorted(importers.keys()))
        imp_name = options.get('module')
        if not imp_name:
            raise CommandError("Enter the name of the importer module. Valid importers: %s" % imp_list)
        if imp_name not in importers:
            raise CommandError("Importer %s not found. Valid importers: %s" % (args[0], imp_list))
        imp_class = importers[imp_name]
        importer = imp_class(options)

        # Activate the default language for the duration of the import
        # to make sure translated fields are populated correctly.
        default_language = settings.LANGUAGES[0][0]
        for imp_type in self.importer_types:
            name = "import_%s" % imp_type
            method = getattr(importer, name, None)
            if options[imp_type]:
                if not method:
                    raise CommandError("Importer %s does not support importing %s" % (name, imp_type))
            else:
                if not options['all']:
                    continue

            if method:
                with override(default_language), transaction.atomic():
                    kwargs = {}
                    url = options.pop('url', None)
                    if url:
                        kwargs['url'] = url
                    method(**kwargs)
Example #30
def cli(debug):
    log_level = logging.INFO
    requests_cache.install_cache('fr_cache', expire_after=60*60*24*3)  # 3 days
    if debug:
        log_level = logging.DEBUG
        sys.excepthook = lambda t, v, tb: ipdb.post_mortem(tb)
    coloredlogs.install(level=log_level, fmt="%(levelname)s %(message)s")
Example #31
    pass

import os
import re
import sys

import requests
import getpass
import json

try:
    import requests_cache
except ImportError:
    print("no cache")
else:
    requests_cache.install_cache("gh_api")

# Keyring stores passwords by a 'username', but we're not storing a username and
# password
fake_username = '******'


class Obj(dict):
    """Dictionary with attribute access to names."""
    def __getattr__(self, name):
        try:
            return self[name]
        except KeyError:
            raise AttributeError(name)

    def __setattr__(self, name, val):
Example #32
# -*- coding: utf-8 -*-

import json
import math
import os
import random
import re
import time
import urllib.parse

import requests
import requests_cache
from bs4 import BeautifulSoup
from fuzzywuzzy import fuzz, process

requests_cache.install_cache('worldstate_cache', expire_after=60)


def get_worldstate():
    """Get world state json.

    Return a very complicated nested array
    """
    wsurl = 'http://content.warframe.com/dynamic/worldState.php'
    ws = requests.get(wsurl, timeout=30).json()
    return ws


data_files = {
    'solNodes.json': 'S',
    'languages.json': 'L',
Example #33
def license_check(f):
    requests_cache.install_cache('github_cache',
                                 backend='sqlite',
                                 expire_after=3600000)
    token = get_token()
    list_all_repos = all_repositories(token, f)
    list_repos_without_license_file = []
    list_without_licence = []

    for every_repo in list_all_repos:
        print('REPO: ', every_repo, file=f)
        contents_url = json_parsing(
            'https://api.github.com/repos/apiaryio/' + every_repo +
            '/contents', token, f)
        list_licenses_compare = []
        found_license_file = False
        found_license = False
        for filename in contents_url:
            if re_findall(filename['name'], r'\blicen[sc]e[sd]?'):
                found_license_file = True
                r = search_license_type_in_license_file(filename, token, f)
                list_licenses_compare.append((filename['name'], r))
                print(filename['name'], ': ', r, file=f)
                if r:
                    found_license = True

            if re_findall(filename['name'],
                          r'\breadme') or filename['name'] == 'package.json':
                v = search_license_type_in_readme_packagejson(
                    filename, token, f)
                list_licenses_compare.append((filename['name'], v))
                print(filename['name'], ': ', v, file=f)
                if v:
                    found_license = True

            package_json_dependencies(every_repo, f, filename, token)

        if not found_license_file:
            list_repos_without_license_file.append(every_repo)
        if not found_license:
            list_without_licence.append(every_repo)

        compare_pull_requests_and_master(every_repo, f, list_licenses_compare,
                                         token)

        print('---------------------------------------', file=f)
    print()
    print('REPOSITORIES TOTAL: {}'.format(len(list_all_repos)))
    print('REPOSITORIES TOTAL: {}'.format(len(list_all_repos)), file=f)
    print()
    print(
        'REPOSITORIES WITHOUT LICENSE FILE: {}'.format(
            len(list_repos_without_license_file)), ':',
        list_repos_without_license_file)
    print('REPOSITORIES WITHOUT LICENSE FILE: {}'.format(
        len(list_repos_without_license_file)),
          ':',
          list_repos_without_license_file,
          file=f)
    print()
    print('REPOSITORIES WITHOUT LICENSE: {}'.format(len(list_without_licence)),
          ':', list_without_licence)
    print('REPOSITORIES WITHOUT LICENSE: {}'.format(len(list_without_licence)),
          ':',
          list_without_licence,
          file=f)
    print()
    f.close()
Example #34
import requests_cache, imghdr

from validators import validate_raw_files
from create_csvs import create_csvs

from ers import all_keywords_aus as keywords, fpath_namer, mh_brands, clean_url, headers
from matcher import BrandMatcher
from ers import COLLECTION_DATE, file_hash, img_path_namer
import shutil
from parse import parse
from custom_browser import CustomDriver

# Init variables and assets
shop_id = 'liquor_land'
root_url = 'https://www.liquorland.com.au'
requests_cache.install_cache(fpath_namer(shop_id, 'requests_cache'))
country = 'AUS'
searches, categories, products = {}, {}, {}
driver = CustomDriver(headless=True, download_images=False)


def getprice(pricestr):
    if not pricestr:
        return
    pricestr = pricestr.replace('$', '')
    price = parse('{pound:d}', pricestr)
    if price:
        return price.named['pound'] * 100
    price = parse('{pound:d}.{pence:d}', pricestr)
    if price:
        return price.named['pound'] * 100 + price.named['pence']
Example #35
File: api.py Project: salvete/test
from collections import OrderedDict
from http.cookiejar import LWPCookieJar
from http.cookiejar import Cookie

import platform
import time
import requests
import requests_cache

from config import Config
from const import Constant
from storage import Storage
from encrypt import encrypted_request
import logger

requests_cache.install_cache(Constant.cache_path, expire_after=3600)

log = logger.getLogger(__name__)

# Song chart list IDs
TOP_LIST_ALL = {
    0: ["云音乐新歌榜", "3779629"],
    1: ["云音乐热歌榜", "3778678"],
    2: ["网易原创歌曲榜", "2884035"],
    3: ["云音乐飙升榜", "19723756"],
    4: ["云音乐电音榜", "10520166"],
    5: ["UK排行榜周榜", "180106"],
    6: ["美国Billboard周榜", "60198"],
    7: ["KTV嗨榜", "21845217"],
    8: ["iTunes榜", "11641012"],
    9: ["Hit FM Top榜", "120001"],
Example #36
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, see <http://www.gnu.org/licenses/>.

from BTG import BTG
from lib.io import display
from requests import get
from re import findall
import config
import os
from platform import system
if system() != "Windows":
    import requests_cache
    requests_cache.install_cache('%sBTG' % config.sqlite_path)


class Malekal:
    """
        This module allow you to search IOC in Malekal website (HTTP Requests)
        or local directory specified in BTG configuration file.
    """
    def __init__(self, ioc, type):
        if config.malekal_enabled:
            self.module_name = __name__.split(".")[1]
            if config.malekal_local and not config.malekal_remote:
                self.types = ["MD5"]
            else:
                self.types = [
                    "MD5", "SHA1", "SHA256", "SHA512", "URL", "IPv4", "IPv6",
Example #37
import io
import re

import pandas as pd
import requests
import requests_cache

KEGG_HOST = 'http://rest.kegg.jp'

KEGG_PATHWAY_URL = '/list/pathway/{species}'
KEGG_GENE_URL = '/list/{species}'
KEGG_MAPPING_URL = '/link/pathway/{species}'
KEGG_CONV_URL = '/conv/ncbi-geneid/{species}'

requests_cache.install_cache('_kegg_cache')


def get_gene_sets(species, use_kegg_ids=False, use_name=True):
    """Gets mapping from gene ids to pathways as genesets.

    Args:
        species (str): Name of the species to query. Use or example 'hsa'
            for human or 'mmu' for mouse.
        use_kegg_ids (bool): Whether to return gene ids as entrez
            ids (False), or as KEGG gene ids (True).
        use_name (bool): Whether to use the names of the pathways in the
            geneset dict (True), or the pathway ids (False).

    Returns:
        dict of sets: Dict mapping pathways (the keys) to gene ids (the sets).
Example #38
from collections import Counter
from collections import OrderedDict
import csv
import functools
from functools import reduce
from joblib import Parallel, delayed
import json
import multiprocessing
from networkx import *
import operator
import pandas as pd
import requests
import requests_cache

pd.options.mode.chained_assignment = None
requests_cache.install_cache('demo_cache')


def Mass2Motif_2_Network(edges, motifs, prob=0.01, overlap=0.3, top=5):
    """Map Mass2Motifs onto a mass spectral molecular network

    :param edges: An edges file downloaded from GNPS 
    :type edges: pandas.core.frame.DataFrame
    :param motifs: A motif summary file downloaded from MS2LDA
    :type motifs: pandas.core.frame.DataFrame
    :param prob: Minimal probability score for a Mass2Motif to be included 
    :type prob: float
    :param overlap: Minimal overlap score for a Mass2Motif to be included
    :type overlap: float
    :param top: Specifies how many most shared motifs per molecular family (network component index) should be shown
    :type top: int
Example #39
def install_cache():
    requests_cache.install_cache('pbr_mlb')
Example #40
import sys
from Orangeboard import Orangeboard
from BioNetExpander import BioNetExpander
from QueryNCBIeUtils import QueryNCBIeUtils
from QuerySciGraph import QuerySciGraph
from QueryDisont import QueryDisont
from ParsePhenont import ParsePhenont
from QueryChEMBL import QueryChEMBL
from QueryPubChem import QueryPubChem

import pandas
import timeit
import argparse
import requests_cache

# configure requests package to use the "orangeboard.sqlite" cache
requests_cache.install_cache('orangeboard')

# create an Orangeboard object
ob = Orangeboard(debug=True)

# configure the Orangeboard for Neo4j connectivity
ob.neo4j_set_url()
ob.neo4j_set_auth()

bne = BioNetExpander(ob)


def add_pc2_to_kg():
    sif_data = pandas.read_csv(
        '../../../data/pc2/PathwayCommons9.All.hgnc.sif',
        sep='\t',
Example #41
def enable_cache():
    if not os.path.exists(CACHE_DIR):
        os.makedirs(CACHE_DIR)
    requests_cache.install_cache(CACHE_FILE)
Example #42
from django.core.management.base import BaseCommand, CommandError
from pokesearch.models import Pokemon, PokemonSpecies, Type, PokemonType

import requests
import requests_cache

requests_cache.install_cache('pokesearch_cache')


class Command(BaseCommand):
    help = "Get the list of Pokemons from PokeAPI"

    def add_arguments(self, parser):
        parser.add_argument("-f", "--from", type=int, default=1)
        parser.add_argument("-t", "--to", type=int, default=899)

    def handle(self, *args, **options):
        _from = options["from"]
        _to = options["to"]
        for pokemon_id in range(_from, _to):
            pokemon = requests.get(
                f"https://pokeapi.co/api/v2/pokemon/{pokemon_id}")
            pokemon_obj, pokemon_created = Pokemon.objects.get_or_create(
                id=pokemon.json()["id"], name=pokemon.json()["name"])
            species = requests.get(pokemon.json()["species"]['url'])
            species_obj, species_created = PokemonSpecies.objects.get_or_create(
                id=species.json()["id"], name=species.json()["name"])
            pokemon_obj.species = species_obj
            for pokemon_type in pokemon.json()["types"]:
                type_ = requests.get(pokemon_type["type"]["url"])
                type_obj, type_created = Type.objects.get_or_create(
Example #43
from flask import Flask, render_template, request, jsonify
import plotly.graph_objs as go
from plotly.utils import PlotlyJSONEncoder
import json
import requests
import requests_cache

requests_cache.install_cache('crime_api_cache',
                             backend='sqlite',
                             expire_after=36000)

app = Flask(__name__)

crime_url_template = 'https://data.police.uk/api/crimes-street/all-crime?lat={lat}&lng={lng}&date={data}'
categories_url_template = 'https://data.police.uk/api/crime-categories?date={date}'


@app.route('/crimestat', methods=['GET'])
def crimechart():
    my_latitude = request.args.get('lat', '51.52369')
    my_longitude = request.args.get('lng', '-0.0395857')
    my_date = request.args.get('date', '2018-11')

    categories_url_template = ' https://data.police.uk/api/crime-categories?date={date} '
    resp = requests.get(categories_url_template.format(date=my_date))
    if resp.ok:
        categories_json = resp.json()
    else:
        print(resp.reason)
    categories = {categ["url"]: categ["name"] for categ in categories_json}
    crime_category_stats = dict.fromkeys(categories.keys(), 0)
Example #44
from json import loads
from datetime import datetime

from requests import get
from requests_cache import install_cache

from .constants import VALID_POSITIONS, BASE_URL, ONE_HOUR

install_cache('nfl_api_cache', expire_after=ONE_HOUR)


def gather_json(week=None, season=None, position=None):
    nfl_api_url = _format_url(week, season, position)
    try:
        response = get(nfl_api_url)
        return loads(response.text)['players']
    except:
        raise Exception('Error retrieving data from NFL api')


def _format_url(week, season, position):
    type = 'weekStats'
    if not season:
        season = _get_default_season()
    if not week:
        type = 'seasonStats'
        week_string = ''
    else:
        week_string = '&week={}'.format(week)
    nfl_api_url = BASE_URL.format(type, season, week_string)
    if position in VALID_POSITIONS:
Example #45
from flask import Flask, jsonify, request, make_response
import requests
from app.post import Post, PostSchema
from operator import attrgetter
from pprint import pprint
import time, json
from threading import Thread
import queue
import requests_cache

app = Flask(__name__)
requests_cache.install_cache('api_cache',
                             backend='sqlite',
                             expire_after=2 * 60)


# Route 1
@app.route('/api/ping', methods=['GET'])
def ping():
    return jsonify(success=True), 200


# Route 2
@app.route('/api/posts', methods=['GET'])
def get_posts():
    # Get URL arguments to pass along
    tags = request.args.get('tags')

    # Set default if arg is None
    sortBy = request.args.get('sortBy') or 'id'
Example #46
def set_request_cache():
    if not os.path.exists('_cache'):
        os.mkdir('_cache')
    requests_cache.install_cache('_cache/page_cache', backend='sqlite',
                                 expire_after=10800)
Example #47
import requests
import requests_cache

requests_cache.install_cache('m2m_cache', expire_after=86400)


class MachineToMachine(object):
    def __init__(self, base_url, api_user, api_key):
        self.base_url = base_url
        self.api_user = api_user
        self.api_key = api_key
        self.inv_url = self.base_url + '/api/m2m/12576/sensor/inv'

        cache_name = 'm2m_%s_cache' % base_url.replace('https://', '')
        requests_cache.install_cache(cache_name, expire_after=86400)

    def toc(self):
        url = self.inv_url + '/toc'
        return requests.get(url, auth=(self.api_user, self.api_key)).json()

    def node_inventory(self, subsite, node):
        url = '/'.join((self.inv_url, subsite, node))
        return [
            '-'.join((subsite, node, sensor))
            for sensor in requests.get(url, auth=(self.api_user,
                                                  self.api_key)).json()
        ]

    def streams(self):
        toc = self.toc()
        stream_map = {}
Example #48
stores=[1966,1923, 1857,1886,1871,1767,1838,1790,1823,1917]


import requests
import requests_cache
import json
from pprint import pprint
requests_cache.install_cache('datagram')
url = "https://datagram-products-v1.p.mashape.com/stores/1870/products/"#1871
headers = {
    'x-mashape-key': "vJpcBxsOd0mshQhiA5WzWt780Qx0p1ZR1vzjsnl3zHA9dKPcuf",
    'accept':       "application/json",
    'cache-control': "no-cache",
    'postman-token': "6d56d562-8fa1-fa04-da55-42ad69a4a23e"
    }

def get_chains():
    url = "https://datagram-products-v1.p.mashape.com/chains/"
    r = requests.get( url, headers=headers)
    return json.loads(r.text)


def get_stores(chain):
    url = "https://datagram-products-v1.p.mashape.com/chains/"+str(str(c['id']))+"/stores/"
    r = requests.get(url, headers=headers)
    res=json.loads(r.text)
    if r.status_code!=200:
        print(r.text)
    print(r.from_cache,r.status_code,chain['name'],len(res))
    return res
Example #49
def main():
    args = parse_args()
    if args.debug:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.WARNING)

    instances = []
    return_codes = []
    command = args.command
    if (args.service_instance):
        service_instance = args.service_instance
        service, instance, _, __ = decompose_job_id(service_instance)
        instances.append(instance)
    elif (args.service and args.instances):
        service = args.service
        instances = args.instances.split(',')
    else:
        log.error("The name of service or the name of instance to inspect is missing. Exiting.")
        sys.exit(1)

    # Setting up transparent cache for http API calls
    requests_cache.install_cache("paasta_serviceinit", backend="memory")

    cluster = load_system_paasta_config().get_cluster()
    actual_deployments = get_actual_deployments(service, args.soa_dir)
    clients = PaastaClients(cached=(command == 'status'))

    instance_types = ['marathon', 'chronos', 'paasta_native', 'adhoc']
    instance_types_map = {it: [] for it in instance_types}
    for instance in instances:
        try:
            instance_type = validate_service_instance(
                service, instance, cluster, args.soa_dir,
            )
        except Exception:
            log.error(
                (
                    'Exception raised while looking at service {} instance {}:'
                ).format(service, instance),
            )
            log.error(traceback.format_exc())
            return_codes.append(1)
            continue

        if instance_type not in instance_types:
            log.error(
                (
                    "I calculated an instance_type of {} for {} which I don't "
                    "know how to handle."
                ).format(
                    instance_type, compose_job_id(service, instance),
                ),
            )
            return_codes.append(1)
        else:
            instance_types_map[instance_type].append(instance)

    remote_run_frameworks = None
    if len(instance_types_map['adhoc']) > 0:
        remote_run_frameworks = paasta_remote_run.remote_run_frameworks()

    for instance_type in instance_types:
        for instance in instance_types_map[instance_type]:
            try:
                version = get_deployment_version(
                    actual_deployments, cluster, instance,
                )
                paasta_print('instance: %s' % PaastaColors.blue(instance))
                paasta_print('Git sha:    %s (desired)' % version)

                if instance_type == 'marathon':
                    return_code = marathon_serviceinit.perform_command(
                        command=command,
                        service=service,
                        instance=instance,
                        cluster=cluster,
                        verbose=args.verbose,
                        soa_dir=args.soa_dir,
                        app_id=args.app_id,
                        delta=args.delta,
                        client=clients.marathon(),
                    )
                elif instance_type == 'chronos':
                    return_code = chronos_serviceinit.perform_command(
                        command=command,
                        service=service,
                        instance=instance,
                        cluster=cluster,
                        verbose=args.verbose,
                        soa_dir=args.soa_dir,
                        client=clients.chronos(),
                    )
                elif instance_type == 'paasta_native':
                    return_code = paasta_native_serviceinit.perform_command(
                        command=command,
                        service=service,
                        instance=instance,
                        cluster=cluster,
                        verbose=args.verbose,
                        soa_dir=args.soa_dir,
                    )
                elif instance_type == 'adhoc':
                    if instance == 'interactive':
                        raise NotImplementedError
                    if command != 'status':
                        raise NotImplementedError

                    paasta_remote_run.remote_run_list_report(
                        service=service,
                        instance=instance,
                        cluster=cluster,
                        frameworks=remote_run_frameworks,
                    )
                    return_code = 0
            except Exception:
                log.error(
                    (
                        'Exception raised while looking at service {} '
                        'instance {}:'
                    ).format(service, instance),
                )
                log.error(traceback.format_exc())
                return_code = 1

            return_codes.append(return_code)

    sys.exit(max(return_codes))
Example #50
from feedparser.feedFetch import feedToJSON
import requests_cache
import time
import concurrent.futures
from dateutil.parser import parse
import threading
import configs as cf

# to use configs.json from external source
# import requests

requests_cache.install_cache('feedscache', backend='sqlite', expire_after=1200)

# using configs from configs.py
configs = cf.configs

###########
# using configs.json from external source
# r = requests.get("CONFIGS.JSON URL")
# configs = r.json()

urllist = configs['sources'].values()

# feedList refreshed every 660 seconds
newlist = []

# final feed List (updates with newlist)
finallist = []


def getTimestamp(pubdate):
Example #51
import apis
import json
import requests
import requests_cache
import settings

requests_cache.install_cache(cache_name='hive_api',
                             backend='sqlite',
                             expire_after=60)


def get_hive_sessionId():
    payload = "{\r\n    \"sessions\": [{\r\n        \"username\": \"" + apis.username + "\",\r\n        \"password\": \"" + apis.password + "\",\r\n        \"caller\": \"WEB\"\r\n    }]\r\n}"
    headers = {
        'Content-Type': "application/json",
        'Accept': "application/vnd.alertme.zoo-6.1+json",
        'X-Omnia-Client': "Hive Web Dashboard",
        'User-Agent': "PostmanRuntime/7.18.0",
        'Cache-Control': "no-cache",
        'Host': "api.prod.bgchprod.info:443",
        'Accept-Encoding': "gzip, deflate",
        'Content-Length': "139",
        'Connection': "keep-alive",
        'cache-control': "no-cache"
    }

    response = requests.request("POST",
                                apis.hive_login_url,
                                data=payload,
                                headers=headers)
Example #52
app = Flask(__name__)
Markdown(app)

# some links for jenkins
URL_METADATA = "https://repo.codemc.io/repository/maven-releases/world/bentobox/{module}/maven-metadata.xml"
URL_VERSION_INFO = "https://repo.codemc.io/repository/maven-releases/world/bentobox/{module}/{version}/{module}-{version}.pom"
URL_JAR_DOWNLOAD = "https://repo.codemc.io/repository/maven-releases/world/bentobox/{module}/{version}/{module}-{version}.jar"

# bentobox static addon list
BENTOBOX_ADDONS = open('addons.txt', 'r').read().splitlines()

CACHE_FILE_SECONDS = 60 * 10

requests_cache.install_cache('nexus_cache',
                             backend='sqlite',
                             expire_after=CACHE_FILE_SECONDS)

mongodb = pymongo.MongoClient(os.environ["MONGODB_URI"])["bmj0hz1bfryjijw"]


@app.route('/')
def index():
    return render_template('index.html',
                           addons=dict(
                               map(lambda e: (e["artifactId"], e["version"]),
                                   get_valid_addons())))


@app.route('/custom')
def custom():
Example #53
import sys

from urllib.parse import urljoin, urlparse

from bs4 import BeautifulSoup

import requests
import requests_cache

cachefile = "summaries"
requests_cache.install_cache(cachefile)

root_url = sys.argv[1]

nextpageurl = root_url

while nextpageurl is not None:

    r = requests.get(nextpageurl)
    
    soup = BeautifulSoup(r.text, 'html5lib')

    for link in soup.findAll('a'):
        thislink = link.get('href')
        thisurl = urljoin(root_url, thislink)
        print(thisurl)

    nextpageurl = None

    for item in soup.findAll('li', {'class': 'next'}):
    
Example #54
import unittest
import os
import sys
import pprint
import lxml

import requests_cache

sys.path.insert(
    0, os.path.join(os.path.dirname(os.path.dirname(__file__)), "yyy"))
import scrapers

requests_cache.install_cache("tests_cached_requests",
                             expire_after=60 * 60 * 24)


class BaseTest(unittest.TestCase):
    pass


class TestScrapers(BaseTest):
    """Groups scrapes tests. The individual tests are added dynamically"""

    pass


for name, Scraper in scrapers.all_scrapers.items():

    def wrapper(name: str, Scraper: type):
        def test(self):
            scraper = Scraper()
Example #55
def scrape_albums(genre_name, genre_id):
    req = requests.Session()
    requests_cache.install_cache('allmusic')
    headers = {
        'referer':
        'http://www.allmusic.com/advanced-search',
        'user-agent':
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.59 Safari/537.36'
    }
    dcap = dict(DesiredCapabilities.PHANTOMJS)
    dcap['phantomjs.page.settings.userAgent'] = (
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53 (KHTML, like Gecko) Chrome/15.0.87'
    )
    #payload = {'filters[]': 'subgenreid:MA0000002451', 'sort': ''}
    #link = 'http://www.allmusic.com/advanced-search/results/{0}'

    albums = []
    albums_mood = []
    albums_url = []
    artists = []
    artists_url = []
    rating = []
    years = []
    item_ids = []
    page_no = 0
    album_num = 0
    print('Start Scraping {} ...'.format(genre_name))
    payload = {'filters[]': genre_id, 'sort': ''}
    link = 'http://www.allmusic.com/advanced-search/results/{0}'

    while True:
        print('page no', page_no)
        site = req.post(link.format(str(page_no) if page_no > 0 else ''),
                        data=payload,
                        headers=headers).text
        if 'desktop-results' not in site:
            print('nothing for page number', page_no)
            break
        if 'http://www.allmusic.com/album/' not in site:
            print('nothing for page number', page_no)
            break
        page_no += 1
        table = site.split('<tbody>')[1].split('/tbody>')[0]
        for row in tqdm(table.split('<tr>')[1:]):
            album = row.split('"title">', 1)[1].split('">',
                                                      1)[1].split('</a', 1)[0]
            albums.append(album)
            album_url = row.split('"title">', 1)[1].split(
                '">', 1)[0].split('<a ')[1].split('="', 1)[1]
            albums_url.append(album_url)
            while True:
                try:
                    client = webdriver.PhantomJS(desired_capabilities=dcap)
                    client.get(album_url)
                    page = client.page_source
                    client.quit()
                    break
                except:
                    print('Re-connect to {}'.format(album_url))
                    time.sleep(1)

            soup = bs(page, "lxml")
            # Moods
            moods = []
            try:
                for mood in soup.findAll('section',
                                         {"class": "moods"})[0].find_all('a'):
                    moods.append(mood.text)
            except:
                moods.append('None')
            albums_mood.append(moods)

            # Year
            try:
                year = row.split('class="year">')[1].split('</td',
                                                           1)[0].strip()
                years.append(year)
            except:
                print(album)
                years.append('None')

            # Artist
            try:
                artist = row.split('artist">')[1].split(
                    '</td', 1)[0].strip().split('">', 1)[1].split('</a', 1)[0]
                artists.append(artist)
            except:
                print(album, year)
                artists.append('Various Artists')

            # Artist URL
            try:
                artist_url = (row.split('artist">')[1].split('</td', 1)[0].strip()
                              .split('">', 1)[0].split('<a ', 1)[1].split('="', 1)[1])
                artists_url.append(artist_url)
            except:
                print(album, year)
                artists_url.append('None')

            time.sleep(1)
            album_num += 1

        print('Done')

    print('{0} albums under {1}'.format(album_num, genre_name))
    df = pd.DataFrame({
        'album': albums,
        'artist': artists,
        'year': years,
        'album_mood': albums_mood,
        'album_url': albums_url,
        'artist_url': artists_url
    })

    file_name = "_".join(genre_name.lower().split())
    df.to_csv('data/{}.csv'.format(file_name))
    print('Done. Saved to data/{}.csv'.format(file_name))
Ejemplo n.º 56
0
"""Functions for GitHub API requests."""

import getpass
import json
import os
import re
import sys

import requests

try:
    import requests_cache
except ImportError:
    print("no cache", file=sys.stderr)
else:
    requests_cache.install_cache("gh_api", expire_after=3600)

# Keyring stores passwords by a 'username', but we're not storing a username and
# password
fake_username = '******'
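
The comment above explains the trick: a token is kept in the system keyring under a placeholder username instead of a real username/password pair. A minimal sketch of that pattern with the keyring package (the 'github' service name and the helper names are assumptions for illustration, not part of this excerpt):

import keyring


def store_token(token):
    # Save the token under the placeholder username defined above.
    keyring.set_password('github', fake_username, token)


def get_stored_token():
    # Returns None when nothing has been stored yet.
    return keyring.get_password('github', fake_username)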


class Obj(dict):
    """Dictionary with attribute access to names."""
    def __getattr__(self, name):
        try:
            return self[name]
        except KeyError:
            raise AttributeError(name)

    def __setattr__(self, name, val):
        self[name] = val
Ejemplo n.º 57
0
#   alias btc="python $HOME/crypto-scripts/wci.py"
#
# Start using:
# $  source ~/.bashrc
# $  btc

import json
import requests
import requests_cache
import os
import sys
from si_prefix import si_format

os.chdir(os.path.dirname(__file__))

requests_cache.install_cache('test_cache', backend='sqlite', expire_after=5 * 60)


url = ('https://www.worldcoinindex.com/apiservice/getmarkets' +
       '?key=%(WORLD_COIN_INDEX_API_KEY)s&fiat=USD') % os.environ

max_items = int(os.environ.get('WORLD_COIN_INDEX_MAX_ITEMS', "40"))

ticker = requests.get(url).json()

if 'Markets' not in ticker:
  print(ticker)
  sys.exit(1)

data = ticker['Markets'][0]
data = sorted(data, key=lambda item: -item['Volume_24h'])
Ejemplo n.º 58
0
def cmd_crtsh(domain, no_cache, no_validate, verbose):
    """Downloads the certificate transparency logs for a domain
    and check with DNS queries if each subdomain exists.

    Uses multithreading to improve the performance of the DNS queries.

    Example:

    \b
    $ sudo habu.crtsh securetia.com
    [
        "karma.securetia.com.",
        "www.securetia.com."
    ]
    """

    if verbose:
        logging.basicConfig(level=logging.INFO, format='%(message)s')

    if not no_cache:
        homedir = Path(os.path.expanduser('~'))
        requests_cache.install_cache(str((homedir / '.habu_requests_cache')),
                                     expire_after=3600)

    subdomains = set()

    if verbose:
        print("Downloading subdomain list from https://crt.sh ...",
              file=sys.stderr)

    req = requests.get("https://crt.sh/?q=%.{d}&output=json".format(d=domain))

    if req.status_code != 200:
        print("[X] Information not available!")
        exit(1)

    json_data = json.loads(req.text)

    for data in json_data:
        name = data['name_value'].lower()
        if '*' not in name:
            subdomains.add(name)

    subdomains = list(subdomains)

    if no_validate:
        print(json.dumps(sorted(subdomains), indent=4))
        return True

    if verbose:
        print("Validating subdomains against DNS servers ...", file=sys.stderr)

    answers = query_bulk(subdomains)

    validated = []

    for answer in answers:
        if answer:
            validated.append(str(answer.qname))

    print(json.dumps(sorted(validated), indent=4))
    return True
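
The docstring notes that the DNS validation is multithreaded, but query_bulk itself is not shown in this excerpt. A minimal sketch of what such a helper could look like, assuming dnspython 2.x and a simple thread pool (the worker count and the 'A' record type are arbitrary choices for illustration):

from concurrent.futures import ThreadPoolExecutor

import dns.exception
import dns.resolver


def query_bulk(names, workers=20):
    """Resolve each name concurrently; return the Answer, or None on failure."""
    def resolve(name):
        try:
            return dns.resolver.resolve(name, 'A')
        except dns.exception.DNSException:
            return None

    with ThreadPoolExecutor(max_workers=workers) as pool:
        return list(pool.map(resolve, names))

Falsy entries are skipped by the validation loop above, and dnspython's Answer objects expose the qname attribute that the loop converts to a string.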
Ejemplo n.º 59
0
import alexa
from bs4 import BeautifulSoup
import requests
import requests_cache
import json

# Using cache
requests_cache.install_cache('cache')

JS_STATS_FILE = 'js_stats.json'
X_XSS_STATS_FILE = 'x_xss_stats.json'
RANK_FILE = 'rank.json'
EXTERNAL_JAVASCRIPTS = [
    'jquery', 'react', 'bootstrap', 'angular', 'moment', 'socket.io', 'ember',
    'backbone', 'reveal', 'underscore', 'lodash', 'mocha', 'meteor', 'mercury',
    'dojo', 'ext-core', 'hammer', 'mootools', 'prototype', 'scriptaculous',
    'swfobject', 'three', 'webfont'
]


def parse_websites(websites):
    js_stats = {}
    x_xss_stats = {}

    for website in websites:
        print(website[1])
        try:
            response = requests.get('http://' + website[1])
            soup = BeautifulSoup(response.text, 'lxml')

            # Extracting script file sources
Ejemplo n.º 60
0
    def __init__(self,
                 headers=None,
                 cookies=None,
                 cache_name=None,
                 delay=1,
                 expire_hours=12,
                 as_string=False):
        '''
        Base class for common scraping tasks
        Args:
            headers: dict of headers
            cookies: cookiejar object
            cache_name: should be full path
            delay: int (be polite!!!)
            expire_hours: int - default 12
            as_string: get string rather than parsed json
        '''
        logging.getLogger(__name__).addHandler(logging.NullHandler())

        if not cookies:
            try:
                import cookielib
                cookies = cookielib.MozillaCookieJar()
            except (NameError, ImportError) as e:
                try:
                    import http.cookiejar
                    cookies = http.cookiejar.MozillaCookieJar()
                except Exception as e:
                    pass

        _s = requests.Session()
        _s.cookies = cookies

        if headers:
            _s.headers.update(headers)
        else:
            _s.headers.update({
                'User-Agent':
                'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
            })

        if cache_name:
            if '/' not in cache_name:
                cache_name = os.path.join('/tmp', cache_name)
            try:
                from cachecontrol import CacheControlAdapter
                from cachecontrol.heuristics import ExpiresAfter
                from cachecontrol.caches import FileCache
                _s.mount(
                    'http://',
                    CacheControlAdapter(
                        cache=FileCache(cache_name),
                        cache_etags=False,
                        heuristic=ExpiresAfter(hours=expire_hours)))
            except ImportError as e:
                try:
                    import requests_cache
                    requests_cache.install_cache(cache_name)
                except:
                    pass

        self.s = _s
        self.urls = []
        self.as_string = as_string

        if delay > 0:
            self.delay = delay
        else:
            self.delay = None
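
A possible usage sketch for this base class (the concrete class name Scraper, the cache path, and the URL are assumptions; only the attributes set in the excerpt above are relied on):

scraper = Scraper(cache_name='/tmp/demo_scraper', delay=2, expire_hours=6)
resp = scraper.s.get('https://example.com/api/items')
data = resp.text if scraper.as_string else resp.json()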