def command(cls, config_ini, org_names):
    common.load_config(config_ini)
    common.register_translator()
    from ckan.plugins import toolkit
    from ckan import model
    orgs = [toolkit.get_action('organization_show')(
            data_dict={'id': org_name})
            for org_name in org_names]
    source_org, dest_org = orgs
    assert source_org
    assert dest_org
    search_results = toolkit.get_action('package_search')(
        data_dict=dict(fq='publisher:%s' % source_org['name'], rows=1000))
    print 'Datasets: %s' % search_results['count']
    stats = Stats()
    if len(search_results['results']) != search_results['count']:
        assert 0, 'need to implement paging'
    #context = {
    #    'user': get_script_user(__name__)['name'],
    #    'ignore_auth': True,
    #    'model': model}
    rev = model.repo.new_revision()
    rev.author = 'script-%s.py' % __file__
    for dataset in search_results['results']:
        model.Package.get(dataset['id']).owner_org = dest_org['id']
        #dataset_ = toolkit.get_action('package_patch')(
        #    context=context,
        #    data_dict=dict(id=dataset['id'], owner_org=dest_org['id']))
        print stats.add('Changed owner_org', dataset['name'])
    print stats.report()
    print 'Writing'
    model.Session.commit()
def command(cls, config_ini, write):
    common.load_config(config_ini)
    common.register_translator()
    rev = model.repo.new_revision()
    rev.author = "script-fix_mandate.py"
    for package in model.Session.query(model.Package).filter(model.Package.state == "active"):
        if "mandate" in package.extras:
            mandate = package.extras.get("mandate")
            try:
                mandate = json.loads(mandate)
                if isinstance(mandate, list):
                    stats.add("Already list", package.name)
                elif isinstance(mandate, basestring):
                    stats.add("Fixing JSON string", package.name)
                    package.extras["mandate"] = json.dumps([mandate])
                else:
                    stats.add("Problem JSON", package.name)
            except ValueError:
                if mandate != "":
                    stats.add("Fixing string", package.name)
                    package.extras["mandate"] = json.dumps([mandate])
                else:
                    stats.add("Deleting empty string", package.name)
                    del package.extras["mandate"]
        else:
            stats.add("No mandate field", package.name)
    print stats.report()
    if write:
        print "Writing"
        model.Session.commit()
def command(cls, config_ini, write):
    common.load_config(config_ini)
    common.register_translator()
    rev = model.repo.new_revision()
    rev.author = 'fix_contact_details.py'
    for package in model.Session.query(model.Package).filter_by(state='active'):
        group = package.get_organization()
        if not group:
            stats.add('was not in a group', package.name)
            continue
        if package.extras.get('contact-name') == group.extras.get('contact-name'):
            if package_is_effected(package, group):
                if write:
                    package.extras['contact-name'] = ''
                    package.extras['contact-email'] = ''
                    package.extras['contact-phone'] = ''
                    package.extras['foi-name'] = ''
                    package.extras['foi-email'] = ''
                    package.extras['foi-web'] = ''
                    package.extras['foi-phone'] = ''
                stats.add('resetting', 'Resetting package %s' % package.name)
    print stats.report()
    if write:
        model.Session.commit()
def command(cls, config_ini, write, options):
    common.load_config(config_ini)
    common.register_translator()
    rev = model.repo.new_revision()
    rev.author = 'script-delete_cache_filepath.py'
    process_all = True
    if options.resource:
        cls.process_resource(model.Resource.get(options.resource))
        process_all = False
    else:
        # Get each dataset,
        counter = 0
        datasets_q = model.Session.query(model.Package) \
            .filter_by(state='active')
        rounded = int(math.ceil(datasets_q.count() / 100.0)) * 100
        for x in xrange(0, rounded, 100):
            datasets = datasets_q.offset(x).limit(100)
            updated = False
            for dataset in datasets.all():
                counter += 1
                print "Processing dataset %d\r" % counter,
                for resource in dataset.resources:
                    if cls.process_resource(resource):
                        updated = True
                for key in dataset_properties_to_make_null:
                    if getattr(dataset, key):
                        stats_dp.add('Making property null: %s' % key, dataset.name)
                        setattr(dataset, key, None)
                        updated = True
                    else:
                        stats_dp.add('Property has no value: %s' % key, dataset.name)
                for key in dataset_extras_to_remove:
                    if key in dataset.extras:
                        #stats_de.add('Removing: %s' % key, dataset.name)
                        del dataset.extras[key]
                        updated = True
                    else:
                        stats_de.add('No field to remove: %s' % key, dataset.name)
            # We will be committing 100 at a time
            if updated and write:
                print "\nCommitting changes"
                import time
                s = time.time()
                model.Session.commit()
                print "Committed in ", time.time() - s
    print 'Resource Properties:\n', stats_rp.report(show_time_taken=False)
    print 'Resource Extras:\n', stats_re.report()
    print 'Dataset Properties:\n', stats_dp.report(show_time_taken=False)
    print 'Dataset Extras:\n', stats_de.report()
def get_datasets_from_ckan(domain):
    common.load_config(config_ini)
    common.register_translator()
    from pylons import config
    apikey = config['dgu.merge_datasets.apikey']
    ckan = ckanapi.RemoteCKAN('https://%s' % domain, apikey=apikey)
    datasets = ckan.action.package_search(q='organogram', rows=400)
    return datasets
def command(cls, config_ini, write):
    common.load_config(config_ini)
    common.register_translator()

    def new_revision():
        rev = model.repo.new_revision()
        rev.author = 'script_delete_duplicate_datasets.py'
    if write:
        new_revision()

    publisher = model.Group.get(options.publisher)
    if publisher is None:
        print "Publisher could not be found"
        sys.exit(0)

    guids = defaultdict(list)
    for package in publisher.packages():
        guids[package.extras.get('guid')].append(package)

    for guid, packages in guids.items():
        if guid is None:
            for package in packages:
                stats.add('Skip package not harvested', package.name)
            continue
        if len(packages) == 1:
            stats.add('Skip guid without duplicates', guid)
            continue

        best_name = None
        for i, package in enumerate(
                sorted(packages,
                       key=lambda x: x.metadata_modified,
                       reverse=options.keep_last)):
            if (not best_name or
                    len(package.name) < len(best_name) or
                    (len(package.name) == len(best_name) and
                     package.name < best_name)):
                best_name = package.name
            if i == 0:
                kept_package = package
            else:
                stats.add('Deleting', package.name)
                package.name = package.name + '_'
                package.state = 'deleted'

        # Write the name changes, so that we can reuse the best_name.
        stats.add('Keep', '%s->%s' % (kept_package.name, best_name))
        if write:
            model.Session.commit()
            new_revision()
        kept_package.name = best_name

    if write:
        model.Session.commit()
    print stats.report()
def command(cls, config_ini):
    common.load_config(config_ini)
    common.register_translator()

    from ckanext.dgu.model.feedback import Feedback

    comment_hashes = []

    headers = ["user_id", "package_id", "timestamp", "title", "comment"]
    writer = csv.DictWriter(sys.stdout, headers)

    for fb in model.Session.query(Feedback)\
            .filter(Feedback.visible == True)\
            .filter(Feedback.active == True)\
            .order_by(Feedback.created):
        if not any([fb.economic, fb.social, fb.effective, fb.linked, fb.other]):
            stats.add('Missing any content', fb.id)
            continue

        user = model.User.get(fb.user_id)
        pkg = model.Package.get(fb.package_id)

        data = {
            u"timestamp": fb.created.isoformat(),
            u"package": pkg.name,
            u"item": fb
        }
        content = render_template(TEMPLATE, data)
        comment = content.replace(u'\r', u'').replace(u'\n', u'').replace(u' ', u'')

        # Check for identical comments: users sometimes duplicate comments on
        # the same package (by mistake most often), so skip repeats.
        hashkey = u'{}.{}.{}'.format(comment, fb.package_id, fb.user_id).encode('utf8', 'ignore')
        comment_hash = hashlib.md5(hashkey).hexdigest()
        if comment_hash in comment_hashes:
            stats.add('Duplicate post', fb.id)
            continue
        comment_hashes.append(comment_hash)

        row = {
            u"user_id": user.name[len("user_d"):],
            u"package_id": pkg.name,
            u"timestamp": fb.created.isoformat(),
            u"title": "Feedback on the value of this dataset",
            u"comment": comment.encode('utf-8', 'ignore')
        }
        writer.writerow(row)
        stats.add('Processed', fb.id)
def test_init_missing_cert():
    """ Try to initialize the context with a nonexistent cert. """
    config = load_config()
    config['name'] = "failboat"
    config['sign_messages'] = True
    context = FedMsgContext(**config)
    context.publish(topic='awesome', msg=dict(foo='bar'))
def main(username, password):
    global config
    config = common.load_config()

    common.mkdirs(config["jive-base"])
    os.chdir(config["jive-base"])
    l = common.Lock(".lock")

    global max_index
    try:
        with open(".max-index") as f:
            max_index = int(f.read())
    except IOError:
        pass

    tls.s = requests.Session()
    login(username, password)

    threads = int(config["jive-threads"])
    for i in range(threads):
        t = threading.Thread(target=worker, name=i, args=(tls.s.cookies, ))
        t.daemon = True
        t.start()

    for c in contents():
        q.put((iter_content, c))
    q.join()

    cleanup()
    common.write_sync_done()

    global index
    with open(".max-index", "w") as f:
        print(index, file=f)
def install():
    config = load_config()
    try:
        run('ls blink')
    except:
        run('git clone ' + config.get('blink', 'repository'))
    configure()
def __init__(self, configfile):
    """
    Following class attributes are initialized in __init__:
      nova       - client for nova service
      keystone   - client for keystone service
      ceilometer - client for ceilometer service
      hosts      - all hosts for region
      servers    - all servers in region
    """
    self.logger = logging.getLogger(__name__)
    self.config = load_config(configfile)
    if self.config:
        try:
            auth = v3.Password(username=self.config['username'],
                               password=self.config['password'],
                               project_name=self.config['tenant_name'],
                               auth_url=self.config['auth_url'],
                               user_domain_name=self.config['domain'],
                               project_domain_name=self.config['domain'])
            self.session = session.Session(auth=auth,
                                           verify=self.config['cacert'])
            self.nova = client_nova.Client(2, session=self.session)
            self.keystone = client_keystone.Client(session=self.session)
            self.ceilometer = client_ceilometer.Client(2, session=self.session)
            self.glance = client_glance.Client(2, session=self.session)
            self.hosts = self.nova.hosts.list()
            self.servers = self.nova.servers.list(search_opts={'all_tenants': 1})
            self.projects = self.getprojects()
            self.flavors = self.getflavors()
            self.connected = True
        except:
            self.logger.error('Authentication error with credentials from ' + configfile)
            self.connected = False
def main():
    global config
    config = common.load_config()

    common.mkdirs(config["product-docs-base"])
    os.chdir(config["product-docs-base"])
    l = common.Lock(".lock")

    get_dump()
    valid_files = set([".lock", ".sync-done"])

    pool = multiprocessing.Pool(processes=int(config["product-docs-threads"]))
    for x in iter_dump():
        x["product_"] = x["product"].replace("_", " ")
        url = "https://access.redhat.com/documentation/%(language)s/" \
              "%(product)s/%(version)s/pdf/%(name)s/" \
              "%(product)s-%(version)s-%(name)s-%(language)s.pdf" % x
        f = "%(product_)s/%(version)s/" \
            "%(product)s-%(version)s-%(name)s-%(language)s.pdf" % x
        pool.apply_async(download, (url, f))
        valid_files.add(f)

    pool.close()
    pool.join()

    remove_invalid_files(valid_files)
    common.write_sync_done()
def main():
    global config
    config = common.load_config()

    # Permit write of UTF-8 characters to stderr (required when piping output)
    if sys.stderr.encoding is None:
        sys.stderr = codecs.getwriter("UTF-8")(sys.stderr)

    common.mkdirs(config["pt-base"])
    os.chdir(config["pt-base"])
    _lock = common.Lock(".lock")

    tls.s = requests.Session()
    login()

    for i in range(int(config["pt-threads"])):
        t = threading.Thread(target=worker, name=i, args=[tls.s.cookies])
        t.daemon = True
        t.start()

    read_project_list()
    while q.unfinished_tasks:
        time.sleep(1)
    q.join()

    cleanup()
    common.write_sync_done()
def xmpp_test():
    global conf
    if not conf:
        conf = load_config()
    if not push_in_queue(construct_message(
            conf['xmpp_recipients'],
            'test message for: %s' % conf['xmpp_recipients'])):
        return '', 500
    return '', 204
def setUp(self):
    self.config = load_config()
    self.config['name'] = local_name
    self.config['mute'] = True
    self.config['persistent_store'] = Mock()
    self.replay_context = ReplayContext(**self.config)
    self.replay_thread = ReplayThread(self.replay_context)

    self.context = zmq.Context()
def prometheus_alert():
    global conf
    if not conf:
        conf = load_config()
    msg = PrometheusAlert(request.data.decode()).plain()
    html = PrometheusAlert(request.data.decode()).html()
    push_in_queue(construct_message(conf['xmpp_recipients'], msg, html))
    return '', 204
def setUp(self):
    self.config = load_config()
    self.config['name'] = local_name
    self.config['persistent_store'] = Mock()
    self.replay_context = ReplayContext(**self.config)
    self.request_context = zmq.Context()
    self.request_socket = self.request_context.socket(zmq.REQ)
    self.request_socket.connect(
        self.config['replay_endpoints'][local_name])
def setUp(self):
    config = load_config()
    self.hub = CentralMokshaHub(config=config)
    self.context = FedMsgContext(**config)

    # fully qualified
    self.fq_topic = "com.test_prefix.dev.unittest.foo"
    # short version
    self.topic = "foo"
def add_instance(count):
    conn = connect()
    config = load_config()
    ami = config.get('aws', 'ami')
    spot_price = config.getfloat('aws', 'spot_price')
    key_name = config.get('aws', 'key_name')
    instance_type = config.get('aws', 'instance_type')
    availability_zone_group = config.get('aws', 'availability_zone_group')
    #placement = config.get('aws', 'placement')
    security_group = config.get('aws', 'security_group')

    create_ondemand_instances(conn, ami, security_group, instance_type,
                              count, key_name)
def test_init_invalid_endpoint():
    try:
        config = load_config()
        config['name'] = local_name
        config['persistent_store'] = Mock()

        tmp = zmq.Context()
        placeholder = tmp.socket(zmq.REP)
        placeholder.bind('tcp://*:{0}'.format(
            config["replay_endpoints"][local_name].rsplit(':')[-1]
        ))

        context = ReplayContext(**config)
    finally:
        placeholder.close()
def connect():
    config = load_config()  #ConfigParser.ConfigParser()
    aws_key = config.get('aws', 'aws_key')
    aws_secret = config.get('aws', 'aws_secret')

    conn = boto.ec2.connect_to_region(
        'us-east-1',
        aws_access_key_id=aws_key,
        aws_secret_access_key=aws_secret,
    )

    return conn
def init():
    global config
    config = common.load_config()

    if config["thunderbird-base"] is None:
        return

    if isrunning():
        print >>sys.stderr, "thunderbird.py: thunderbird is running, disabling plugin"
        config["thunderbird-base"] = None
        return

    rmpath(config["thunderbird-folder"])
    mkpath(config["thunderbird-folder"])

    config["thunderbird-base"] = base(spd(config["thunderbird-folder"]))
def main():
    # fetch CLI arguments
    (
        dry_run,
        filter_include_list,
        filter_exclude_list
    ) = common.read_arguments()

    # load config from file
    config_data = common.load_config()
    config_auth_token = config_data['AUTH_TOKEN']

    # fetch repository list and wiki status of the specified repository type
    print('Building repository list:')
    all_repository_set = get_repository_name_wiki_status_set(
        config_auth_token,
        config_data['REPOSITORY_TYPE'],
        common.RepositoryFilter(filter_include_list, filter_exclude_list)
    )

    # get total count, if zero then no work
    repository_count = len(all_repository_set)
    if (repository_count < 1):
        print('\nNo repositories for processing')
        return

    print('\nTotal repositories: {0}'.format(repository_count))

    # determine wiki enabled count
    wiki_enabled_repository_set = filter_repository_wiki_enabled(all_repository_set)
    wiki_enabled_count = len(wiki_enabled_repository_set)

    if (wiki_enabled_count < 1):
        # no projects enabled - no work
        print('All wikis disabled')
        return

    print('Wikis enabled: {0}'.format(wiki_enabled_count))

    # disable wikis (only simulation if dry run mode)
    print('\n\nDisabling wikis{0}:'.format(' [DRY RUN]' if (dry_run) else ''))
    for repository_name in wiki_enabled_repository_set:
        if (not dry_run):
            disable_repository_wiki(config_auth_token, repository_name)

        print(repository_name)
def main():
    # load config from file
    config_data = common.load_config(
        config_key_addition_set={ORGANIZATION_CONFIG_KEY}
    )
    config_auth_token = config_data['AUTH_TOKEN']

    # fetch repository names/sizes of the specified type
    print('Building repository list ordered by size:')
    repository_list = get_organization_repository_size_sorted_list(
        config_auth_token,
        config_data[ORGANIZATION_CONFIG_KEY],
        config_data['REPOSITORY_TYPE']
    )

    # output list, repository URI/size - tab separated
    for repository_uri, repository_size in repository_list:
        print('{0}\t{1}'.format(repository_uri, repository_size))
def main():
    conf = load_config()
    bot = Bot(conf['xmpp_jid'], conf['xmpp_password'])
    if not timeout(bot.start, [conf['xmpp_host'], conf['xmpp_port']]):
        terminate()
    receiver = IPCReceiver(conf['mq_name'])
    try:
        while True:
            data = json.loads(receiver.receive())
            if 'html' in data:
                bot.send_message_to(data['message'], data['recipients'],
                                    html=data['html'])
            else:
                bot.send_message_to(data['message'], data['recipients'])
    except KeyboardInterrupt:
        pass
    except ipc.SignalError:
        pass
    finally:
        receiver.cleanup()
        terminate()
def main(username, password):
    global config
    config = common.load_config()

    common.mkdirs(config["jive-base"])
    os.chdir(config["jive-base"])
    l = common.Lock(".lock")

    tls.s = requests.Session()
    login(username, password)

    threads = int(config["jive-threads"])
    for i in range(threads):
        t = threading.Thread(target=worker, name=i, args=(tls.s.cookies, ))
        t.daemon = True
        t.start()

    for c in contents():
        q.put((iter_content, c))
    q.join()

    cleanup()
    common.write_sync_done()
def main():
    config = load_config()

    # TODO: break down into more functions?
    boto_session = get_boto_session(config)
    # TODO: if s3_resources continue to be used only to get bucket, merge get_s3_resource into get_s3_bucket.
    # but wait until after any restructuring as class.
    s3_resource = get_s3_resource(boto_session)
    s3_bucket = get_s3_bucket(s3_resource, config)

    # get list of files in relevant dir of bucket
    # by converting to list we should trigger just one call to S3 API, unlike iterating over collection
    # see https://boto3.amazonaws.com/v1/documentation/api/latest/guide/collections.html#when-collections-make-requests
    objs = list(
        s3_bucket.objects.filter(
            Prefix=config['AWS']['s3_trips_prefix']).all())

    # use a pandas.DataFrame for convenient storage of metadata about the trip data csv files
    # collect metadata as (vertical) list of (horizontal) lists (rows), then construct DataFrame, for efficiency
    # (see "Notes" on
    # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.append.html#pandas-dataframe-append)
    # parse filename into year, month, and trip type
    # unfortunately, the DataFrame constructor only accepts a singleton dtype argument and the default inference
    # doesn't preserve smaller numpy data types in the input. any improvement in the efficiency of the filtration
    # and sorting below is probably not worth the cost of using astype() to force the series to convert after
    # creation, given the relatively small number of rows.
    trip_files = [parse_key(obj.key) for obj in objs]
    trip_files = DataFrame(trip_files, columns=[col[0] for col in TRIPFILE_METADATA_COLS])

    # exclude undesired trip files now, using labeled columns, rather than the less convenient file names
    # for now, limit to those with usable lat and long columns: green and yellow, through the first half of 2016
    undesired_indices = trip_files[
        ~trip_files['Type'].isin(['green', 'yellow'])
        | (trip_files['Year'] > 2016)
        | ((trip_files['Year'] == 2016) & (trip_files['Month'] > 6))].index
    trip_files.drop(undesired_indices, axis=0, inplace=True)

    # assuming pandas isn't optimized enough under the hood that sorting first would speed the lookups involved in
    # the drops, more efficient to reduce dataset size before sorting
    trip_files.sort_values(['Year', 'Month'], axis=0, ascending=True, inplace=True)

    if TESTING and TESTING_CSV_LIMIT is not None:
        trip_files = trip_files.head(min(len(trip_files), TESTING_CSV_LIMIT))

    # separate trip_file entries into separate pandas DataFrames for each time period (year, for now) and store them
    # chronologically in time_period_tables list
    # note that groupby guarantees the sort order done above will be preserved within each group — see
    # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.groupby.html#pandas-dataframe-groupby
    year_groups = trip_files.groupby(['Year'])
    time_period_tables = [
        year_groups.get_group(year) for year in year_groups.groups
    ]

    # simulate the time periods in sequence
    [
        simulate_time_period(time_period_table, config)
        for time_period_table in time_period_tables
    ]
#!/usr/bin/env python3
import argparse
from pprint import pprint

from common import load_config
from common import encode
from common import send_msg


def opt_parser():
    parser = argparse.ArgumentParser(
        description='Network reconfiguration node')
    parser.add_argument('--net_config',
                        default='config/sample_graph3.json',
                        type=str)
    return parser


if __name__ == '__main__':
    parser = opt_parser()
    opt = parser.parse_args()
    config = load_config(opt.net_config)
    pprint(config)

    start_msg = encode(dict(type="start"))
    for node in config.values():
        send_msg(start_msg, host=node['host'], port=node['port'])
        print(node)
import os
import sys
import argparse

base_dir = os.path.dirname(os.path.abspath(__file__))
script_dir = os.path.join(base_dir, 'Common')

sys.path.insert(0, script_dir)

import common

from polyglotdb.client.client import PGDBClient

token = common.load_token()

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('corpus_name', help='Name of the corpus')
    args = parser.parse_args()
    corpus_name = args.corpus_name
    directories = [
        x for x in os.listdir(base_dir)
        if os.path.isdir(x) and x != 'Common'
    ]

    if args.corpus_name not in directories:
        print(
            'The corpus {0} does not have a directory (available: {1}). '
            'Please make it with a {0}.yaml file inside.'.format(
                args.corpus_name, ', '.join(directories)))
        sys.exit(1)

    corpus_conf = common.load_config(corpus_name)

    print('Processing...')
    client = PGDBClient('http://localhost:{}'.format(8080), token=token)
    client.delete_database(corpus_name)
import os
import re
import glob

from openpyxl import Workbook

from formula.parser import FormulaParser
from settings import BASE_DIR
from common import load_config

CONFIG = load_config()
WORK_ORDERS_PATH = os.path.join(BASE_DIR, CONFIG.get('default', 'formula_dir'))


def extract_viscosity():
    """Extract viscosity data."""
    all_formula_files = glob.glob(f"{WORK_ORDERS_PATH}/**/*.xlsx", recursive=True)
    result = []
    for filepath in all_formula_files:
        parser = FormulaParser(filepath)
        formulas = parser.parse()
        for formula in formulas:
            after_adding_requirement = formula['metas'][
                'after_adding_requirement']
            if isinstance(after_adding_requirement, list):
                for requirement in after_adding_requirement:
                    # "粘度要求" is the Chinese label for "viscosity requirement"
                    if requirement.find("粘度要求") >= 0:
                        result.append(
                            dict(name=formula['name'], viscosity=requirement))
            elif after_adding_requirement and isinstance(
        self.signal_hup = 0

        ts_msg = TsMessage(_ts, _msg)
        ts_msg.print_raw()

        self.msgs_curr_total = self.msgs_curr_total + 1


def handler_atexit():
    log("Terminated")


if __name__ == "__main__":
    atexit.register(handler_atexit)

    log("Running: Pid {:5d}".format(os.getpid()))

    params = {}
    load_config(params, "config/config.txt")

    host = params['addr']
    port = params['port']
    type = params['type']
    print(params['vers'], params['name'])

    client = MyClient(_host=host, _port=port, _datatype=type)
    client.run()

    sys.exit(0)
def command(cls, config_ini, dataset_names, options): common.load_config(config_ini) common.register_translator() from pylons import config apikey = config['dgu.merge_datasets.apikey'] ckan = ckanapi.RemoteCKAN('https://data.gov.uk', apikey=apikey) #ckan = ckanapi.LocalCKAN() if options.publisher: org_name = common.name_stripped_of_url(options.publisher) if options.search: results = ckan.action.package_search(q=options.search, fq='publisher:%s' % org_name, rows=100, escape_q=False) dataset_names.extend( [dataset['name'] for dataset in results['results']]) else: org = ckan.action.organization_show(id=org_name, include_datasets=True) dataset_names.extend([d['name'] for d in org['packages']]) datasets = [] datasets_by_name = {} def get_extra(dataset, key): for extra in dataset['extras']: if extra['key'] == key: return extra['value'] for dataset_name in dataset_names: print 'Dataset: %s' % dataset_name for dataset_name in dataset_names: # strip off the url part of the dataset name, if there is one dataset_name = common.name_stripped_of_url(dataset_name) dataset = ckan.action.package_show(id=dataset_name) harvest_source_ref = get_extra(dataset, 'harvest_source_reference') if harvest_source_ref: print '** Discarding dataset %s due to harvest source: %s **' \ % (dataset_name, harvest_source_ref) continue datasets.append(dataset) datasets_by_name[dataset['name']] = dataset datasets.sort(key=lambda x: x['metadata_modified']) # aggregate resources def resource_identity(res_dict, dataset_name): return (res_dict.get('date'), res_dict['url'], res_dict.get('title') or res_dict['description'], res_dict.get('format'), dataset_name) combined_resources = {} # identity res_stats = Stats() for dataset in datasets: for resource in dataset['resources']: identity = resource_identity(resource, dataset['name']) resource['dataset_name'] = dataset['name'] if identity in combined_resources: print res_stats.add( 'Discarding duplicate', '\n%s duplicate of \n%s' % (resource, combined_resources[identity])) else: combined_resources[identity] = resource resources = combined_resources.values() # find dates for resources if options.frequency: url_munge_re = re.compile('(%20|-|_|\.)') def fields_to_hunt_for_date(res): date = res.get('date') if date: yield 'date', date title = res.get('title') if title: yield 'title', title yield 'description', res['description'] yield 'url', url_munge_re.sub(' ', res['url']) if not options.update: dataset = datasets_by_name[res['dataset_name']] yield 'dataset-title', dataset['title'] yield 'dataset-notes', dataset['notes'] ensure_regexes_are_initialized() global regexes for resource in resources: for field_name, field_value in fields_to_hunt_for_date( resource): if options.frequency in ('monthly', 'quarterly', 'twice annually'): month, year = hunt_for_month_and_year(field_value) if year and month: resource['date'] = '%02d/%s' % (month, year) res_stats.add( 'Found date in %s' % field_name, '%s %r' % (resource['date'], resource)) if resource.get( 'resource_type') == 'documentation': resource['resource_type'] = 'file' res_stats.add('Converted additional resource', resource) break elif options.frequency == 'annually': year = regexes['year'].search(field_value) if year: resource['date'] = year.groups()[0] res_stats.add( 'Found date in %s' % field_name, '%s %r' % (resource['date'], resource)) if resource.get( 'resource_type') == 'documentation': resource['resource_type'] = 'file' res_stats.add('Converted additional resource', resource) break else: if resource.get('resource_type') == 
'documentation': print res_stats.add( 'Could not find date but it\'s Additional Resource', resource) continue print res_stats.add('Could not find date', resource) continue print 'Resources: \n', res_stats resources_without_date = [ res for res in resources if not res.get('date') and res.get('resource_type') != 'documentation' ] for i, res in enumerate(resources_without_date): print 'Resources without dates %s/%s' % ( i + 1, len(resources_without_date)) for field_name, field_value in fields_to_hunt_for_date(res): print ' %s: %s' % ( field_name, field_value.encode('latin-1', 'ignore')) print 'https://data.gov.uk/dataset/%s/resource/%s' % ( res['dataset_name'], res['id']) date_format = { 'annually': 'YYYY', 'monthly': 'MM/YYYY', 'twice annually': 'MM/YYYY', 'quarterly': 'MM/YYYY' } input_ = raw_input( 'Date (%s) or DOCS to make it an Additional Resource: ' % date_format[options.frequency]) if input_.strip().lower() == 'docs': res['date'] = '' res['resource_type'] = 'documentation' else: res['date'] = input_ resources.sort(key=lambda x: x.get('date', '').split('/')[::-1]) # Ensure there is not a mixture of resources with and without a date have_dates = None for res in resources: if res.get('resource_type') == 'documentation': continue if have_dates is None: have_dates = bool(res.get('date')) else: has_date = bool(res.get('date')) if has_date != have_dates: print[res.get('date') for res in resources] print 'Cannot mix resources with dates and others without!' import pdb pdb.set_trace() # Remove 'dataset_name' and others fields from resources ignore_res_fields = set(('dataset_name', 'created', 'position', 'revision_id', 'id', 'tracking_summary')) for res in resources: for field in ignore_res_fields & set(res.keys()): del res[field] # Merge dataset fields def get_all_fields_and_values(datasets): ignore_fields = set(( 'id', 'resources', 'last_major_modification', 'data_dict', 'revision_timestamp', 'num_tags', 'metadata_created', 'metadata_modified', 'odi_certificate', 'extras', # they are at top level already 'timeseries_resources', 'individual_resources', 'additional_resources', 'revision_id', 'organization', 'tracking_summary', 'num_resources', 'license_title', 'author', 'author_email', 'maintainer', 'maintainer_email', 'temporal_granularity', 'geographic_granularity', 'state', 'isopen', 'url', 'date_update_future', 'date_updated', 'date_released', 'precision', 'taxonomy_url', 'temporal_coverage-from', 'temporal_coverage-to', 'published_via', 'creator_user_id', )) first_fields = [ 'title', 'name', 'notes', 'theme-primary', 'theme-secondary' ] all_field_values = defaultdict(list) for dataset in datasets: for field in dataset: if field not in ignore_fields and dataset[field]: all_field_values[field].append(dataset[field]) for field in first_fields: yield field, all_field_values.get(field, []) for field in all_field_values: if field not in first_fields: yield field, all_field_values[field] spend_data_defaults = { 'geographic_coverage': None, 'theme-primary': 'Government Spending', 'theme-secondary': None, 'update_frequency': 'monthly', } combined_dataset = {'resources': resources} all_fields_and_values = get_all_fields_and_values(datasets) for field, values in all_fields_and_values: if field == 'notes': values = [value.strip() for value in values] if field == 'tags': # just merge them up-front and # dont offer user any choice tags_by_name = {} for dataset_tags in values: for tag in dataset_tags: if tag['name'] not in tags_by_name: tags_by_name[tag['name']] = tag values = [tags_by_name.values()] if 
field in ('codelist', 'schema'): # just merge them up-front # And convert the dict into just an id string ids = set() for dataset_values in values: for value_dict in dataset_values: ids.add(value_dict['id']) values = [list(ids)] print '\n%s:' % field pprint(list(enumerate(values))) if options.spend and field in spend_data_defaults: value = spend_data_defaults[field] print 'Spend data defaults to: %s' % value values = [value] if value is not None else None # dont be case-sensitive for boolean fields if field == 'core-dataset': values = [v.lower() for v in values] try: values_identicle = len(set(values)) == 1 except TypeError: if values and len(values): val1 = values[0] for val in values[1:]: if val != val1: values_identicle = False break else: values_identicle = True if (not values) or (not len(values)): pass elif values_identicle: value = values[0] elif field == 'name': while True: from ckan.lib.munge import munge_title_to_name munged_title = munge_title_to_name( combined_dataset['title']) print munge_title_to_name( datasets[0]['organization']['title']) value = raw_input('Type new value (%s): ' % (munged_title)) if not value: value = munged_title if len(value) < 3: print 'Too short' continue if value in values: print 'That name is taken' continue existing = ckan.action.package_autocomplete(q=value) if value in existing: print 'That name is taken on CKAN' continue break else: while True: response = raw_input( '%s: value (number) or type new one: ' % field) try: value_index = int(response) value = values[value_index] print value except ValueError: # fix pound signs if the user pasted from the repr'd version response = re.sub(r'\\xa3', u'\xa3', response) value = response if not value and field in ('title', 'owner_org', 'notes', 'license_id'): print 'You must have a value for this field!' continue break if value: combined_dataset[field] = value # Store print '\nMerged dataset:\n' pprint(combined_dataset) response = raw_input( 'Press enter to write or pdb to edit in pdb first: ') if response == 'pdb': import pdb pdb.set_trace() try: if options.update: ckan.action.dataset_update(**combined_dataset) else: ckan.action.dataset_create(**combined_dataset) except Exception, e: print e import pdb pdb.set_trace()
def test_init_missing_endpoint():
    """ Try to initialize the context with a nonexistent service name. """
    config = load_config()
    config['name'] = "failboat"
    config['sign_messages'] = True
    context = FedMsgContext(**config)
ap.add_argument("list", nargs="?") return vars(ap.parse_args()) def isgzip(f): bytes = f.read(2) f.seek(0) return bytes == b"\x1F\x8B" if __name__ == "__main__": warnings = 0 global config config = common.load_config() args = parse_args() if args["quiet"]: common.progress = lambda x, y: None common.progress_finish = lambda: None if not config["lists-sync"]: print("Please configure lists in $HOME/.satools before running %s." % sys.argv[0], file=sys.stderr) sys.exit(1) common.mkdirs(config["lists-base"]) os.chdir(config["lists-base"])
def main(): parser = argparse.ArgumentParser() parser.add_argument('--no-download', action='store_true') parser.add_argument('--no-remux', action='store_true') parser.add_argument('--no-upload', action='store_true') parser.add_argument('--no-notify', action='store_true') parser.add_argument('--no-delete', action='store_true') parser.add_argument('--force', action='store_true') parser.add_argument('--force-log-to-file', action='store_true') parser.add_argument('--override-channel-name') parser.add_argument('--override-video-name') parser.add_argument('video_id') args = parser.parse_args() config = load_config() log_filename = LOGDIR / f'{args.video_id}-{os.getpid()}.log' if not sys.stdout.isatty( ) or args.force_log_to_file else None # For subprocess log_file = log_filename.open('a') if log_filename else None if log_file: sys.stderr = sys.stdout = log_file setup_logging(filename=log_filename) log.info(f'Starting download for {args.video_id}') pid_exists, active_downloaders = check_pid(args.video_id) if pid_exists and not args.force: raise ValueError('Another downloader is still alive, exiting') else: active_downloaders[args.video_id] = os.getpid() with open_state() as state: state['active_downloaders'] = active_downloaders if args.override_channel_name and args.override_video_name: log.info( 'Using overridden channel and video name, setting is_upcoming to false' ) channel_name = args.override_channel_name video_name = args.override_video_name # There's no reason to use these overrides for an upcoming video is_upcoming = False else: player_response = get_video_info(args.video_id) if 'videoDetails' not in player_response: log.error(f'{args.video_id} has no details, cannot proceed ' '(playability: {}, {})'.format( player_response["playabilityStatus"]["status"], player_response["playabilityStatus"]["reason"], )) sys.exit(1) else: channel_name = player_response['videoDetails']['author'] video_name = player_response['videoDetails']['title'] is_upcoming = player_response['videoDetails'].get( 'isUpcoming', False) log.info(f'Channel: {channel_name}') log.info(f'Title: {video_name}') log.info(f'Upcoming: {is_upcoming}') if is_upcoming: wait(player_response, config) filename_base = sanitize_filename(video_name) log.info(f'Filename base: {filename_base}') # Copy youtube-dl's naming scheme filepath_streamlink = WORKDIR / f'{filename_base}-{args.video_id}.ts' # TODO: If file already exists, rename it and concatenate it later? # XXX: youtube-dl used to be less reliable than streamlink for downloading # streams - that may no longer be the case. # XXX: Invoke this in a less hacky manner # The reason for doing this is that I wanted to use streamlink # inside the venv but in a separate process, # without hardcoding the path of the venv. streamlink_args = [ '--force', # Overwrite any existing file '--hls-timeout', '60', # XXX: This doesn't work right now! 
# See https://github.com/streamlink/streamlink/issues/2936 '--hls-live-restart', '--retry-streams', '10', '--retry-max', '10', '-o', str(filepath_streamlink), f'https://www.youtube.com/watch?v={args.video_id}', 'best', ] if not args.no_download: log.info(f'Starting streamlink with args: {streamlink_args}') fork_return = os.fork() if fork_return == 0: sys.argv = streamlink_args streamlink_main() else: os.wait() else: log.info('Skipping download') filename_output = f'{filename_base}-{args.video_id}.mp4' filepath_output = WORKDIR / filename_output ffmpeg_args = ( 'ffmpeg', '-y', '-i', filepath_streamlink, '-c', 'copy', '-movflags', 'faststart', '-metadata', f'title={video_name}', '-metadata', f'artist={channel_name}', '-metadata', f'comment=https://www.youtube.com/watch?v={args.video_id}', filepath_output, ) if not args.no_remux: log.info('Remuxing to mp4') subprocess.run(ffmpeg_args, stdout=log_file) else: log.info('Skipping remux') # Upload if not args.no_upload: link_url, thumbnail = upload( sanitize_filename(channel_name), # This argument duplication is kind of silly... filename_output, filepath_output, ) # We won't have link and thumb if not uploading without # going through a bunch more effort. if not args.no_notify: notify( channel_name, video_name, link_url, thumbnail, ) else: log.info('Skipping notify') else: log.info('Skipping upload') if not args.no_delete: log.info('Deleting work files') filepath_streamlink.unlink() filepath_output.unlink() log.info('Cleaning up state') with open_state() as state: active_downloaders = state.get('active_downloaders', {}) active_downloaders.pop(args.video_id, None) else: log.info('Skipping cleanup') log.info('All done!')
import os
from os import environ

from common import load_config

load_config(environ.get('PYTHONPATH', ''))


class Config:
    SECRET_KEY = os.urandom(24)
    SESSION_COOKIE_NAME = environ.get('SESSION_COOKIE_NAME')
    SESSION_COOKIE_SECURE = True
    SQLALCHEMY_DATABASE_URI = environ.get('SQLALCHEMY_DATABASE_URI')
    SQLALCHEMY_ECHO = False
    SQLALCHEMY_TRACK_MODIFICATIONS = False


class ProdConfig(Config):
    FLASK_ENV = 'production'
    DEBUG = False
    TESTING = False


class DevConfig(Config):
    FLASK_ENV = 'development'
    DEBUG = True
    TESTING = True
def fixup_versions(config: dict = None):
    if config is None:
        config = common.load_config()

    import subprocess
    import glob
    import mmap
    import sys

    root = common.root_dir()
    script_dir = pathlib.Path(__file__).absolute().parent
    res_hacker = glob.glob(f"{root}/packages/**/ResourceHacker.exe", recursive=True)[0]

    data_dir = common.unity_data_dir(config["unityBuildDir"])
    if data_dir is None:
        print("No unity build dir found", file=sys.stderr)
        return
    plugins_dir = data_dir / "Managed"

    with open(script_dir / "version.rc.in", "r", newline="") as file:
        res = file.read()

    version_rc = script_dir / "version.rc"
    version_res = version_rc.with_suffix(".res")
    version = "1.0.0.0"

    for plugin_dll in config["invalidFileVersion"]:
        plugin = plugins_dir / plugin_dll

        with open(version_rc, "w") as file:
            file.write(res.format(name=plugin_dll, version=version))

        subprocess.check_call([
            res_hacker,
            "-open", version_rc,
            "-save", version_res,
            "-action", "compile",
            "-log", "CON",
        ])

        # damn ResourceHacker puts invalid length bytes as far as .NET is concerned,
        # doesn't seem to affect any other tools though
        fileversion_bytes = bytearray.fromhex(
            "46 00 69 00 6c 00 65 00 56 00 65 00 72 00 73 00 69 00 6F 00 6E 00"
        )
        with open(version_res, "r+b") as f:
            with mmap.mmap(f.fileno(), 0) as mm:
                index = -1
                while True:
                    index = mm.find(fileversion_bytes, index + 1)
                    if index == -1:
                        break
                    # +1 for the null character I guess
                    len_str = str(hex(len(version) + 1))[2:]
                    if len(len_str) == 1:
                        len_str = "0" + len_str
                    assert len(len_str) == 2
                    mm[index - 4] = bytearray.fromhex(len_str)[0]

        subprocess.check_call([
            res_hacker,
            "-open", plugin,
            "-save", plugin,
            "-action", "addskip",
            "-res", version_res,
            "-log", "CON",
        ])
        print(f"{plugin}: fixed version resource")
def main():
    global conf
    conf = load_config()
    service.run(host=conf['flask_host'], port=conf['flask_port'])
def __init__(self):
    self.params = {}
    load_config(self.params, "config/config.txt")
    return
def get_cache_status(archival):
    if not archival.cache_filepath:
        return 'Not cached'
    if os.path.exists(archival.cache_filepath):
        return 'Cached'
    return 'Cache missing on disk!'


if __name__ == '__main__':
    usage = __doc__ + """
    usage: %prog [-w] <ckan.ini>
    """
    parser = OptionParser(usage=usage)
    parser.add_option("-w", "--write",
                      action="store_true", dest="write",
                      help="write the theme to the datasets")
    parser.add_option('-d', '--dataset', dest='dataset')
    parser.add_option('-r', '--resource', dest='resource')
    parser.add_option('-o', '--organization', dest='organization')
    (options, args) = parser.parse_args()
    if len(args) != 1:
        parser.error('Wrong number of arguments (%i)' % len(args))
    config_filepath = args[0]
    print 'Loading CKAN config...'
    common.load_config(config_filepath)
    common.register_translator()
    print 'Done'
    fix_links(options)
if __name__ == '__main__':
    USAGE = '''Daily script for government
    Usage: python %s <config.ini> [task]
    Where:
      [task] - task to run (optional), picked from: %s
               or run multiple by separating by a comma.
    ''' % (sys.argv[0], ','.join(TASKS_TO_RUN))

    if set(sys.argv) & set(('--help', '-h')):
        print USAGE
        sys.exit(1)
    if len(sys.argv) < 2:
        err = 'Error: Please specify config file.'
        print USAGE, err
        logging.error('%s' % err)
        sys.exit(1)
    config_file = sys.argv[1]
    config_ini_filepath = os.path.abspath(config_file)

    if len(sys.argv) == 3:
        TASKS_TO_RUN = sys.argv[2].split(',')

    load_config(config_ini_filepath)
    register_translator()
    logging.config.fileConfig(config_ini_filepath)
    command(config_file)
    if admin_count:
        print " -> ",
        print ', '.join(
            u.name for u in group.members_of_type(model.User, 'admin').all())

    editor_count = group.members_of_type(model.User, 'editor').count()
    print "Editors: {uc}".format(uc=editor_count)
    if editor_count:
        print " -> ",
        print ', '.join(
            u.name for u in group.members_of_type(model.User, 'editor').all())

    print "Dataset count: {dc}".format(
        dc=group.members_of_type(model.Package).count())


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('config', help='CKAN config .ini filepath')
    parser.add_argument('old_name', metavar='old_name')
    parser.add_argument('new_name', metavar='new-name')
    parser.add_argument('-t', '--title', help='Title to set')
    args = parser.parse_args()
    common.load_config(args.config)
    common.register_translator()
    PublisherRenamer().rename(args.old_name, args.new_name, args.title)
def start(demo=False):
    if demo:
        # load demo options; these bypass the config file.
        cookie_secret = common.hmacstr(common.randomstr(), common.randomstr())
        args = [
            sys.argv[0],
            "--debug",
            "--host=0.0.0.0",
            "--port=8080",
            "--base_url=/ssweb",
            "--service_name=shadowsocks",
            "--cookie_secret=" + cookie_secret,
            "--logging=debug"
        ]
        options.parse_command_line(args)
    else:
        # pre-parse the command line options. they will be overwritten by
        # 'load options from config file' below; at this point the config
        # file is not yet loaded.
        options.parse_command_line()
        if options.config is not None:
            # load options from specified config file
            if not os.path.isfile(options.config):
                err_("Can't find config file '%s'." % options.config)
                exit(1)
            else:
                config = common.load_config(options.config)
                if config is not None:
                    info_("Load config from file '%s'." % options.config)
                    args = [sys.argv[0]]
                    for item in config:
                        args += ["--%s=%s" % (item, config[item])]
                    try:
                        options.parse_command_line(args)
                    except tornado.options.Error:
                        err_("Error on config file option.")
                        sys.exit(1)
        else:
            # load options from config file, if the file exists.
            config_file = common.find_config_file()
            if config_file is not None:
                config = common.load_config(config_file)
                if config is not None:
                    info_("Load config from file '%s'." % config_file)
                    args = [sys.argv[0]]
                    for item in config:
                        args += ["--%s=%s" % (item, config[item])]
                    try:
                        options.parse_command_line(args)
                    except tornado.options.Error:
                        err_("Error on config file option.")
                        sys.exit(1)
        # load options from command line
        try:
            options.parse_command_line()
        except tornado.options.Error:
            err_("Error on command line option.")
            sys.exit(1)

    debug_("options: %s" % json.dumps(options.as_dict(), sort_keys=True))
    logging.debug("options: %s" % json.dumps(options.as_dict(), sort_keys=True))

    # load shadowsocks configuration
    ss_config_filename = common.find_shadowsocks_config_file()
    if ss_config_filename is None:
        err_("Can't find any shadowsocks config file. Are you sure "
             "shadowsocks is installed?")
        exit(1)
    config = common.load_shadowsocks_config(ss_config_filename)
    info_("Loading shadowsocks config from file '%s'." % ss_config_filename)

    start_tornado(config, ss_config_filename)
    global_log.warn(msg, *params)


def usage():
    print """
Imports publishers from the specified CSV file.
Usage:
  python publisher_categories.py <CKAN config ini filepath> export pub_cats.csv
    - produces a list of publishers and their categories
  python publisher_categories.py <CKAN config ini filepath> import pub_cats.csv
    - import an amended list of publishers and their categories
"""


if __name__ == '__main__':
    if len(sys.argv) != 4:
        print 'Wrong number of arguments %i' % len(sys.argv)
        usage()
        sys.exit(0)

    cmd, config_ini, action, filepath = sys.argv
    common.load_config(config_ini)
    PublisherCategories.setup_logging(config_ini)
    common.register_translator()

    if action == 'export':
        PublisherCategories.export(filepath)
    elif action == 'import':
        PublisherCategories.import_(filepath)
    else:
        raise NotImplementedError
def transfer(read_from, save_to):
    click.echo('%s --> %s' % (read_from, save_to))
    if read_from not in OPTIONS or save_to not in OPTIONS:
        print 'Should be %s or %s' % (LOCAL, FIREBASE)
        sys.exit(-1)
    if read_from == save_to:
        print 'Saving data to where it is from does not make sense.'
        sys.exit(-2)

    click.echo('This will OVERWRITE data in "%s". Are you sure? [y/N]' % save_to)
    confirm = sys.stdin.readline()
    if confirm.strip() != 'y':
        print 'byebye~'
        return

    common.READ_FROM = common.LOCAL if read_from == LOCAL else common.FIREBASE
    common.SAVE_TO = (common.LOCAL,)\
        if save_to == LOCAL else (common.FIREBASE,)

    print 'Transferring catalog...'
    catalog = common.load_catalog()
    common.save_catalog(catalog)

    print 'Transferring categories...'
    catalog = common.load_catalog()
    categories = common.load_categories()
    common.save_categories(categories)

    print 'Transferring filter results...'
    f_results = common.load_filter_results()
    common.save_filter_results(f_results)

    print 'Transferring indicator results...'
    i_results = common.load_indicator_results()
    common.save_indicator_results(i_results)

    print 'Transferring config...'
    config = common.load_config()
    common.save_config(config)

    todo = []
    for stocks in catalog.values():
        todo.extend(stocks)
    total = len(todo)

    print 'Transferring stocks...'
    widgets = [
        FormatLabel(
            'Processed: %(value)d / {0} (in: %(elapsed)s)'.format(total))
    ]
    pbar = ProgressBar(widgets=widgets, maxval=total)
    count = 0
    pbar.start()
    for s in todo:
        data = common.load_stock(s)
        common.save_stock(s, data)
        pbar.update(count)
        count += 1
    pbar.finish()

    print 'Transferring state...'
    catalog = common.load_catalog()
    state = common.load_state()
    common.save_state(state)
#!/usr/bin/python

import sys
import logging
import time

import common
import message_consumer

logger = logging.getLogger(__name__)

if __name__ == '__main__':
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)
    logging.getLogger("pika").setLevel(logging.ERROR)
    logging.getLogger("requests").setLevel(logging.WARNING)
    logging.getLogger("urllib3").setLevel(logging.WARNING)

    common.load_config()

    while True:
        try:
            message_consumer.initiate_consumer()
        except Exception as e:
            logger.error(e)
        time.sleep(60)
def command(cls, config_ini, options):
    common.load_config(config_ini)
    common.register_translator()

    from ckan import model
    from ckanext.dgu.lib.theme import (categorize_package, PRIMARY_THEME,
                                       SECONDARY_THEMES)

    rev = model.repo.new_revision()
    rev.author = 'script-fix_themes.py'

    datasets = common.get_datasets(state='active',
                                   dataset_name=options.dataset,
                                   organization_ref=options.organization)

    def fix_theme(theme_str):
        '''Returns (fixed_theme_str, outcome)'''
        if not theme_str:
            return '', 'Blank'
        elif theme_str == 'null':
            return '', '"null"->""'
        elif theme_str in THEMES:
            return theme_str, 'Ok'
        else:
            fixed_theme = THEME_MAP.get(theme_str)
            if fixed_theme is None:
                return theme_str, 'Unknown theme %s - recategorizing' % theme_str
            else:
                assert (fixed_theme != theme_str)
                return fixed_theme, 'Changed to long form'

    def recategorize(pkg):
        themes = categorize_package(pkg, stats_recategorize)
        print 'Recategorize: %s' % themes
        if themes:
            pkg.extras[PRIMARY_THEME] = themes[0]
        elif PRIMARY_THEME in pkg.extras:
            pkg.extras[PRIMARY_THEME] = ''
        if len(themes) > 1:
            pkg.extras[SECONDARY_THEMES] = '["%s"]' % themes[1]
        elif SECONDARY_THEMES in pkg.extras:
            pkg.extras[SECONDARY_THEMES] = '[]'

    for package in datasets:
        if PRIMARY_THEME in package.extras:
            primary = package.extras.get(PRIMARY_THEME)
            new_primary, outcome = fix_theme(primary)
            if new_primary != primary:
                package.extras[PRIMARY_THEME] = new_primary
            output = stats_primary.add(outcome, package.name)
            if outcome != 'Ok':
                print output
            if outcome.startswith('Unknown theme'):
                recategorize(package)
                continue
        else:
            stats_primary.add('No theme', package.name)

        if SECONDARY_THEMES in package.extras:
            secondary = package.extras.get(SECONDARY_THEMES)
            try:
                secondary = json.loads(secondary)
            except ValueError:
                if secondary.startswith('{') and secondary.endswith('}'):
                    # '{Crime}' -> 'Crime'
                    secondary = secondary[1:-1].strip('\"')
                    print stats_secondary.add('Tidied {}', package.name)
                else:
                    print stats_secondary.add('Error decoding JSON', package.name)

            if secondary == {}:
                secondary = []

            new_secondary = []
            do_recategorize = False

            if not isinstance(secondary, list):
                secondary = [secondary]
            for theme_str in secondary:
                if not isinstance(theme_str, basestring):
                    print stats_secondary.add(
                        'Not a list of strings %s' % type(theme_str),
                        package.name)
                    continue
                new_theme, outcome = fix_theme(theme_str)
                if new_theme:
                    new_secondary.append(new_theme)
                if outcome != 'Ok':
                    print stats_secondary.add(outcome, package.name)
                if outcome.startswith('Unknown theme'):
                    do_recategorize = True
            if do_recategorize:
                recategorize(package)
                continue

            if json.dumps(new_secondary) != package.extras.get(SECONDARY_THEMES):
                stats_secondary.add('Fixed', package.name)
                package.extras[SECONDARY_THEMES] = json.dumps(new_secondary)
            else:
                stats_secondary.add('Ok', package.name)
        else:
            stats_secondary.add('No theme', package.name)

        if 'themes-secondary' in package.extras:
            print stats_secondary.add(
                'Old key removed: themes-secondary',
                '%s %s' % (package.name, package.extras['themes-secondary']))
            del package.extras['themes-secondary']

    print "\nPrimary theme:"
    print stats_primary.report()
    print "\nSecondary theme:"
    print stats_secondary.report()
    print "\nRecategorizations:"
    print stats_recategorize.report()

    if options.write:
        print 'Writing'
        model.Session.commit()
def main():
    config = common.load_config()
    update_assembly_info(config)
    generate_version_file(config)
    update_readme(config)
        current_value = a * b / c + value_breakpoint.value_min
        return current_value

    def get_breakpoints(self, value_type, sensor_value):
        return Breakpoints.query.filter(
            Breakpoints.value_min <= sensor_value,
            Breakpoints.value_max >= sensor_value,
            Breakpoints.sensor_value_type_id == value_type).first()

    def save_area_data(self, record_aqi, values, sensor_id):
        v = {}
        for key in values:
            if not isinstance(key, str):
                v[key.type] = values[key]
            else:
                v[key] = values[key]
        area = AreaModel(aqi=record_aqi, sensor_id=sensor_id, **v)
        self.db.session.add(area)
        self.db.session.commit()


if __name__ == '__main__':
    load_config()
    app = create_app()
    app.app_context().push()

    calc = AQICalculator(db)
    result = calc.execute()
def upload(channel_directory, filename, filepath):
    config = load_config()
    dbx = dropbox.Dropbox(config['dropbox_api_access_token'])
    upload_chunk_size = config['dropbox_chunk_size_mb'] * 1024 * 1024

    # Dropbox doesn't support characters defined outside the BMP.
    # This includes most, but not all emoji.
    filename = re.sub(r'[^\u0000-\uffff]', '', filename)
    full_path = DROPBOX_ROOT / channel_directory / filename
    log.info(f'Full upload path is {full_path}')

    total_size = filepath.stat().st_size
    total_chunks = (total_size // upload_chunk_size) + 1
    log.info(f'Uploading in {total_chunks} chunks')

    log.info('Starting session')
    session = dbx.files_upload_session_start(b'')

    uploaded = 0
    hasher = DropboxContentHasher()
    for chunk_num in range(total_chunks):
        log.info(f'Uploading chunk {chunk_num}')
        is_last_chunk = chunk_num == total_chunks - 1
        cursor = dropbox.files.UploadSessionCursor(
            session_id=session.session_id,
            offset=uploaded,
        )
        upload_chunk(dbx, cursor, filepath, chunk_num, upload_chunk_size,
                     hasher, is_last_chunk)
        uploaded += total_size % upload_chunk_size if is_last_chunk else upload_chunk_size

    log.info('Finishing session')
    file_metadata = dbx.files_upload_session_finish(
        b'',
        dropbox.files.UploadSessionCursor(
            session_id=session.session_id,
            offset=uploaded,
        ),
        dropbox.files.CommitInfo(path=str(full_path)),
    )

    local_hash = hasher.hexdigest()
    remote_hash = file_metadata.content_hash
    # TODO: Actually take some sort of action based on this,
    # especially considering that downloaders log to some file
    # no one will ever see. Just retry reuploading it maybe?
    assert local_hash == remote_hash, f'Local hash {local_hash} and remote hash {remote_hash} do not match'

    # Get the shared link and thumbnail
    # XXX: Maybe we should fetch the Youtube thumbnail way earlier?
    shared_link = dbx.sharing_create_shared_link(str(full_path))
    _, thumbnail_resp = dbx.files_get_thumbnail(
        str(full_path),
        format=dropbox.files.ThumbnailFormat.png,
        size=dropbox.files.ThumbnailSize.w1024h768,
    )

    # This is probably pretty brittle
    url = shared_link.url.replace("www.dropbox", "dl.dropboxusercontent")

    return (url, thumbnail_resp.content)
def setUp(self):
    config = load_config()
    config['io_threads'] = 1
    self.ctx = FedMsgContext(**config)
def command(cls, config_ini, options, submissions_csv_filepath): # Inventive CSV. Columns: # applicationnumber, applicationdate, jobrole, laname, officerauthorised, theme, responsedate, acceptancestatus, odicertificateurl, dguurl, inventoryurl, localcodes, dataseturl, schemaurl, guidanceurl, frequencyofpublishing, foinumberest, submissioncomplete, lastlaupdate, techreviewstatus, lasttechupdate, adminreviewstatus, paymentamount, closed, lastadminupdate, applicantnotes, administrationnotes, technicalnotes, lastupdated with open(submissions_csv_filepath, 'rb') as f: csv = UnicodeCsvReader(f, encoding='iso-8859-1') header = csv.next() header = [col_name.strip().lower().replace(' ', '_') for col_name in header] Submission = namedtuple('Submission', header) submissions = [Submission(*row) for row in csv] if config_ini: # this is only for when running from the command-line #print 'Loading CKAN config...' common.load_config(config_ini) common.register_translator() #print '...done' from ckan import model from ckan.plugins import toolkit from ckanext.dgu.lib import helpers as dgu_helpers from ckanext.dgu.model.schema_codelist import Schema log = __import__('logging').getLogger(__name__) # Match the organizations in the submissions lga_orgs_by_dgu_org_name = {} accepted_submission_dgu_orgs = set() for submission in submissions: la_title = la_map.get(submission.laname, submission.laname) org = model.Session.query(model.Group) \ .filter_by(title=la_title) \ .first() assert org, 'Submission org title not found: %r' % la_title lga_orgs_by_dgu_org_name[org.name] = submission.laname if submission.acceptancestatus == 'Accepted': accepted_submission_dgu_orgs.add(org.name) stats = Stats() stats_incentive = Stats() results = [] if options.write: rev = model.repo.new_revision() rev.author = 'script-%s.py' % __file__ # Iterate over organizations if options.dataset: dataset = toolkit.get_action('package_show')(data_dict={'id': options.dataset}) org_names = [dataset['organization']['name']] elif options.organization: org_names = [options.organization] elif options.incentive_only: org_names = sorted(accepted_submission_dgu_orgs) else: org_names = dgu_helpers.all_la_org_names() #print '%s organizations' % len(org_names) for org_name in org_names: org_title = model.Group.by_name(org_name).title lga_org = lga_orgs_by_dgu_org_name.get(org_name) # Iterate over the schemas if options.schema: schema = all_schemas_by_dgu_name[options.schema] if options.incentive_only and not schema.lga_name: # not an incentive schema, so no results schemas = [] elif options.incentive_only: schemas = [all_schemas_by_lga_name[submission.theme] for submission in submissions if submission.laname == lga_org and submission.theme == schema.lga_name and submission.acceptancestatus == 'Accepted'] else: schemas = [all_schemas_by_lga_name.get( options.schema, schema)] elif options.incentive_only: schemas = [all_schemas_by_lga_name[submission.theme] for submission in submissions if submission.laname == lga_org and submission.acceptancestatus == 'Accepted'] else: schemas = all_schemas #print '%s schemas' % len(schemas) for schema in schemas: # Find the relevant incentive submission if lga_org: for submission in submissions: if submission.laname == lga_org and \ submission.theme == schema.lga_name: break else: submission = None else: submission = None result = dict( org_name=org_name, org_title=org_title, org_name_lga=submission.laname if submission else '', schema_dgu_title=schema.dgu_schema_name, schema_lga=schema.lga_name, 
lga_application_number=submission.applicationnumber if submission else '', lga_application_acceptance_status=submission.acceptancestatus if submission else '', dataset_names=[], dataset_titles=[], dataset_schema_applied=[], ) stat_id = '%s %s' % (org_name, schema.lga_name) if submission: stat_id += ' %s' % submission.applicationnumber def add_datasets_to_results(datasets, result): for dataset in datasets: if dataset['name'] not in result['dataset_names']: result['dataset_names'].append(dataset['name']) result['dataset_titles'].append(dataset['title']) schema_applied = True if schema.dgu_schema_name in \ [s['title'] for s in dataset.get('schema', [])] \ else False result['dataset_schema_applied'].append(schema_applied) if not schema_applied and options.write: pkg = model.Package.get(dataset['name']) schema_obj = Schema.by_title(schema.dgu_schema_name) assert schema_obj, schema.dgu_schema_name try: schema_ids = json.loads(pkg.extras.get('schema') or '[]') except ValueError: log.error('Not valid JSON in schema field: %s %r', dataset['name'], pkg.extras.get('schema')) schema_ids = [] schema_ids.append(schema_obj.id) pkg.extras['schema'] = json.dumps(schema_ids) # Already a schema? data_dict = {'fq': 'publisher:%s ' % org_name + 'schema_multi:"%s"' % schema.dgu_schema_name} datasets = toolkit.get_action('package_search')(data_dict=data_dict) if datasets['count'] > 0: add_datasets_to_results(datasets['results'], result) stats.add('OK - Dataset with schema', stat_id + ' %s' % ';'.join(result['dataset_names'])) found_schema = True else: found_schema = False # Submission specifies DGU dataset if submission and submission.dguurl: match = re.match('http://data.gov.uk/dataset/(.*)', submission.dguurl) if match: dataset_name = dataset_name_original = match.groups()[0] # some have trailing / dataset_name = dataset_name.strip('/') # hampshire have a hash appended if '#' in dataset_name: dataset_name = dataset_name.split('#')[0] # poole have a resource name appended if '/resource' in dataset_name: dataset_name = dataset_name.split('/resource')[0] # manual corrections if dataset_name in dataset_name_corrections: dataset_name = dataset_name_corrections[dataset_name] dataset = model.Package.by_name(dataset_name) # salford ones added a '1' if not dataset: dataset = model.Package.by_name(dataset_name + '1') if dataset: dataset_name += '1' if dataset and dataset.state == 'active': dataset_dict = toolkit.get_action('package_show')(data_dict={'id': dataset.id}) add_datasets_to_results([dataset_dict], result) if dataset_name != dataset_name_original: stats_incentive.add('OK - DGU Dataset listed and with corrections it checks out', stat_id + ' %s' % dataset_name) else: stats_incentive.add('OK - DGU Dataset listed and it checks out', stat_id + ' %s' % dataset_name) elif dataset: stats_incentive.add('ERROR - DGU Dataset listed BUT it is deleted!', '%s %s' % (stat_id, submission.dguurl)) else: stats_incentive.add('ERROR - DGU Dataset listed BUT it is not found', '%s %s' % (stat_id, submission.dguurl)) else: stats_incentive.add('ERROR - DGU Dataset listed BUT the URL is not the correct format', '%s %s' % (stat_id, submission.dguurl)) # Submission mentions dataset on LA site - maybe it is in DGU already? 
elif submission and submission.dataseturl: datasets = model.Session.query(model.Package) \ .join(model.ResourceGroup) \ .join(model.Resource) \ .filter(model.Resource.url==submission.dataseturl) \ .filter(model.Package.state=='active') \ .filter(model.Resource.state=='active') \ .all() dataset_dicts = [ toolkit.get_action('package_show')(data_dict={'id': dataset.id}) for dataset in datasets] add_datasets_to_results(dataset_dicts, result) if len(datasets) > 1: stats_incentive.add('No DGU Dataset, but Dataset URL matches multiple DGU datasets', '%s %s' % (stat_id, datasets[0].name)) elif len(datasets) == 0: stats_incentive.add('No DGU Dataset and Dataset URL not found on DGU', stat_id) else: stats_incentive.add('No DGU Dataset, but Dataset URL matches DGU dataset', '%s %s' % (stat_id, datasets[0].name)) # Search for datasets in the catalogue datasets = cls.find_dataset_for_schema(schema=schema, org_name=org_name) if datasets is None: if not found_schema: stats.add('Search revealed none', stat_id) elif len(datasets) > 1: add_datasets_to_results(datasets, result) if not found_schema: stats.add('Found datasets (multiple) in search', '%s %r' % (stat_id, [d['name'] for d in datasets])) elif datasets: add_datasets_to_results(datasets, result) if not found_schema: stats.add('Found dataset in search', '%s %s' % (stat_id, datasets[0]['name'])) else: if not found_schema: stats.add('No dataset for submission', stat_id) results.append(result) rows_with_datasets_count = \ len([result for result in results if any(result['dataset_schema_applied'])]) rows_with_datasets_or_candidate_datasets_count = \ len([result for result in results if result['dataset_schema_applied']]) if options.print_: print '\n Incentive stats\n' + stats_incentive.report() print '\n Overall stats\n' + stats.report() if options.write: print 'Writing' model.Session.commit() return {'table': results, 'rows_with_datasets_count': rows_with_datasets_count, 'rows_with_datasets_or_candidate_datasets_count': rows_with_datasets_or_candidate_datasets_count}
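The dguurl handling above mixes several clean-up steps into the main loop; a standalone sketch of the same normalisation (the helper name is hypothetical, and it assumes the same dataset_name_corrections mapping) could read:

import re

# Hypothetical helper, not part of the original script: condenses the dguurl
# clean-up steps shown above into one place.
def normalise_dgu_dataset_name(dguurl, corrections=None):
    match = re.match('http://data.gov.uk/dataset/(.*)', dguurl)
    if not match:
        return None
    name = match.groups()[0]
    name = name.strip('/')              # some submissions have a trailing /
    name = name.split('#')[0]           # strip an appended fragment
    name = name.split('/resource')[0]   # strip an appended resource path
    if corrections and name in corrections:
        name = corrections[name]
    return name

The caller would still need the trailing-'1' fallback for the renamed datasets, since that check requires a database lookup.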
def init():
    global config
    config = common.load_config()
finished = True
start = time.time()
for iso in get_isos():
    if iso.match() and fileset.tas(iso.name):
        if not os.path.exists(iso.name):
            iso.download()

    # links are time-sensitive; if get_isos() is out of date we will need
    # to re-fetch
    if time.time() > start + 300:
        finished = False
        break

if __name__ == "__main__":
    config = common.load_config()
    args = parse_args()

    if args["list"]:
        for iso in get_isos():
            print("[%c] %s" % ([" ", "*"][iso.match()], iso.name))
        sys.exit(0)

    common.mkdirs(config["rhn-dumps-base"])
    os.chdir(config["rhn-dumps-base"])
    lock = common.Lock(".lock")

    threads = []
    for i in range(int(config["rhn-dumps-threads"])):
        t = threading.Thread(target=worker, name=i)
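fileset.tas() is used above so that only one worker thread claims each ISO; a minimal sketch of such a test-and-set container, assuming that is what the (not shown) fileset object provides:

import threading

# Assumption: fileset behaves like a thread-safe "claim once" set. This is a
# sketch of that idea, not the project's actual implementation.
class ClaimSet(object):
    def __init__(self):
        self._claimed = set()
        self._lock = threading.Lock()

    def tas(self, name):
        """Return True the first time a name is claimed, False afterwards."""
        with self._lock:
            if name in self._claimed:
                return False
            self._claimed.add(name)
            return True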
if __name__ == '__main__':
    usage = """Tool to migrate QA data from TaskStatus to QA table

    usage: %prog [options] <ckan.ini>
    """
    parser = OptionParser(usage=usage)
    parser.add_option("-w", "--write",
                      action="store_true", dest="write",
                      help="write the changes")
    parser.add_option('-p', '--publisher', dest='publisher')
    parser.add_option('-d', '--dataset', dest='dataset')
    parser.add_option('-r', '--resource', dest='resource')
    (options, args) = parser.parse_args()
    if len(args) != 1:
        parser.error('Wrong number of arguments (%i)' % len(args))
    config_ini = args[0]
    print 'Loading CKAN config...'
    common.load_config(config_ini)
    common.register_translator()
    print 'Done'
    # Set up logging so that only this module's logger prints debug output;
    # everything else stays at WARNING
    rootLogger = logging.getLogger()
    rootLogger.setLevel(logging.WARNING)
    localLogger = logging.getLogger(__name__)
    localLogger.setLevel(logging.DEBUG)
    handler = logging.StreamHandler()
    handler.setFormatter(logging.Formatter('%(message)s'))
    localLogger.addHandler(handler)
    migrate(options)
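migrate() is defined elsewhere; a rough sketch of the shape it is expected to have, following the write-gating pattern the other scripts here use (the TaskStatus filter and the loop body are assumptions, not the real migration logic):

def migrate_sketch(options):
    from ckan import model
    task_statuses = model.Session.query(model.TaskStatus) \
                         .filter(model.TaskStatus.task_type == 'qa')
    for task_status in task_statuses:
        # ...translate each TaskStatus row into a row in the QA table...
        pass
    if options.write:
        print('Writing')
        model.Session.commit()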
def command(cls, config_ini, dataset_names, options): common.load_config(config_ini) common.register_translator() from pylons import config apikey = config['dgu.merge_datasets.apikey'] ckan = ckanapi.RemoteCKAN('https://data.gov.uk', apikey=apikey) #ckan = ckanapi.LocalCKAN() if options.publisher: org_name = common.name_stripped_of_url(options.publisher) if options.search: results = ckan.action.package_search(q=options.search, fq='publisher:%s' % org_name, rows=100) dataset_names.extend([dataset['name'] for dataset in results['results']]) else: org = ckan.action.organization_show(id=org_name, include_datasets=True) dataset_names.extend([d['name'] for d in org['packages']]) datasets = [] datasets_by_name = {} def get_extra(dataset, key): for extra in dataset['extras']: if extra['key'] == key: return extra['value'] for dataset_name in dataset_names: print 'Dataset: %s' % dataset_name for dataset_name in dataset_names: # strip off the url part of the dataset name, if there is one dataset_name = common.name_stripped_of_url(dataset_name) dataset = ckan.action.package_show(id=dataset_name) harvest_source_ref = get_extra(dataset, 'harvest_source_reference') if harvest_source_ref: print '** Discarding dataset %s due to harvest source: %s **' \ % (dataset_name, harvest_source_ref) continue datasets.append(dataset) datasets_by_name[dataset['name']] = dataset datasets.sort(key=lambda x: x['metadata_modified']) # aggregate resources def resource_identity(res_dict, dataset_name): return (res_dict.get('date'), res_dict['url'], res_dict.get('title') or res_dict['description'], res_dict.get('format'), dataset_name) combined_resources = {} # identity res_stats = Stats() for dataset in datasets: for resource in dataset['resources']: identity = resource_identity(resource, dataset['name']) resource['dataset_name'] = dataset['name'] if identity in combined_resources: print res_stats.add('Discarding duplicate', '\n%s duplicate of \n%s' % (resource, combined_resources[identity])) else: combined_resources[identity] = resource resources = combined_resources.values() # find dates for resources # NB This has been pulled out into timeseries_convert.py - # TODO call that instead of having the code here too. 
if options.frequency: url_munge_re = re.compile('(%20|-|_|\.)') def fields_to_hunt_for_date(res): date = res.get('date') if date: yield 'date', date title = res.get('title') if title: yield 'title', title yield 'description', res['description'] yield 'url', url_munge_re.sub(' ', res['url']) if not options.update: dataset = datasets_by_name[res['dataset_name']] yield 'dataset-title', dataset['title'] yield 'dataset-notes', dataset['notes'] ensure_regexes_are_initialized() global regexes for resource in resources: for field_name, field_value in fields_to_hunt_for_date(resource): if options.frequency in ('monthly', 'quarterly', 'twice annually'): month, year = hunt_for_month_and_year(field_value) if year and month: resource['date'] = '%02d/%s' % (month, year) res_stats.add('Found date in %s' % field_name, '%s %r' % (resource['date'], resource)) if resource.get('resource_type') == 'documentation': resource['resource_type'] = 'file' res_stats.add('Converted additional resource', resource) break elif options.frequency == 'annually': year = regexes['year'].search(field_value) if year: resource['date'] = year.groups()[0] res_stats.add('Found date in %s' % field_name, '%s %r' % (resource['date'], resource)) if resource.get('resource_type') == 'documentation': resource['resource_type'] = 'file' res_stats.add('Converted additional resource', resource) break else: if resource.get('resource_type') == 'documentation': print res_stats.add('Could not find date but it\'s Additional Resource', resource) continue print res_stats.add('Could not find date', resource) continue print 'Resources: \n', res_stats resources_without_date = [res for res in resources if not res.get('date') and res.get('resource_type') != 'documentation'] for i, res in enumerate(resources_without_date): print 'Resources without dates %s/%s' % (i+1, len(resources_without_date)) for field_name, field_value in fields_to_hunt_for_date(res): print ' %s: %s' % (field_name, field_value.encode('latin-1', 'ignore')) print 'https://data.gov.uk/dataset/%s/resource/%s' % (res['dataset_name'], res['id']) date_format = {'annually': 'YYYY', 'monthly': 'MM/YYYY', 'twice annually': 'MM/YYYY', 'quarterly': 'MM/YYYY'} input_ = raw_input('Date (%s) or DOCS to make it an Additional Resource: ' % date_format[options.frequency]) if input_.strip().lower() == 'docs': res['date'] = '' res['resource_type'] = 'documentation' else: res['date'] = input_ resources.sort(key=lambda x: x.get('date', '').split('/')[::-1]) # Ensure there is not a mixture of resources with and without a date have_dates = None for res in resources: if res.get('resource_type') == 'documentation': continue if have_dates is None: have_dates = bool(res.get('date')) else: has_date = bool(res.get('date')) if has_date != have_dates: print [res.get('date') for res in resources] print 'Cannot mix resources with dates and others without!' import pdb pdb.set_trace()
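hunt_for_month_and_year() and the regexes dict are defined elsewhere (and, per the note above, duplicated in timeseries_convert.py); as a rough illustration of the kind of matching involved, with made-up patterns rather than the real ones:

import re

# Illustrative only - the real patterns live in timeseries_convert.py and are
# more thorough than this.
MONTH_YEAR_RE = re.compile(
    r'\b(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\w*[ _-]*(20\d{2})\b',
    re.IGNORECASE)
MONTHS = ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
          'jul', 'aug', 'sep', 'oct', 'nov', 'dec']

def hunt_for_month_and_year_sketch(text):
    match = MONTH_YEAR_RE.search(text)
    if not match:
        return None, None
    month = MONTHS.index(match.group(1).lower()) + 1
    return month, match.group(2)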
import socket
import json
import logging

from common import load_config, file_default_config
from datanode import DataNode

if __name__ == "__main__":
    # socket.setdefaulttimeout(20)
    config = load_config("config/datanode.json")
    client_datanode_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    host = ""
    port = config["client_comm_port"]
    client_datanode_socket.bind((host, port))
    client_datanode_socket.listen(config["max_client_num"])
    datanode_instance = DataNode(config)
    while True:
        client, addr = client_datanode_socket.accept()
        print("Connect from {}".format(addr))
        # receive command from client
        client_command = client.recv(2048).decode('utf-8')
        # acknowledge that the command was received
        client.send(bytes("ack", encoding="utf-8"))
        client_command = json.loads(client_command)
        blockinfo = client_command.get("block_info", None)
        block = None
        ret_block = None
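The loop above expects a JSON command followed by an 'ack' reply; a minimal client-side sketch of that handshake, assuming the same config file and a localhost connection (the command payload itself is made up for illustration):

import json
import socket

from common import load_config

def send_datanode_command(command_dict):
    # Sketch of the client side of the exchange the datanode loop expects.
    config = load_config("config/datanode.json")
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    sock.connect(("localhost", config["client_comm_port"]))
    sock.send(json.dumps(command_dict).encode("utf-8"))
    ack = sock.recv(2048).decode("utf-8")  # the datanode replies "ack" first
    # ...any block payload would follow here, depending on the command...
    sock.close()
    return ack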
def main():
    """ Main function: run Infernal, filter results and flag RNA genes in TRAPID db. """
    cmd_args = parse_arguments()
    # Read experiment's initial processing configuration file
    config = common.load_config(cmd_args.ini_file_initial, {"infernal", "trapid_db", "experiment"})
    # The web application sets the Rfam clan string to 'None' in case the user chose no clans
    # If this is the case, exit the script with an information message
    if config["infernal"]["rfam_clans"] == "None":
        sys.stderr.write("[Message] No Rfam clans selected: skip ncRNA annotation step.\n")
        sys.exit()
    try:
        # Run Infernal, parse and export results to DB
        sys.stderr.write('[Message] Starting ncRNA annotation procedure: %s\n' % time.strftime('%Y/%m/%d %H:%M:%S'))
        exp_id = config["experiment"]["exp_id"]
        tmp_exp_dir = config["experiment"]["tmp_exp_dir"]
        rfam_dir = config["infernal"]["rfam_dir"]
        exp_clans = config["infernal"]["rfam_clans"].split(",")
        # Lists containing all needed parameters for `common.db_connect()` (TRAPID + reference DB)
        trapid_db_data = common.get_db_connection_data(config, 'trapid_db')
        reference_db_data = common.get_db_connection_data(config, 'reference_db')
        db_connection = common.db_connect(*trapid_db_data)
        common.update_experiment_log(exp_id, 'start_nc_rna_search', 'Infernal', 2, db_connection)
        db_connection.close()
        create_infernal_files(exp_id, tmp_exp_dir, rfam_dir, exp_clans, trapid_db_data)
        # run_cmpress(exp_id=exp_id, tmp_exp_dir=tmp_exp_dir)
        total_m_nts = get_infernal_z_value(exp_id, trapid_db_data)
        infernal_tblout = run_infernal(exp_id, tmp_exp_dir, total_m_nts)
        # Filter Infernal tabulated output (keep best non-overlapping matches)
        # infernal_tblout_filtered = filter_out_overlaps(exp_id=exp_id, tmp_exp_dir=tmp_exp_dir, tblout_file=infernal_tblout)
        infernal_tblout_filtered = keep_best_results(exp_id, tmp_exp_dir, infernal_tblout)
        # Get filtered results as list of dict and add clan information
        # Read Rfam clan information from `clanin` file. Would it make more sense to retrieve it when creating it?
        cm_clans = get_exp_cm_clans(exp_id, tmp_exp_dir)
        filtered_infernal_results = infernal_tblout_to_list(infernal_tblout_filtered, cm_clans)
        infernal_results = infernal_tblout_to_list(infernal_tblout, cm_clans)
        # Flag potential RNA genes (set `is_rna_gene` value to 1 and `rf_ids` in `transcripts` table)
        flag_rna_genes(exp_id, filtered_infernal_results, trapid_db_data)
        # Store filtered results in `rna_similarities` ...
        store_rna_similarities(exp_id, infernal_results, trapid_db_data)
        # ... and `rna_families`
        store_rna_families(exp_id, filtered_infernal_results, trapid_db_data)
        # Annotate transcripts using GO terms from Rfam
        rfam_go = retrieve_rfam_go_data(trapid_db_data)
        go_data = get_go_data(reference_db_data)
        # perform_go_annotation(exp_id, infernal_results, rfam_go, go_data, tmp_exp_dir)
        perform_go_annotation(exp_id, filtered_infernal_results, rfam_go, go_data, tmp_exp_dir)
        # That's it for now
        db_connection = common.db_connect(*trapid_db_data)
        common.update_experiment_log(exp_id, 'stop_nc_rna_search', 'Infernal', 2, db_connection)
        db_connection.close()
        sys.stderr.write('[Message] Finished ncRNA annotation procedure: %s\n' % time.strftime('%Y/%m/%d %H:%M:%S'))
    # If any exception was raised, update the experiment's log, set status to 'error', and exit
    except Exception:
        print_exc()
        common.stop_initial_processing_error(exp_id, trapid_db_data)
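infernal_tblout_to_list() is not shown here; Infernal's --tblout files are whitespace-delimited with '#' comment lines, so a minimal reader might look like the sketch below (the real parser also attaches Rfam clan information, and the exact column count depends on the program and format option used):

def read_tblout_rows(tblout_path):
    # Sketch only: split each data line into at most 18 fields so that the
    # free-text description at the end stays in one piece.
    rows = []
    with open(tblout_path) as tblout:
        for line in tblout:
            if line.startswith('#') or not line.strip():
                continue  # skip comment and blank lines
            rows.append(line.split(None, 17))
    return rows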
import socket
import json

from common import load_config
from namenoode import FileSystem

normal_message = {"success": True, "message": "ack"}
normal_message_bytes = json.dumps(normal_message).encode('utf-8')

if __name__ == "__main__":
    # socket.setdefaulttimeout(20)
    config = load_config("config/namenode.json")
    client_server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    host = ""
    port = config["client_comm_port"]
    client_server_socket.bind((host, port))
    client_server_socket.listen(config["max_client_num"])
    file_system = FileSystem(config=config["file_system_config"])
    while True:
        client, addr = client_server_socket.accept()
        print("client address: {}".format(addr))
        # receive command from client
        client_command = client.recv(4096).decode('utf-8')
        client_command = json.loads(client_command)
        if client_command["command"] == "test_list":
            file_system.test_out()
            client.send(normal_message_bytes)
        if client_command["command"] == "list":
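If more commands follow the truncated "list" branch, the chain of if statements can be swapped for a dispatch table; a sketch of that structure, reusing the module-level normal_message_bytes above (handler names are invented for illustration):

def handle_test_list(client, file_system, command):
    file_system.test_out()
    client.send(normal_message_bytes)

HANDLERS = {
    "test_list": handle_test_list,
    # "list": handle_list, ...
}

def dispatch(client, file_system, command):
    # Look the command up instead of walking an if-chain; unknown commands
    # get an explicit error reply.
    handler = HANDLERS.get(command.get("command"))
    if handler is None:
        client.send(json.dumps({"success": False,
                                "message": "unknown command"}).encode("utf-8"))
        return
    handler(client, file_system, command)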