def main():
    parser = argparse.ArgumentParser(
        prog='dbmanager.py',
        usage='%(prog)s [options]',
        description='tool for managing the conversion of raw data in '
                    'different formats into elasticsearch',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        '-c', '--config',
        dest='config',
        default="{}/{}".format('${PWD}', 'config.yml'),
        help='specify a config file path to use for the cli/daemon')
    parser.add_argument('-i', '--index', dest='i', action='store_true',
                        help='index source data into elasticsearch')
    parser.add_argument('-ci', '--create-index', dest='ci',
                        action='store_true',
                        help='create elasticsearch index')
    parser.add_argument('-ri', '--re-index', dest='ri', action='store_true',
                        help='re-index source data into elasticsearch')
    parser.add_argument('-di', '--delete-index', dest='di',
                        action='store_true',
                        help='delete elasticsearch index')
    parser.add_argument('-d', '--display', dest='display',
                        action='store_true',
                        help='display information about available data sets')
    args = vars(parser.parse_args())

    # Fall back to the bundled config when the '${PWD}' placeholder default
    # was not overridden on the command line.
    if 'PWD' in args['config']:
        config = utils.load_config(
            os.path.join(os.path.dirname(__file__), 'config', 'config.yml'))
    else:
        config = utils.load_config(args['config'])
    if not config:
        raise Exception("EXITING:FAILED_TO_LOAD_CONFIG")

    manager = DBManager(config)
    if args['i']:
        manager.index()
    if args['ci']:
        manager.create_index()
    if args['ri']:
        manager.reindex()
    if args['di']:
        manager.delete_index()
    if args['display']:
        manager.display_information()
def find_candidate_clusters(oclcs):
    # APPLICATION SETUP
    # load environment
    env = Env()
    env.read_env()

    ROOT_PATH = os.environ.get("ZEPHIR_ROOT_PATH") or os.path.dirname(__file__)
    ENV = os.environ.get("ZEPHIR_ENV")
    CONFIG_PATH = os.environ.get("ZEPHIR_CONFIG_PATH") or os.path.join(
        ROOT_PATH, "config")
    OVERRIDE_CONFIG_PATH = os.environ.get("ZEPHIR_OVERRIDE_CONFIG_PATH")

    # load all configuration files in directory
    config = utils.load_config(CONFIG_PATH)
    # used in testing: config files in test data override local config files
    if OVERRIDE_CONFIG_PATH is not None:
        config = utils.load_config(OVERRIDE_CONFIG_PATH, config)

    db = config.get("database", {}).get(ENV)

    # NOTE: the OCLC numbers are currently hard-coded in the query rather
    # than taken from the `oclcs` parameter.
    sql_select = (
        "select zr.cid from zephir_identifier_records zir "
        "join zephir_identifiers zi on zir.identifier_autoid = zi.autoid "
        "join zephir_records zr on zr.autoid = zir.record_autoid "
        "where (type='oclc' and identifier in ('1570562')) "
        "or (type = 'contrib_sys_id' and identifier = '') "
        "order by zi.type desc, cid;")

    candidate_list = []
    conn = None
    cursor = None
    try:
        conn_args = {
            "user": db.get("username", None),
            "password": db.get("password", None),
            "host": db.get("host", None),
            "database": db.get("database", None),
            "unix_socket": None,
        }
        socket = os.environ.get("ZEPHIR_DB_SOCKET") or config.get("socket")
        if socket:
            conn_args["unix_socket"] = socket
        conn = mysql.connector.connect(**conn_args)
        cursor = conn.cursor()
        cursor.execute(sql_select)
        for cid_row in cursor:
            candidate_list.append(cid_row[0])
    finally:
        # Guard the cleanup: cursor/conn are unbound if connect() failed.
        if cursor is not None:
            cursor.close()
        if conn is not None:
            conn.close()
    return candidate_list
def __init__(self, config):
    self.config = config
    self.logger = Logger('logs/debug.log').log
    self.mongo = MongoConnector(load_config('conf/db.conf.json'))
    self.main_page_selectors = config['selectors']['main_page']
    self.post_page_selectors = config['selectors']['post_page']
    self.base_url = config['base_url']
def main(do_test=False, job_title='', cwd=None):
    if cwd is None:
        config_overwrites = {
            'job_title': job_title,
            'n_batches_preview': 0,
        }
        config = gen_config(config_overwrites)
    else:
        config = load_config(cwd)

    if do_test:
        test_config = config.copy()
        test_config.update({
            'is_test_run': True,
            'job_title': 'test_run',
            'n_batches_preview': 0,
            'subsampling': 256,
            'submission_subsampling': 32,
            'steps_per_epoch': 1,
            'steps_per_epoch_for_valid': 1,
            'pretraining_n_epochs': 1,
            'n_epochs': 100,
            'lr_scan_n_epochs': 25,
        })
        train_wrapper(test_config)

    # do_test may also be the string 'only', which runs the test
    # configuration above and skips the full training run.
    if do_test != 'only':
        train_wrapper(config)
def get_configs_by_filename(config_dir_name, config_file):
    """Return the configs defined in config_file as a dictionary.

    config_dir_name: directory of configuration files
    config_file: configuration filename
    """
    ROOT_PATH = os.path.dirname(os.path.abspath(__file__))
    CONFIG_PATH = os.path.join(ROOT_PATH, config_dir_name)
    # load all configuration files in the directory
    configs = utils.load_config(CONFIG_PATH)
    return configs.get(config_file)
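# Hypothetical usage sketch (names are illustrative, not from the source):
# assuming a ./config directory next to this module and that
# utils.load_config keys the merged configs by filename, a
# config/database.yml would be retrieved with:
#
#   db_settings = get_configs_by_filename("config", "database")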
def main(cwd):
    config = load_config(cwd)
    # os.environ['CUDA_VISIBLE_DEVICES'] = config['cuda_visible_devices'] or '0,1,2,3,4,5,6,7'
    # avail_gpus = GPUtil.getAvailable(limit=100)
    # os.environ['CUDA_VISIBLE_DEVICES'] = ','.join(map(str, avail_gpus))
    os.environ['CUDA_VISIBLE_DEVICES'] = '0,1,2,3'
    debug(f"os.environ['CUDA_VISIBLE_DEVICES'] = "
          f"{os.environ['CUDA_VISIBLE_DEVICES']}")
    print_config(config)
    after_training_for_folds(config)
def main():
    config = utils.load_config(__file__)
    data = utils.load_data(__file__)
    count = {}
    for code in data:
        for interface in data[code]['interfaces']:
            is_active = config['active'][code][interface]
            residues = data[code]['interfaces'][interface]['residues']
            if is_active:
                for r in residues:
                    acid = r['resn']
                    count[acid] = count.get(acid, 0) + 1
    total = sum(count.values())
    for acid in sorted(count.keys()):
        # Percentage of all counted residues, plus the raw ratio.
        print(acid,
              '{:1.1f}'.format(100 * count[acid] / float(total)),
              '({0} / {1})'.format(count[acid], total))
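# Illustrative output sketch (hypothetical counts): with
# count = {'ALA': 2, 'GLY': 6} and total = 8, the loop above prints
#   ALA 25.0 (2 / 8)
#   GLY 75.0 (6 / 8)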
def extract_from_html_dir(self, html_dir_path):
    # Column keys (kept in Chinese because they become the CSV header):
    # 公告id = announcement id, 甲方 = party A, 乙方 = party B,
    # 项目名称 = project name, 合同名称 = contract name,
    # 合同金额上限 / 合同金额下限 = contract amount upper / lower bound,
    # 联合体成员 = consortium members
    records = {
        "公告id": [],
        "甲方": [],
        "乙方": [],
        "项目名称": [],
        "合同名称": [],
        "合同金额上限": [],
        "合同金额下限": [],
        "联合体成员": []
    }
    config = load_config(FLAGS.resource.config_file2)
    with open(FLAGS.resource.map_file, "rb") as f:
        char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.resource.ckpt_dir,
                             load_word2vec, config, id_to_char, False)
        trans = model.trans.eval()
        for html_id in tqdm(os.listdir(html_dir_path)):
            self._extract_from_html_dir(html_dir_path, html_id, records,
                                        sess, trans, model, id_to_tag,
                                        tag_to_id, char_to_id)
    dataframe = pd.DataFrame(data=records,
                             columns=["公告id", "甲方", "乙方", "项目名称",
                                      "合同名称", "合同金额上限", "合同金额下限",
                                      "联合体成员"])
    if os.path.exists('ht_result.csv'):
        os.remove('ht_result.csv')
    dataframe.to_csv("ht_result.csv", encoding="utf_8_sig")
            epoch_it=epoch_it, it=it, loss_val_best=metric_val_best)

    # Quit after the maximum number of epochs is reached
    logger.info(
        'Training completed after {} Epochs ({} it) with best val metric '
        '({})={}'.format(epoch_it, it, model_selection_metric,
                         metric_val_best))


if __name__ == "__main__":
    logger = logging.getLogger(__name__)

    # Argument: path to the config file
    parser = argparse.ArgumentParser()
    parser.add_argument('config', type=str, help='Path to the config file.')
    args = parser.parse_args()

    cfg = load_config(args.config)

    # Create the output dir if it does not exist
    if not os.path.exists(cfg['misc']['log_dir']):
        os.makedirs(cfg['misc']['log_dir'])

    logger, checkpoint_dir = prepare_logger(cfg, cfg['misc']['log_path'])
    cfg['misc']['log_dir'] = checkpoint_dir

    logger.info('Torch version: {}'.format(torch.__version__))

    main(cfg, logger)
def train(FLAGS):
    # load data sets
    train_sentences = load_sentences(FLAGS.resource.train_file,
                                     FLAGS.trainer.zeros)
    test_sentences = load_sentences(FLAGS.resource.test_file,
                                    FLAGS.trainer.zeros)

    # Use selected tagging scheme (IOB / IOBES)
    update_tag_scheme(train_sentences, FLAGS.model.tag_schema)
    update_tag_scheme(test_sentences, FLAGS.model.tag_schema)

    # create maps if they do not exist
    if not os.path.isfile(FLAGS.resource.map_file):
        if FLAGS.trainer.pre_emb:
            # create dictionary for words, augmented with pretrained embeddings
            dico_chars_train = char_mapping(train_sentences,
                                            FLAGS.trainer.lower)[0]
            _, char_to_id, id_to_char = augment_with_pretrained(
                dico_chars_train.copy(), FLAGS.resource.emb_file,
                list(
                    itertools.chain.from_iterable(
                        [[w[0] for w in s] for s in test_sentences])))
        else:
            _, char_to_id, id_to_char = char_mapping(train_sentences,
                                                     FLAGS.trainer.lower)
        # Create a dictionary and a mapping for tags
        _, tag_to_id, id_to_tag = tag_mapping(train_sentences)
        with open(FLAGS.resource.map_file, "wb") as f:
            pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.resource.map_file, "rb") as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

    # prepare data: get a collection of lists containing indices
    train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id,
                                 FLAGS.trainer.lower)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id,
                                FLAGS.trainer.lower)
    train_manager = BatchManager(train_data, FLAGS.trainer.batch_size)
    test_manager = BatchManager(test_data, 100)

    # load the model config if it exists, otherwise build and save it
    if os.path.isfile(FLAGS.resource.config_file2):
        config = load_config(FLAGS.resource.config_file2)
    else:
        config = _config_model(FLAGS, char_to_id, tag_to_id)
        save_config(config, FLAGS.resource.config_file2)
    print_config(config, logger)

    # limit GPU memory
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    steps_per_epoch = train_manager.len_data
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.resource.ckpt_dir,
                             load_word2vec, config, id_to_char)
        logger.info("Start training")
        loss = []
        for i in range(FLAGS.trainer.max_epoch):
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                if step % FLAGS.trainer.steps_check == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, "
                                "NER loss:{:>9.6f}".format(
                                    iteration, step % steps_per_epoch,
                                    steps_per_epoch, np.mean(loss)))
                    loss = []
            best = return_f1(FLAGS, sess, model, "test", test_manager,
                             id_to_tag)
            if best:
                save_model(sess, model, FLAGS.resource.ckpt_dir)
                    help='Batch size', required=False)
parser.add_argument('--run_test', action='store_true', required=False,
                    default=False)
parser.add_argument('--suppress_deprecated', action='store_true',
                    required=False, default=False)
parser.add_argument('--show_progress', type=str2bool, required=False)
parser.add_argument('--log_file', type=str, required=False)
args = parser.parse_args()

if args.run_test:
    model_config = load_config('config/test_model_config.yaml')
else:
    model_config = load_config('config/model_config.yaml')
train_config = load_config('config/train_config.yaml')

if args.suppress_deprecated:
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    warnings.filterwarnings("ignore", category=UserWarning)

if args.show_progress is not None:
    train_config['show_progress'] = args.show_progress

if args.log_file:
    if os.path.exists(args.log_file):
        os.remove(args.log_file)
    set_file_logger(args.log_file)
def audit(filepath, quiet, verbose, dry_run, suffix):
    """Audit.py: Audit ZED log file to ensure all the data is represented
    in the database"""
    # Print handler to manage when and how messages should print
    console = ConsoleMessenger(quiet, verbose)

    # REQUIREMENTS
    if len(filepath) == 0:
        console.error("No files given to process.")
        sys.exit(1)

    # APPLICATION SETUP
    # load environment
    env = environs.Env()
    env.read_env()

    ROOT_PATH = os.environ.get("ZED_ROOT_PATH") or os.path.dirname(__file__)
    ENV = os.environ.get("ZED_ENV")
    CONFIG_PATH = os.environ.get("ZED_CONFIG_PATH") or os.path.join(
        ROOT_PATH, "config")
    OVERRIDE_CONFIG_PATH = os.environ.get("ZED_OVERRIDE_CONFIG_PATH")

    # load all configuration files in directory
    config = utils.load_config(CONFIG_PATH)
    # used in testing: config files in test data override local config files
    if OVERRIDE_CONFIG_PATH is not None:
        config = utils.load_config(OVERRIDE_CONFIG_PATH, config)

    # DATABASE SETUP
    # Create database client, connection manager.
    db = config.get("zed_db", {}).get(ENV)
    DB_CONNECT_STR = str(utils.db_connect_url(db))
    engine = sqla.create_engine(DB_CONNECT_STR)

    # Create classes through reflection
    Base = sqla_automap.automap_base()
    Base.prepare(engine, reflect=True)
    Event = Base.classes.events

    # Create a session to the database.
    Session = sqla.orm.sessionmaker()
    Session.configure(bind=engine)
    session = Session()

    if dry_run:
        console.diagnostic("DRY RUN")

    # Iterate over the json log files to process
    for file in filepath:
        if not os.path.isfile(file):
            console.error(
                "File path '{0}' does not exist. Exiting...".format(file))
            break

        # Get the file name, path, and create destination file name, path
        f_path, f_name = os.path.split(file)
        renamed_file = "{0}.{1}".format(file, suffix)

        if os.path.isfile(renamed_file):
            console.error(
                "Audit file '{0}' already exists.".format(renamed_file))
            break

        log_events = []
        db_events = set()
        file_pass = True  # Assume valid until a line is found invalid

        # Open file and process
        with open(file) as f_io:
            ln_cnt = 0
            console.diagnostic("Auditing: " + file)
            for line in f_io:
                ln_cnt += 1
                try:
                    log_events.append(json.loads(line.strip()))
                except json.decoder.JSONDecodeError:
                    file_pass = False
                    console.error(
                        "ERROR: Invalid JSON on line {0}".format(ln_cnt))
                    break  # invalid json, stop successive validation routines

        if file_pass and len(log_events) > 0:
            # Pad the queried window by 60 seconds on either side.
            query_params = {
                "event_type": log_events[0]["type"],
                "first_timestamp":
                    (iso8601.parse_date(log_events[0]["timestamp"]) -
                     datetime.timedelta(seconds=60)).isoformat("T"),
                "last_timestamp":
                    (iso8601.parse_date(log_events[-1]["timestamp"]) +
                     datetime.timedelta(seconds=60)).isoformat("T"),
            }

            session = Session()
            try:
                query = (session.query(Event.event_key)
                         .filter(Event.timestamp >=
                                 query_params["first_timestamp"])
                         .filter(Event.timestamp <=
                                 query_params["last_timestamp"])
                         .filter(Event.type == query_params["event_type"]))
                for event in query.all():
                    db_events.add(event.event_key)
            except Exception as e:
                session.rollback()
                raise e
            finally:
                session.close()

            for event in log_events:
                if event["event"] not in db_events:
                    file_pass = False
                    console.error(
                        "ERROR: Missing event {0} in database.".format(
                            event["event"]))

        # Report results
        if file_pass is False:
            console.error("File {0}: fail.".format(file))
        else:
            if not dry_run:
                os.rename(file, renamed_file)
            console.report("File {0}: pass. {1} event(s) audited.".format(
                file, len(log_events)))

    console.report("Done!")
    sys.exit(0)
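# Hypothetical shape of one ZED log line, inferred only from the keys the
# audit reads above ("type", "timestamp", "event"); the real schema likely
# carries more fields:
#   {"event": "some-event-key", "type": "ingest", "timestamp": "2020-01-01T00:00:00+00:00"}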
def get_content_from_post(self, url):
    html = requests.get(url)
    tree = etree.HTML(html.text)
    post_content = process_document_text("".join(
        tree.xpath('.//{}[contains(@class, "{}")]//text()'.format(
            self.post_page_selectors['content']['tag'],
            self.post_page_selectors['content']['selector'])))).strip()
    return post_content

def create_inverted_index(self, content, document_id):
    # Map each normalized word to the list of positions where it occurs.
    inverted_index = {}
    forward_index = {
        index: word
        for index, word in enumerate(content.split())
    }
    for index, word in forward_index.items():
        word_normalized = normalize_word(word)
        inverted_index.setdefault(word_normalized, []).append(index)
    return inverted_index

@staticmethod
def dom_element_get_children(root, selector_data):
    return root.xpath('.//{}[contains(@class, "{}")]'.format(
        selector_data['tag'], selector_data['selector']))


if __name__ == '__main__':
    crawler = Crawler(load_config('conf/crawler.conf.json'))
    crawler.update_database()
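# Illustrative sketch (hypothetical input): for content "The cat saw the dog",
# and assuming normalize_word lower-cases tokens, create_inverted_index
# returns word positions per normalized token:
#   {'the': [0, 3], 'cat': [1], 'saw': [2], 'dog': [4]}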
        request.args.get('start') or timestamp_day_decrement())
    end_timestamp = int(request.args.get('end') or timestamp_today())
    # Guard against a missing 'word' parameter before calling .lower() on it.
    word = normalize_word((request.args.get('word') or '').lower())
    if not word:
        return jsonify({'message': 'word not specified'}), 400

    index_by_documents = mongo.filter_index_by_documents(start_timestamp,
                                                         end_timestamp)
    documents_number = len(index_by_documents)
    documents_with_word = 0
    for document_index in index_by_documents:
        if word in document_index:
            documents_with_word += 1
    if documents_with_word == 0:
        return jsonify({'message': 'word does not appear in documents'}), 400

    idf = math.log10(documents_number / documents_with_word)
    return jsonify({'word': word, 'idf': idf})


if __name__ == '__main__':
    mongo = MongoConnector(load_config('conf/db.conf.json'))
    logger = Logger('logs/debug.log').log
    config = load_config('conf/app.conf.json')
    assert isinstance(config['port'], int)
    crawler = Crawler(load_config('conf/crawler.conf.json'))
    app = Flask(__name__)
    app.register_blueprint(api)
    app.run(host=config['host'], port=config['port'], debug=True)
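# Worked example for the IDF computation above (hypothetical numbers): if the
# date range covers 1000 documents and the word appears in 10 of them,
#   idf = math.log10(1000 / 10) = 2.0
# A word that appears in every document gets idf = log10(1) = 0.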
        face_model (tflite_runtime.interpreter): Instance of
            tflite_runtime.interpreter for the face model
        telegram_people (dict): Contains name:id pairs of all authorized
            telegram users
        telegram_token (str): The token needed to use the bot
    """
    self.ip_cam_objects = ip_cam_objects
    # self.tf_intepreter = tf_intepreter
    self.person_model = person_model
    self.face_model = face_model
    self.telegram_people = telegram_people
    self.telegram_ids = set(telegram_people.values())
    self.telegram_token = telegram_token


if __name__ == "__main__":
    # --- Load options from config ---
    config_dict = utils.load_config()
    ip_cams = config_dict["ip_cams"]  # load up list of ip_cams
    telegram_token = config_dict["telegram_token"]  # token for telegram bot
    people = config_dict["people"]  # dict of people for the telegram bot

    # Create the videostream objects for each ip cam
    ip_cam_objects = {
        ip_cam: VideoStream(ip_cams[ip_cam], ip_cam)
        for ip_cam in ip_cams
    }

    # Pre-load tf models
    person_model, face_model = tflite.load_models()
def main():
    parser = argparse.ArgumentParser(
        prog='dbmanager.py',
        usage='%(prog)s [options]',
        description='tool for managing the conversion of raw data in '
                    'different formats into elasticsearch',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        '-c', '--config',
        dest='config',
        default="{}/{}".format('${PWD}', 'config.yml'),
        help='specify a config file path to use for the cli/daemon')
    parser.add_argument('-in', '--index-names', dest='in',
                        action='store_true',
                        help='index available names in data')
    parser.add_argument('-ip', '--index-nutrients', dest='ip',
                        action='store_true',
                        help='index available nutrients from data')
    parser.add_argument(
        '-rin', '--reindex-names',
        dest='rin',
        action='store_true',
        help='re-index available names in data and re-configure index. '
             'WARNING: WILL DELETE EXISTING INDEX')
    parser.add_argument(
        '-rip', '--reindex-nutrients',
        dest='rip',
        action='store_true',
        help='re-index available nutrients in data and re-configure index. '
             'WARNING: WILL DELETE EXISTING INDEX')
    parser.add_argument('-up', '--upload-profiles', dest='up',
                        action='store_true',
                        help='upload normalised food profiles into firebase')
    parser.add_argument(
        '-rt', '--remote-tunnel',
        dest='rt',
        action='store_true',
        default=False,
        help='fetch data from a remote mongodb instance via a SSH tunnel')
    parser.add_argument('-d', '--display', dest='display',
                        action='store_true',
                        help='display information about available data sets')
    args = vars(parser.parse_args())

    if 'PWD' in args['config']:
        config = utils.load_config(
            os.path.join(os.path.dirname(__file__), 'config', 'config.yml'))
    else:
        config = utils.load_config(args['config'])
    if not config:
        raise Exception("EXITING:FAILED_TO_LOAD_CONFIG")

    manager = DBManager(config, remote=args['rt'])
    if args['in']:
        manager.index_names()
    if args['ip']:
        manager.index_nutrients()
    if args['rip']:
        manager.reindex_nutrients()
    if args['rin']:
        manager.reindex_names()
    if args['display']:
        manager.display_information()
    if args['up']:
        manager.upload_foodprofiles()
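# Hypothetical invocations (flags as defined above; the config path is
# illustrative only):
#   python dbmanager.py -c /path/to/config.yml --index-names --index-nutrients
#   python dbmanager.py --reindex-nutrients --remote-tunnel --display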