Example #1
def main():
    parser = argparse.ArgumentParser(
        prog='dbmanager.py',
        usage='%(prog)s [options]',
        description='tool for managing the conversion of raw data in '
        'different formats into elasticsearch',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        '-c',
        '--config',
        dest='config',
        default="{}/{}".format('${PWD}', 'config.yml'),
        help='specify a config file path to use for the cli/daemon')
    parser.add_argument('-i',
                        '--index',
                        dest='i',
                        action='store_true',
                        help='index source data into elasticsearch')
    parser.add_argument('-ci',
                        '--create-index',
                        dest='ci',
                        action='store_true',
                        help='create elasticsearch index')
    parser.add_argument('-ri',
                        '--re-index',
                        dest='ri',
                        action='store_true',
                        help='re-index source data into elasticsearch')
    parser.add_argument('-di',
                        '--delete-index',
                        dest='di',
                        action='store_true',
                        help='delete elasticsearch index')
    parser.add_argument('-d',
                        '--display',
                        dest='display',
                        action='store_true',
                        help='display information about available data sets')

    args = vars(parser.parse_args())
    if 'PWD' in args['config']:
        config = utils.load_config(
            os.path.join(os.path.dirname(__file__), 'config', 'config.yaml'))
    else:
        config = utils.load_config(args['config'])
    if not config:
        raise Exception("EXITING:FAILED_TO_LOAD_CONFIG")
    manager = DBManager(config)

    if args['i']:
        manager.index()
    if args['ci']:
        manager.create_index()
    if args['ri']:
        manager.reindex()
    if args['di']:
        manager.delete_index()
    if args['display']:
        manager.display_information()
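
A note on the -c/--config default above: argparse does not expand shell variables, so the default stays the literal string "${PWD}/config.yml". The check "if 'PWD' in args['config']" therefore only matches when the user left the default in place, in which case the script falls back to the config file bundled next to the module.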
Example #2
def find_candidate_clusters(oclcs):

    # APPLICATION SETUP
    # load environment
    env = Env()
    env.read_env()

    ROOT_PATH = os.environ.get("ZEPHIR_ROOT_PATH") or os.path.join(
        os.path.dirname(__file__))
    ENV = os.environ.get("ZEPHIR_ENV")
    CONFIG_PATH = os.environ.get("ZEPHIR_CONFIG_PATH") or os.path.join(
        ROOT_PATH, "config")
    OVERRIDE_CONFIG_PATH = os.environ.get("ZEPHIR_OVERRIDE_CONFIG_PATH")

    # load all configuration files in directory
    config = utils.load_config(CONFIG_PATH)

    # used in testing, config files in test data will override local config files
    if OVERRIDE_CONFIG_PATH is not None:
        config = utils.load_config(OVERRIDE_CONFIG_PATH, config)

    db = config.get("database", {}).get(ENV)

    sql_select = (
        f"select zr.cid from zephir_identifier_records zir "
        f"join zephir_identifiers zi on zir.identifier_autoid = zi.autoid "
        f"join zephir_records zr on zr.autoid = zir.record_autoid "
        f"where (type='oclc' and identifier in ('1570562')) "
        f"or (type = 'contrib_sys_id' and identifier= '') "
        f"order by zi.type desc, cid; ")

    candidate_list = []
    try:
        conn_args = {
            "user": db.get("username", None),
            "password": db.get("password", None),
            "host": db.get("host", None),
            "database": db.get("database", None),
            "unix_socket": None,
        }

        socket = os.environ.get("ZEPHIR_DB_SOCKET") or config.get("socket")

        if socket:
            conn_args["unix_socket"] = socket

        conn = mysql.connector.connect(**conn_args)

        cursor = conn.cursor()
        cursor.execute(sql_select)

        for idx, cid_row_result in enumerate(cursor):
            candidate_list.append(cid_row_result[0])

    finally:
        cursor.close()
        conn.close()

    return candidate_list
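
The example above layers an override directory on top of the base configuration by calling utils.load_config(CONFIG_PATH) and then utils.load_config(OVERRIDE_CONFIG_PATH, config). The project's actual helper is not shown in this listing; a minimal sketch with the same calling convention (YAML files keyed by filename is an assumption) could look like this:

import os
import yaml  # assumption: configuration files are YAML


def load_config(path, base=None):
    """Sketch only: merge every YAML file in path over an optional base dict."""
    config = dict(base or {})
    for name in sorted(os.listdir(path)):
        stem, ext = os.path.splitext(name)
        if ext in (".yml", ".yaml"):
            with open(os.path.join(path, name)) as fh:
                # each file contributes one top-level key named after the file,
                # matching lookups such as config.get("database", {}) above
                config[stem] = yaml.safe_load(fh) or {}
    return config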
Example #3
    def __init__(self, config):
        self.config = config
        self.logger = Logger('logs/debug.log').log
        self.mongo = MongoConnector(load_config('conf/db.conf.json'))
        self.main_page_selectors = config['selectors']['main_page']
        self.post_page_selectors = config['selectors']['post_page']
        self.base_url = config['base_url']
Example #4
def main(do_test=False, job_title='', cwd=None):
    if cwd is None:
        config_overwrites = {
            'job_title': job_title,
            'n_batches_preview': 0,
        }
        config = gen_config(config_overwrites)
    else:
        config = load_config(cwd)

    if do_test:
        test_config = config.copy()
        test_config.update({
            'is_test_run': True,
            'job_title': 'test_run',
            'n_batches_preview': 0,
            'subsampling': 256,
            'submission_subsampling': 32,
            'steps_per_epoch': 1,
            'steps_per_epoch_for_valid': 1,
            'pretraining_n_epochs': 1,
            'n_epochs': 100,
            'lr_scan_n_epochs': 25,
        })
        train_wrapper(test_config)

    if do_test != 'only':
        train_wrapper(config)
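
Note that do_test is effectively tri-state here: with the default False only the real configuration is trained, with True a reduced test configuration runs first and full training still follows, and with the string 'only' the final train_wrapper(config) call is skipped after the test run.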
Example #5
def get_configs_by_filename(config_dir_name, config_file):
    """return configs defined in the config_file as a dictionary
       config_dir_name: directory of configuration files
       config_file: configuration filename
    """
    ROOT_PATH = os.path.dirname(os.path.abspath(__file__))
    CONFIG_PATH = os.path.join(ROOT_PATH, config_dir_name)

    # load all configuration files in directory
    configs = utils.load_config(CONFIG_PATH)

    return configs.get(config_file)
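
A hypothetical call, assuming a sibling "config" directory that contains a database.yml file, would be:

# hypothetical usage; the directory and file names are assumptions
db_settings = get_configs_by_filename("config", "database")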
Example #6
def main(cwd):
    config = load_config(cwd)

    # os.environ['CUDA_VISIBLE_DEVICES'] = config['cuda_visible_devices'] or '0,1,2,3,4,5,6,7'

    # avail_gpus = GPUtil.getAvailable(limit=100)
    # os.environ['CUDA_VISIBLE_DEVICES'] = ','.join(map(str, avail_gpus))

    os.environ['CUDA_VISIBLE_DEVICES'] = '0,1,2,3'

    debug(
        f"os.environ['CUDA_VISIBLE_DEVICES'] = {os.environ['CUDA_VISIBLE_DEVICES']}"
    )

    print_config(config)

    after_training_for_folds(config)
Example #7
def main():
    config = utils.load_config(__file__)
    data = utils.load_data(__file__)

    count = {}

    for code in data:
        for interface in data[code]['interfaces']:
            is_active = config['active'][code][interface]
            residues = data[code]['interfaces'][interface]['residues']

            if is_active:
                for r in residues:
                    acid = r['resn']
                    if acid in count:
                        count[acid] += 1
                    else:
                        count[acid] = 1

    total = sum(count.values())

    for acid in sorted(count.keys()):
        print(acid, '{:1.1f}'.format(100 * count[acid] / total),
              '({0} / {1})'.format(count[acid], total))
Example #8
    def extract_from_html_dir(self, html_dir_path):
        map = {
            "公告id": [],
            "甲方": [],
            "乙方": [],
            "项目名称": [],
            "合同名称": [],
            "合同金额上限": [],
            "合同金额下限": [],
            "联合体成员": []
        }

        config = load_config(FLAGS.resource.config_file2)
        with open(FLAGS.resource.map_file, "rb") as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)
        tf_config = tf.ConfigProto()
        tf_config.gpu_options.allow_growth = True
        with tf.Session(config=tf_config) as sess:
            model = create_model(sess, Model, FLAGS.resource.ckpt_dir,
                                 load_word2vec, config, id_to_char, False)
            trans = model.trans.eval()
            for html_id in tqdm(os.listdir(html_dir_path)):
                self._extract_from_html_dir(html_dir_path, html_id, map, sess,
                                            trans, model, id_to_tag, tag_to_id,
                                            char_to_id)

        dataframe = pd.DataFrame(data=map,
                                 columns=[
                                     "公告id", "甲方", "乙方", "项目名称", "合同名称",
                                     "合同金额上限", "合同金额下限", "联合体成员"
                                 ],
                                 dtype=None,
                                 copy=False)
        if os.path.exists('ht_result.csv'):
            os.remove('ht_result.csv')
        dataframe.to_csv("ht_result.csv", encoding="utf_8_sig")
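
The encoding="utf_8_sig" argument writes a UTF-8 byte-order mark at the start of ht_result.csv, which lets spreadsheet tools such as Excel detect the encoding and display the Chinese column headers correctly.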
Example #9
                                       epoch_it=epoch_it,
                                       it=it,
                                       loss_val_best=metric_val_best)

    # Quit after the maximum number of epochs is reached
    logger.info(
        'Training completed after {} Epochs ({} it) with best val metric ({})={}'
        .format(epoch_it, it, model_selection_metric, metric_val_best))


if __name__ == "__main__":
    logger = logging.getLogger()

    parser = argparse.ArgumentParser()
    parser.add_argument('config', type=str, help='Path to the config file.')
    args = parser.parse_args()

    cfg = load_config(args.config)

    # Create the output dir if it does not exist
    if not os.path.exists(cfg['misc']['log_dir']):
        os.makedirs(cfg['misc']['log_dir'])

    logger, checkpoint_dir = prepare_logger(cfg, cfg['misc']['log_path'])

    cfg['misc']['log_dir'] = checkpoint_dir
    # Argument: path to the config file
    logger.info('Torch version: {}'.format(torch.__version__))

    main(cfg, logger)
Example #10
def train(FLAGS):
    # load data sets
    train_sentences = load_sentences(FLAGS.resource.train_file,
                                     FLAGS.trainer.zeros)
    test_sentences = load_sentences(FLAGS.resource.test_file,
                                    FLAGS.trainer.zeros)
    update_tag_scheme(
        train_sentences,
        FLAGS.model.tag_schema)  # Use selected tagging scheme (IOB / IOBES)
    update_tag_scheme(test_sentences, FLAGS.model.tag_schema)

    # create maps if not exist
    if not os.path.isfile(FLAGS.resource.map_file):
        if FLAGS.trainer.pre_emb:  # create dictionary for word
            dico_chars_train = char_mapping(train_sentences,
                                            FLAGS.trainer.lower)[0]
            _, char_to_id, id_to_char = augment_with_pretrained(
                dico_chars_train.copy(), FLAGS.resource.emb_file,
                list(
                    itertools.chain.from_iterable([[w[0] for w in s]
                                                   for s in test_sentences])))
        else:
            _, char_to_id, id_to_char = char_mapping(train_sentences,
                                                     FLAGS.trainer.lower)
        _, tag_to_id, id_to_tag = tag_mapping(
            train_sentences)  # Create a dictionary and a mapping for tags
        with open(FLAGS.resource.map_file, "wb") as f:
            pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.resource.map_file, "rb") as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

    # prepare data, get a collection of list containing index
    train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id,
                                 FLAGS.trainer.lower)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id,
                                FLAGS.trainer.lower)
    train_manager = BatchManager(train_data, FLAGS.trainer.batch_size)
    test_manager = BatchManager(test_data, 100)

    # make path for store log and model if not exist
    if os.path.isfile(FLAGS.resource.config_file2):
        config = load_config(FLAGS.resource.config_file2)
    else:
        config = _config_model(FLAGS, char_to_id, tag_to_id)
        save_config(config, FLAGS.resource.config_file2)
    print_config(config, logger)

    # limit GPU memory
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    steps_per_epoch = train_manager.len_data
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.resource.ckpt_dir,
                             load_word2vec, config, id_to_char)
        logger.info("Start raining")
        loss = []
        for i in range(FLAGS.trainer.max_epoch):
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                if step % FLAGS.trainer.steps_check == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, "
                                "NER loss:{:>9.6f}".format(
                                    iteration, step % steps_per_epoch,
                                    steps_per_epoch, np.mean(loss)))
                    loss = []

            best = return_f1(FLAGS, sess, model, "test", test_manager,
                             id_to_tag)
            if best:
                save_model(sess, model, FLAGS.resource.ckpt_dir)
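
For reference, when the IOBES schema is selected, update_tag_scheme rewrites plain IOB tags so that entity boundaries become explicit: a sequence such as B-PER I-PER O becomes B-PER E-PER O, and a single-token entity B-LOC becomes S-LOC.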
Example #11
                        help='Batch size',
                        required=False)
    parser.add_argument('--run_test',
                        action='store_true',
                        required=False,
                        default=False)
    parser.add_argument('--suppress_deprecated',
                        action='store_true',
                        required=False,
                        default=False)
    parser.add_argument('--show_progress', type=str2bool, required=False)
    parser.add_argument('--log_file', type=str, required=False)
    args = parser.parse_args()

    if args.run_test:
        model_config = load_config('config/test_model_config.yaml')
    else:
        model_config = load_config('config/model_config.yaml')
    train_config = load_config('config/train_config.yaml')

    if args.suppress_deprecated:
        warnings.filterwarnings("ignore", category=DeprecationWarning)
        warnings.filterwarnings("ignore", category=UserWarning)
    if args.show_progress is not None:
        train_config['show_progress'] = args.show_progress

    if args.log_file:
        if os.path.exists(args.log_file):
            os.remove(args.log_file)
        set_file_logger(args.log_file)
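
The --show_progress flag above is parsed with type=str2bool; argparse has no built-in boolean parser, so a helper along these lines is commonly defined (a sketch of the usual pattern, not necessarily this project's implementation):

import argparse


def str2bool(value):
    """Parse textual booleans for argparse (common helper, sketched here)."""
    if isinstance(value, bool):
        return value
    if value.lower() in ("yes", "true", "t", "y", "1"):
        return True
    if value.lower() in ("no", "false", "f", "n", "0"):
        return False
    raise argparse.ArgumentTypeError("boolean value expected, got %r" % value)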
Example #12
def audit(filepath, quiet, verbose, dry_run, suffix):
    """Audit.py: Audit ZED log file to ensure all the data is represented in
    the database"""
    # Print handler to manage when and how messages should print
    console = ConsoleMessenger(quiet, verbose)

    # REQUIREMENTS
    if len(filepath) == 0:
        console.error("No files given to process.")
        sys.exit(1)

    # APPLICATION SETUP
    # load environment
    env = environs.Env()
    env.read_env()

    ROOT_PATH = os.environ.get("ZED_ROOT_PATH") or os.path.dirname(__file__)
    ENV = os.environ.get("ZED_ENV")
    CONFIG_PATH = os.environ.get("ZED_CONFIG_PATH") or os.path.join(
        ROOT_PATH, "config")
    OVERRIDE_CONFIG_PATH = os.environ.get("ZED_OVERRIDE_CONFIG_PATH")

    # load all configuration files in directory
    config = utils.load_config(CONFIG_PATH)

    # used in testing, config files in test data will override local config files
    if OVERRIDE_CONFIG_PATH is not None:
        config = utils.load_config(OVERRIDE_CONFIG_PATH, config)

    # Print handler to manage when/where messages should print
    console = ConsoleMessenger(quiet, verbose)

    # DATABASE SETUP
    # Create database client, connection manager.
    db = config.get("zed_db", {}).get(ENV)

    DB_CONNECT_STR = str(utils.db_connect_url(db))

    engine = sqla.create_engine(DB_CONNECT_STR)

    # Create classes through reflection
    Base = sqla_automap.automap_base()
    Base.prepare(engine, reflect=True)
    Event = Base.classes.events

    # Create a session to the database.
    Session = sqla.orm.sessionmaker()
    Session.configure(bind=engine)
    session = Session()

    if dry_run:
        console.diagnostic("DRY RUN")

    # Iterate over the json log files to process
    for file in filepath:

        if not os.path.isfile(file):
            console.error(
                "File path '{0}' does not exist. Exiting...".format(file))
            break

        # # Get the file name, path, and create destination file name, path
        f_path, f_name = os.path.split(file)
        renamed_file = os.path.join("{0}.{1}".format(file, suffix))

        if os.path.isfile(renamed_file):
            console.error(
                "Audit file '{0}' already exists.".format(renamed_file))
            break

        log_events = []
        db_events = set()
        file_pass = True  # Assume valid until line found invalid
        # Open file and process
        with open(file) as f_io:
            ln_cnt = 0
            console.diagnostic("Auditing: " + file)
            for line in f_io:
                ln_cnt += 1
                try:
                    log_events.append(json.loads(line.strip()))
                except json.decoder.JSONDecodeError:
                    file_pass = False
                    console.error(
                        "ERROR: Innvalid JSON on line {0}".format(ln_cnt))
                    break  # invalid json, stop successive validation routines

        if file_pass and len(log_events) > 0:
            query_params = {
                "event_type":
                log_events[0]["type"],
                "first_timestamp":
                (iso8601.parse_date(log_events[0]["timestamp"]) -
                 datetime.timedelta(seconds=60)).isoformat("T"),
                "last_timestamp":
                (iso8601.parse_date(log_events[-1]["timestamp"]) +
                 datetime.timedelta(seconds=60)).isoformat("T"),
            }

            session = Session()
            try:
                query = (session.query(Event.event_key).filter(
                    Event.timestamp >= query_params["first_timestamp"]).filter(
                        Event.timestamp <= query_params["last_timestamp"]).
                         filter(Event.type == query_params["event_type"]))

                for event in query.all():
                    db_events.add(event.event_key)
            except Exception as e:
                session.rollback()
                raise e
            finally:
                session.close()

            for event in log_events:
                if not event["event"] in db_events:
                    file_pass = False
                    console.error(
                        "ERROR: Missing event {0} in database.".format(
                            event["event"]))

        # Report results
        if file_pass is False:
            console.error("File {0}: fail.".format(file))
        else:
            if not dry_run:
                os.rename(file, renamed_file)
            console.report("File {0}: pass. {1} event(s) audited.\
            ".format(file, len(log_events)))

    console.report("Done!")
    sys.exit(0)
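
The audit loop above expects each log line to be a JSON object carrying at least "event", "type", and "timestamp" fields; an illustrative line (field names taken from the code, values invented) would be:

{"event": "evt-0001", "type": "ingest", "timestamp": "2020-01-01T12:00:00+00:00"}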
Example #13
    def get_content_from_post(self, url):
        html = requests.get(url)
        tree = etree.HTML(html.text)
        post_content = process_document_text("".join(
            tree.xpath('.//{}[contains(@class, "{}")]//text()'.format(
                self.post_page_selectors['content']['tag'],
                self.post_page_selectors['content']['selector'])))).strip()
        return post_content

    def create_inverted_index(self, content, document_id):
        inverted_index = {}
        forward_index = {
            index: word
            for index, word in enumerate(content.split())
        }
        for index, word in forward_index.items():
            word_normalized = normalize_word(word)
            inverted_index.setdefault(word_normalized, []).append(index)
        return inverted_index

    @staticmethod
    def dom_element_get_children(root, selector_data):
        return root.xpath('.//{}[contains(@class, "{}")]'.format(
            selector_data['tag'], selector_data['selector']))


if __name__ == '__main__':
    crawler = Crawler(load_config('conf/crawler.conf.json'))
    crawler.update_database()
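
For the create_inverted_index method above, a hypothetical call illustrates the shape of the result (assuming normalize_word simply lowercases):

# crawler.create_inverted_index("The cat saw the cat", document_id=1)
# -> {"the": [0, 3], "cat": [1, 4], "saw": [2]}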
Example #14
        request.args.get('start') or timestamp_day_decrement())
    end_timestamp = int(request.args.get('end') or timestamp_today())
    word = normalize_word(request.args.get('word').lower())
    if not word:
        return jsonify({'message': 'word not specified'}), 400
    index_by_documents = mongo.filter_index_by_documents(
        start_timestamp, end_timestamp)
    documents_number = len(index_by_documents)
    documents_with_word = 0
    for document_index in index_by_documents:
        if word in document_index:
            documents_with_word += 1
    if documents_with_word == 0:
        return jsonify({'message': 'word does not appear in documents'}), 400
    idf = math.log10(documents_number / documents_with_word)
    return jsonify({'word': word, 'idf': idf})


if __name__ == '__main__':
    mongo = MongoConnector(load_config('conf/db.conf.json'))
    logger = Logger('logs/debug.log').log

    config = load_config('conf/app.conf.json')
    assert isinstance(config['port'], int)

    crawler = Crawler(load_config('conf/crawler.conf.json'))

    app = Flask(__name__)
    app.register_blueprint(api)
    app.run(host=config['host'], port=config['port'], debug=True)
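
As a quick check of the idf computation in this example: with 1,000 indexed documents and a word that appears in 10 of them, idf = log10(1000 / 10) = 2.0, while a word present in every document gets idf = log10(1) = 0.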
Example #15
            face_model_intepreter (tflite_runtime.interpreter): Instance of tflite_runtime.interpreter for face model
            telegram_people (Dict): Contains name:id pairs of all authorized telegram users
            telegram_token (String): The token needed to use the bot
        """
        self.ip_cam_objects = ip_cam_objects
        # self.tf_intepreter = tf_intepreter
        self.person_model = person_model
        self.face_model = face_model
        self.telegram_people = telegram_people
        self.telegram_ids = set(telegram_people.values())
        self.telegram_token = telegram_token


if __name__ == "__main__":
    # --- Load options from config ---
    config_dict = utils.load_config()

    ip_cams = config_dict["ip_cams"]  # load up list of ip_cams
    telegram_token = config_dict[
        "telegram_token"]  # load token for telegram bot
    people = config_dict["people"]  # Load up dict of people for telegram bot

    # Create the videostream objects for each ip cam
    ip_cam_objects = {
        ip_cam: VideoStream(ip_cams[ip_cam], ip_cam)
        for ip_cam in ip_cams
    }

    # Pre-load tf models
    person_model, face_model = tflite.load_models()
Example #16
def main():
    parser = argparse.ArgumentParser(
        prog='dbmanager.py',
        usage='%(prog)s [options]',
        description='tool for managing the conversion of raw data in '
        'different formats into elasticsearch',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        '-c',
        '--config',
        dest='config',
        default="{}/{}".format('${PWD}', 'config.yml'),
        help='specify a config file path to use for the cli/daemon')
    parser.add_argument('-in',
                        '--index-names',
                        dest='in',
                        action='store_true',
                        help='index available names in data')
    parser.add_argument('-ip',
                        '--index-nutrients',
                        dest='ip',
                        action='store_true',
                        help='index available nutrients from data')
    parser.add_argument(
        '-rin',
        '--reindex-names',
        dest='rin',
        action='store_true',
        help=
        're-index available names in data and re-configure index. WARNING: WILL DELETE EXISTING '
        'INDEX')
    parser.add_argument(
        '-rip',
        '--reindex-nutrients',
        dest='rip',
        action='store_true',
        help=
        're-index available nutrients in data and re-configure index. WARNING: WILL DELETE EXISTING '
        'INDEX')
    parser.add_argument('-up',
                        '--upload-profiles',
                        dest='up',
                        action='store_true',
                        help='upload normalised food profiles into firebase')
    parser.add_argument(
        '-rt',
        '--remote-tunnel',
        dest='rt',
        action='store_true',
        default=False,
        help='fetch data from a remote mongodb instance via a SSH tunnel')
    parser.add_argument('-d',
                        '--display',
                        dest='display',
                        action='store_true',
                        help='display information about available data sets')

    args = vars(parser.parse_args())
    if 'PWD' in args['config']:
        config = utils.load_config(
            os.path.join(os.path.dirname(__file__), 'config', 'config.yml'))
    else:
        config = utils.load_config(args['config'])
    if not config:
        raise Exception("EXITING:FAILED_TO_LOAD_CONFIG")
    manager = DBManager(config, remote=args['rt'])

    if args['in']:
        manager.index_names()
    if args['ip']:
        manager.index_nutrients()
    if args['rip']:
        manager.reindex_nutrients()
    if args['rin']:
        manager.reindex_names()
    if args['display']:
        manager.display_information()
    if args['up']:
        manager.upload_foodprofiles()