def main():
    logger = get_logger(LOGDIR)
    logger.info('start')

    logger.info('load faq data')
    qa_df = pd.read_csv(MUSCLE_QA)
    q_txt = qa_df['q_txt'].tolist()
    sep_q_txt = [to_sep_space(i) for i in q_txt]

    logger.info('load NN Language Model')
    embed = hub.Module(JA_NNLM_MODEL)
    embeddings = embed(sep_q_txt)

    logger.info('to vectors')
    vecs = execute(embeddings)
    logger.info('vector shape: {}'.format(vecs.shape))

    while True:
        text = six.moves.input('>> ')
        if text == '':
            break
        sep_input = to_sep_space(text)
        embeddings = embed([sep_input])
        vec = execute(embeddings)
        sort_i, sim = get_sim_index(vec, vecs)
        df = qa_df.loc[sort_i]
        show_sim_faq(df, sim)

    logger.info('end')

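# The helpers execute() and get_sim_index() are defined elsewhere in the project.
# A minimal sketch of what they might look like, assuming TF1-style hub modules and
# cosine similarity ranked in descending order; the real implementations may differ.
import numpy as np
import tensorflow as tf


def execute(embeddings):
    # Evaluate the hub-module embedding tensor and return it as a NumPy array.
    with tf.Session() as sess:
        sess.run([tf.global_variables_initializer(), tf.tables_initializer()])
        return sess.run(embeddings)


def get_sim_index(query_vec, vecs):
    # Cosine similarity of the query vector against every candidate vector,
    # returned as indices sorted from most to least similar, plus the scores.
    q = np.asarray(query_vec).reshape(1, -1)
    v = np.asarray(vecs)
    sim = (v @ q.T).ravel() / (np.linalg.norm(v, axis=1) * np.linalg.norm(q) + 1e-10)
    sort_i = np.argsort(sim)[::-1]
    return sort_i, sim[sort_i]
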
def main():
    logger = get_logger(LOGDIR)
    logger.info('start')

    logger.info('load NN Language Model')
    embed = hub.Module(EN_NNLM_MODEL)

    qa_df = pd.read_csv(WIKIQA_DIR + '/WikiQA-test.tsv', sep='\t')
    maps = []
    mrrs = []
    for q_id in qa_df['QuestionID'].unique():
        df = qa_df[qa_df['QuestionID'] == q_id]
        if 1 not in df['Label'].unique():
            logger.debug('{0}: not answer'.format(q_id))
            continue
        q_doc = df['Question'].iloc[0].lower()
        embeddings = embed([q_doc])
        q_vec = execute(embeddings)
        a_docs = df['Sentence'].map(lambda x: x.lower()).tolist()
        embeddings = embed(a_docs)
        a_vecs = execute(embeddings)
        sort_i, sim = get_sim_index(q_vec, a_vecs)
        labels = [i for i, v in enumerate(df['Label']) if v == 1]
        rank = [i + 1 for i, v in enumerate(sort_i) if v in labels]
        _mrr = 1 / rank[0]
        _map = sum([1 / i for i in rank]) / len(rank)
        maps.append(_map)
        mrrs.append(_mrr)
        logger.info('{0}: MAP {1}, MRR {2}'.format(q_id, _map, _mrr))

    map_avg = sum(maps) / len(maps)
    mrr_avg = sum(mrrs) / len(mrrs)
    logger.info('MAP AVG {0} / MRR AVG {1}'.format(map_avg, mrr_avg))
    logger.info('end')

def main():
    logger = get_logger(LOGDIR)
    logger.info('start')

    logger.info('1. Load the trained model.')
    ae = AutoEncoder.load(MUSCLE_MODEL)
    encoder = ae.get_encoder()

    logger.info('2. Load the corpus.')
    corpus = ReutersMuscleCorpus.load(MUSCLE_CORPUS)

    logger.info('3. Set the infer model.')
    infer = Infer(encoder, corpus)

    qa_df = pd.read_csv(MUSCLE_QA)
    q_txts = qa_df['q_txt'].tolist()
    vecs = np.array([infer(d) for d in q_txts])

    # Example queries (Japanese):
    #   超回復とは (What is supercompensation?)
    #   夏までに痩せたい (I want to lose weight by summer)
    #   睡眠時間はどのくらいが良いですか? (How much sleep is ideal?)
    while True:
        text = six.moves.input('>> ')
        if text == '':
            break
        vec = infer(text)
        sort_i, sim = get_sim_index([vec], vecs)
        df = qa_df.loc[sort_i]
        show_sim_faq(df, sim)

    logger.info('end')

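# Infer is the project's own wrapper around the trained encoder and the corpus
# preprocessing. A hedged sketch of the idea only; the tokenization helper
# to_embed_seq() used below is an assumption, not the corpus's documented API.
import numpy as np


class Infer:
    def __init__(self, encoder, corpus):
        self.encoder = encoder  # Keras encoder: (seq_size, embed_size) -> latent vector
        self.corpus = corpus    # word segmentation and word -> embedding lookup

    def __call__(self, text):
        # Convert the text into a fixed-length sequence of word embeddings,
        # then encode it into a single latent vector.
        seq = self.corpus.to_embed_seq(text)  # hypothetical helper
        return self.encoder.predict(np.array([seq]))[0]
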
def main():
    logger = get_logger(LOGDIR)
    logger.info('start')

    logger.info('1. Load Japanese word2vec embeddings.')
    embed_matrix, vocab = load_vectors(JAWIKI_MODEL)
    logger.info('embedding shape is {}'.format(embed_matrix.shape))

    logger.info('2. Prepare the corpus.')
    corpus = ReutersMuscleCorpus()
    corpus.build(embed_matrix, vocab, seq_size)
    corpus.save(MUSCLE_CORPUS)

    logger.info('3. Make autoencoder model.')
    ae = AutoEncoder(seq_size=seq_size,
                     embed_size=embed_matrix.shape[1],
                     latent_size=latent_size)
    ae.build()

    logger.info('4. Train model.')
    ae.model.compile(optimizer="adam", loss="mse")
    train_iter = corpus.batch_iter(batch_size)
    train_step = corpus.get_step_count(batch_size)
    valid_iter = corpus.batch_iter(batch_size)
    valid_step = corpus.get_step_count(batch_size)
    ae.model.fit_generator(train_iter, train_step, epochs=n_epoch,
                           validation_data=valid_iter,
                           validation_steps=valid_step,
                           callbacks=[
                               TensorBoard(log_dir=LOGDIR),
                               ModelCheckpoint(filepath=MUSCLE_MODEL,
                                               save_best_only=True)
                           ])
    logger.info('end')

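# load_vectors() is expected to return the embedding matrix and its vocabulary.
# A minimal sketch, assuming the Japanese Wikipedia word2vec model is stored in the
# standard word2vec format readable by gensim; the project's real loader may differ.
import numpy as np
from gensim.models import KeyedVectors


def load_vectors(path, binary=True):
    # Load pretrained word2vec vectors as a (vocab_size, embed_size) matrix
    # plus the ordered word list.
    kv = KeyedVectors.load_word2vec_format(path, binary=binary)
    vocab = list(kv.index2word)  # gensim < 4.0; use kv.index_to_key in >= 4.0
    return np.asarray(kv.vectors), vocab
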
while scroll_size > 0:
    self._logger.debug("Scrolling...")
    response = self._es.scroll(scroll_id=sid, scroll='2m')
    # Process the current batch of hits
    list_median = self.process_hits_update_dangerous_files(
        response['hits']['hits'], list_median)
    # Update the scroll ID
    sid = response['_scroll_id']
    # Number of results returned by the last scroll
    scroll_size = len(response['hits']['hits'])

# promedio(list_median)
# moda(list_median)
# media(list_median)
self._logger.info(f"Mode: {stats.mode(list_median)}")
self._logger.info(f"Mean: {numpy.mean(list_median)}")
self._logger.info(f"Median: {numpy.median(list_median)}")


if __name__ == '__main__':
    startTotal = timer()
    logger = functions.get_logger(True, 'elk')
    e = Elastic("127.0.0.1", logger)
    e.update_dangerous_files()
    endTotal = timer()
    logger.debug('Total time: {} sec'.format(
        endTotal - startTotal))  # time in seconds, e.g. 5.38

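# The scroll loop above assumes an initial scrolling search earlier in the same
# method. A minimal sketch of that setup, assuming a match_all query; the index
# name ('downloads') and page size are placeholders, not the project's real values.
response = self._es.search(index='downloads', scroll='2m', size=1000,
                           body={'query': {'match_all': {}}})
sid = response['_scroll_id']
scroll_size = len(response['hits']['hits'])
list_median = self.process_hits_update_dangerous_files(
    response['hits']['hits'], [])
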
def main():
    logger = get_logger(LOGDIR)
    logger.info('start')

    logger.info('1. Load WikiQA text')
    wikiqa_text = load_wikiqa()
    min_w = min([len(i.split()) for i in wikiqa_text])
    max_w = max([len(i.split()) for i in wikiqa_text])
    logger.info('{0} sentences, {1}-{2} words'.format(len(wikiqa_text), min_w, max_w))

    logger.info('2. Load GloVe embeddings.')
    embed_matrix, vocab = load_glove_vectors(GLOVE_MODEL, d=GLOVE_SIZE)
    logger.info('embedding shape is {}'.format(embed_matrix.shape))

    logger.info('3. Prepare the corpus.')
    corpus = ReutersMuscleCorpus()
    corpus.build(embed_matrix, vocab, seq_size)
    corpus.documents = wikiqa_text
    corpus.save(WIKIQA_CORPUS)

    logger.info('4. Make autoencoder model.')
    ae = AutoEncoder(seq_size=seq_size,
                     embed_size=embed_matrix.shape[1],
                     latent_size=latent_size)
    ae.build()

    logger.info('5. Train model.')
    ae.model.compile(optimizer="adam", loss="mse")
    train_iter = corpus.batch_iter(batch_size)
    train_step = corpus.get_step_count(batch_size)
    ae.model.fit_generator(
        train_iter, train_step, epochs=n_epoch,
        # validation_data=train_iter,
        # validation_steps=train_step,
        callbacks=[
            TensorBoard(log_dir=LOGDIR),
            ModelCheckpoint(filepath=WIKIQA_MODEL, save_best_only=True)
        ]
    )

    logger.info('6. Load the encoder.')
    encoder = ae.get_encoder()

    logger.info('7. Set the infer model.')
    infer = Infer(encoder, corpus)

    logger.info('8. Evaluate the model.')
    qa_df = pd.read_csv(WIKIQA_DIR + '/WikiQA-test.tsv', sep='\t')
    maps = []
    mrrs = []
    for q_id in qa_df['QuestionID'].unique():
        df = qa_df[qa_df['QuestionID'] == q_id]
        if 1 not in df['Label'].unique():
            logger.debug('{0}: not answer'.format(q_id))
            continue
        q_doc = df['Question'].iloc[0].lower()
        q_vec = infer(q_doc)
        a_docs = df['Sentence'].map(lambda x: x.lower()).tolist()
        a_vecs = [infer(d) for d in a_docs]
        sort_i, sim = get_sim_index([q_vec], a_vecs)
        labels = [i for i, v in enumerate(df['Label']) if v == 1]
        rank = [i + 1 for i, v in enumerate(sort_i) if v in labels]
        _mrr = 1 / rank[0]
        _map = sum([1 / i for i in rank]) / len(rank)
        maps.append(_map)
        mrrs.append(_mrr)
        logger.info('{0}: MAP {1}, MRR {2}'.format(q_id, _map, _mrr))

    map_avg = sum(maps) / len(maps)
    mrr_avg = sum(mrrs) / len(mrrs)
    logger.info('MAP AVG {0} / MRR AVG {1}'.format(map_avg, mrr_avg))
    logger.info('end')

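# load_glove_vectors() parses the raw GloVe text file into a matrix and vocabulary.
# A minimal sketch under that assumption; the project's real loader may differ
# (for example, it may add padding or unknown-word tokens).
import numpy as np


def load_glove_vectors(path, d=300):
    # Each GloVe line is: word value_1 ... value_d
    vocab = []
    rows = []
    with open(path, encoding='utf-8') as f:
        for line in f:
            parts = line.rstrip().split(' ')
            if len(parts) != d + 1:
                continue  # skip malformed lines
            vocab.append(parts[0])
            rows.append(np.asarray(parts[1:], dtype=np.float32))
    return np.vstack(rows), vocab
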
end_sample_date = user_input_config.end_sample_date

# Decide which configuration to use
with open(config_file, 'r') as stream:
    config = yaml.safe_load(stream)
# pprint.pprint(config)
tz = config.get('tz')
datetime_standard_format = config.get('datetime_standard_format')
analysis_type = config.get('analysis_type')
spark_config = config.get('SPARK', "")
mssql_config = config.get('MSSQL', "")
filter_config = config.get('FILTER', "")
chronic_config = config.get('CHRONIC', "")

logger = get_logger(tz=tz, identifier=str(fab) + analysis_type, debug=verbose)
logger.info("=" * 40 + " script starts " + "=" * 40)
logger.info("Analysis type is " + analysis_type)

if 'rda' in analysis_type:
    area_list = filter_config.get('area')
    folder_list = filter_config.get('folder')
    query_ooc_only_flag = filter_config.get('query_ooc_only_flag')
    vio_type_list_csv = filter_config.get('vio_type_list_csv')
    cutoff_hour = int(filter_config.get('cutoff_hour'))
    query_interval_in_seconds = int(
        filter_config.get('query_interval_in_seconds'))
    buffer_seconds = int(filter_config.get('buffer_seconds'))
    latest_ooc_min_count = int(filter_config.get('latest_ooc_min_count'))

        action='store_true',
        help='Verbose flag (boolean).',
        default=False)  # it can also be set on the same line
    my_parser.set_defaults(ip=config['DEFAULTS']['ELASTIC_IP'])
    my_parser.set_defaults(index=config['DEFAULTS']['ELASTIC_INDEX'])
    # my_parser.print_help()
    return my_parser.parse_args()


if __name__ == '__main__':
    startTotal = timer()
    arg = create_arg()
    logger = functions.get_logger(arg.verbose, 'elk')
    e = Elastic(arg.ip, logger)
    if arg.mapping is not None:
        e.create_mapping(arg.index, arg.mapping)
    if arg.update:
        # Step 1: create the download JSON files and get the hash of each one
        e.create_json_downloads_pending(
            just_download=True)   # create wget/curl JSON files that do not yet exist
        e.create_json_downloads_pending(
            just_download=False)  # create wget/curl JSON files that do not yet exist
        # Step 2: get the danger level of each hash
        e.update_dangerous_files()

parser.add_argument('--config',
                    action="store",
                    dest='config',
                    required=True,
                    help='configuration file')
parser.add_argument('--debug',
                    action="store_true",
                    dest='debug',
                    required=False,
                    default=False,
                    help='enable or disable debug mode in logging')
user_input_config = parser.parse_args()
fab = user_input_config.fab
config_file = user_input_config.config
verbose = user_input_config.debug

# Decide which configuration to use
with open(config_file, 'r') as stream:
    config = yaml.safe_load(stream)
# pprint.pprint(config)
tz = config.get('tz')

logger = get_logger(tz, debug=verbose)
logger.info("####################" * 4)
logger.info("=" * 40 + " script starts " + "=" * 40)

datetime_standard_format = config.get('datetime_standard_format')
mssql_config = config.get('MSSQL', "")
teradata_config = config.get('TERADATA', "")

teradata_server = teradata_config.get('server')
teradata_user = teradata_config.get('user')
teradata_password = base64.b64decode(teradata_config.get('password'))
td = TeradataUtil(server=teradata_server,
                  user=teradata_user,
                  password=teradata_password)

mssql_server = mssql_config.get('server')
mssql_user = mssql_config.get('user')
mssql_password = base64.b64decode(mssql_config.get('password'))
mssql_database = mssql_config.get('database')
mssql_port = mssql_config.get('port')