Example no. 1
def main():
    logger = get_logger(LOGDIR)
    logger.info('start')

    logger.info('load faq data')
    qa_df = pd.read_csv(MUSCLE_QA)
    q_txt = qa_df['q_txt'].tolist()
    sep_q_txt = [to_sep_space(i) for i in q_txt]

    logger.info('load NN Language Model')
    embed = hub.Module(JA_NNLM_MODEL)
    embeddings = embed(sep_q_txt)

    logger.info('to vectors')
    vecs = execute(embeddings)
    logger.info('vector shape: {}'.format(vecs.shape))

    while True:
        text = six.moves.input('>> ')
        if text == '':
            break
        sep_input = to_sep_space(text)
        embeddings = embed([sep_input])
        vec = execute(embeddings)

        sort_i, sim = get_sim_index(vec, vecs)
        df = qa_df.loc[sort_i]
        show_sim_faq(df, sim)

    logger.info('end')
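The helpers used above are not shown. A minimal sketch, assuming `execute` evaluates the TF Hub embedding tensor in a TensorFlow 1.x session and `get_sim_index` ranks the FAQ vectors by cosine similarity (both behaviors are inferred from the call sites, not confirmed by the source):

import numpy as np
import tensorflow as tf

def execute(embeddings):
    # Evaluate the embedding tensor; a fresh session per call keeps the sketch simple.
    with tf.Session() as sess:
        sess.run([tf.global_variables_initializer(), tf.tables_initializer()])
        return sess.run(embeddings)

def get_sim_index(query_vec, vecs):
    # Cosine similarity of the query against every FAQ vector,
    # returned as indices sorted from most to least similar.
    vecs = np.asarray(vecs)
    q = np.asarray(query_vec).reshape(-1)
    sim = vecs @ q / (np.linalg.norm(vecs, axis=1) * np.linalg.norm(q) + 1e-9)
    sort_i = np.argsort(sim)[::-1]
    return sort_i, sim[sort_i]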
Example no. 2
def main():
    logger = get_logger(LOGDIR)
    logger.info('start')

    logger.info('load NN Language Model')
    embed = hub.Module(EN_NNLM_MODEL)

    qa_df = pd.read_csv(WIKIQA_DIR + '/WikiQA-test.tsv', sep='\t')
    maps = []
    mrrs = []
    for q_id in qa_df['QuestionID'].unique():
        df = qa_df[qa_df['QuestionID'] == q_id]
        if 1 not in df['Label'].unique():
        logger.debug('{0}: no correct answer'.format(q_id))
            continue
        q_doc = df['Question'].iloc[0].lower()
        embeddings = embed([q_doc])
        q_vec = execute(embeddings)
        a_docs = df['Sentence'].map(lambda x: x.lower()).tolist()
        embeddings = embed(a_docs)
        a_vecs = execute(embeddings)
        sort_i, sim = get_sim_index(q_vec, a_vecs)
        labels = [i for i, v in enumerate(df['Label']) if v == 1]
        rank = [i + 1 for i, v in enumerate(sort_i) if v in labels]
        _mrr = 1 / rank[0]
        _map = sum([1 / i for i in rank]) / len(rank)
        maps.append(_map)
        mrrs.append(_mrr)
        logger.info('{0}: MAP {1}, MRR {2}'.format(q_id, _map, _mrr))
    map_avg = sum(maps) / len(maps)
    mrr_avg = sum(mrrs) / len(mrrs)
    logger.info('MAP AVG {0} / MRR AVG {1}'.format(map_avg, mrr_avg))

    logger.info('end')
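As a quick sanity check of the metric lines above (toy values, not from the dataset): if the only correct answer lands at position 2 of the sorted list, both the reciprocal rank and the per-question average come out as 0.5.

# Toy check of the metric computation used above (hypothetical values).
sort_i = [3, 0, 2, 1]    # candidate indices sorted by similarity
labels = [0]             # index 0 is the only sentence labelled 1
rank = [i + 1 for i, v in enumerate(sort_i) if v in labels]   # -> [2]
_mrr = 1 / rank[0]                                            # -> 0.5
_map = sum([1 / i for i in rank]) / len(rank)                 # -> 0.5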
Example no. 3
def main():
    logger = get_logger(LOGDIR)
    logger.info('start')

    logger.info('1. Load the trained model.')
    ae = AutoEncoder.load(MUSCLE_MODEL)
    encoder = ae.get_encoder()

    logger.info('2. Load the corpus.')
    corpus = ReutersMuscleCorpus.load(MUSCLE_CORPUS)

    logger.info('3. Set the infer model.')
    infer = Infer(encoder, corpus)

    qa_df = pd.read_csv(MUSCLE_QA)
    q_txts = qa_df['q_txt'].tolist()
    vecs = np.array([infer(d) for d in q_txts])

    # Example queries to try at the prompt:
    # 超回復とは (what is supercompensation?)
    # 夏までに痩せたい (I want to lose weight by summer)
    # 睡眠時間はどのくらいが良いですか? (how much sleep is ideal?)
    while True:
        text = six.moves.input('>> ')
        if text == '':
            break
        vec = infer(text)
        sort_i, sim = get_sim_index([vec], vecs)

        df = qa_df.loc[sort_i]
        show_sim_faq(df, sim)

    logger.info('end')
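`Infer` is not defined in the snippet. A plausible minimal sketch, assuming the corpus object can turn raw text into a padded embedding sequence (the `encode` method below is hypothetical) and the Keras encoder maps it to a latent vector:

import numpy as np

class Infer:
    def __init__(self, encoder, corpus):
        self.encoder = encoder
        self.corpus = corpus

    def __call__(self, text):
        # corpus.encode is assumed to tokenize the text, look up word vectors
        # and pad/truncate to the sequence length used at training time.
        seq = self.corpus.encode(text)                    # (seq_size, embed_size)
        return self.encoder.predict(seq[np.newaxis])[0]   # latent vector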
Example no. 4
def main():
    logger = get_logger(LOGDIR)
    logger.info('start')

    logger.info('1. Load Japanese word2vec embeddings.')
    embed_matrix, vocab = load_vectors(JAWIKI_MODEL)
    logger.info('embedding shape is {}'.format(embed_matrix.shape))

    logger.info('2. Prepare the corpus.')
    corpus = ReutersMuscleCorpus()
    corpus.build(embed_matrix, vocab, seq_size)
    corpus.save(MUSCLE_CORPUS)

    logger.info('3. Make autoencoder model.')
    ae = AutoEncoder(seq_size=seq_size,
                     embed_size=embed_matrix.shape[1],
                     latent_size=latent_size)
    ae.build()

    logger.info('4. Train model.')
    ae.model.compile(optimizer="adam", loss="mse")
    train_iter = corpus.batch_iter(batch_size)
    train_step = corpus.get_step_count(batch_size)
    valid_iter = corpus.batch_iter(batch_size)
    valid_step = corpus.get_step_count(batch_size)

    ae.model.fit_generator(train_iter,
                           train_step,
                           epochs=n_epoch,
                           validation_data=valid_iter,
                           validation_steps=valid_step,
                           callbacks=[
                               TensorBoard(log_dir=LOGDIR),
                               ModelCheckpoint(filepath=MUSCLE_MODEL,
                                               save_best_only=True)
                           ])

    logger.info('end')
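`load_vectors` is not shown. A minimal sketch, assuming gensim 3.x and a binary word2vec file for the Japanese Wikipedia embeddings:

import numpy as np
from gensim.models import KeyedVectors

def load_vectors(path):
    # Hypothetical loader: returns (embedding matrix, word -> row index map).
    kv = KeyedVectors.load_word2vec_format(path, binary=True)
    vocab = {word: i for i, word in enumerate(kv.index2word)}
    return np.asarray(kv.vectors), vocab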
Example no. 5
        while scroll_size > 0:
            "Scrolling..."
            response = self._es.scroll(scroll_id=sid, scroll='2m')
            # Process current batch of hits
            list_median = self.process_hits_update_dangerous_files(
                response['hits']['hits'], list_median)
            # Update the scroll ID
            sid = response['_scroll_id']
            # Get the number of results that returned in the last scroll
            scroll_size = len(response['hits']['hits'])

        # promedio(list_median)
        # moda(list_median)
        # media(list_median)
        self._logger.info(f"Moda: {stats.mode(list_median)}")
        self._logger.info(f"Media: {numpy.mean(list_median)}")
        self._logger.info(f"Mediana: {numpy.median(list_median)}")


if __name__ == '__main__':
    startTotal = timer()

    logger = functions.get_logger(True, 'elk')
    e = Elastic("127.0.0.1", logger)
    e.update_dangerous_files()

    endTotal = timer()
    logger.debug('Total time: {} sec'.format(
        endTotal - startTotal))  # Time in seconds, e.g. 5.38
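The fragment in this example starts inside the scroll loop, so the initial request that seeds `sid`, `scroll_size` and `list_median` is not shown. A minimal sketch of that setup, with a hypothetical index name and query:

        # Hypothetical start of update_dangerous_files(); index and query are placeholders.
        list_median = []
        response = self._es.search(index='downloads', scroll='2m', size=1000,
                                   body={'query': {'match_all': {}}})
        sid = response['_scroll_id']
        scroll_size = len(response['hits']['hits'])
        list_median = self.process_hits_update_dangerous_files(
            response['hits']['hits'], list_median)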
Example no. 6
def main():
    logger = get_logger(LOGDIR)
    logger.info('start')

    logger.info('1. Load WikiQA text')
    wikiqa_text = load_wikiqa()
    min_w = min([len(i.split()) for i in wikiqa_text])
    max_w = max([len(i.split()) for i in wikiqa_text])
    logger.info('{0} sentences, {1}-{2} words'.format(len(wikiqa_text), min_w, max_w))

    logger.info('2. Load GloVe embeddings.')
    embed_matrix, vocab = load_glove_vectors(GLOVE_MODEL, d=GLOVE_SIZE)
    logger.info('embedding shape is {}'.format(embed_matrix.shape))

    logger.info('3. Prepare the corpus.')
    corpus = ReutersMuscleCorpus()
    corpus.build(embed_matrix, vocab, seq_size)
    corpus.documents = wikiqa_text
    corpus.save(WIKIQA_CORPUS)

    logger.info('4. Make autoencoder model.')
    ae = AutoEncoder(seq_size=seq_size, embed_size=embed_matrix.shape[1], latent_size=latent_size)
    ae.build()

    logger.info('5. Train model.')
    ae.model.compile(optimizer="adam", loss="mse")
    train_iter = corpus.batch_iter(batch_size)
    train_step = corpus.get_step_count(batch_size)

    ae.model.fit_generator(
        train_iter,
        train_step,
        epochs=n_epoch,
        # validation_data=train_iter,
        # validation_steps=train_step,
        callbacks=[
            TensorBoard(log_dir=LOGDIR),
            ModelCheckpoint(filepath=WIKIQA_MODEL, save_best_only=True)
        ]
    )

    logger.info('6. Load the encoder.')
    encoder = ae.get_encoder()

    logger.info('7. Set the infer model.')
    infer = Infer(encoder, corpus)

    logger.info('8. Evaluate the model.')
    qa_df = pd.read_csv(WIKIQA_DIR + '/WikiQA-test.tsv', sep='\t')
    maps = []
    mrrs = []
    for q_id in qa_df['QuestionID'].unique():
        df = qa_df[qa_df['QuestionID'] == q_id]
        if 1 not in df['Label'].unique():
            logger.debug('{0}: no correct answer'.format(q_id))
            continue
        q_doc = df['Question'].iloc[0].lower()
        q_vec = infer(q_doc)
        a_docs = df['Sentence'].map(lambda x: x.lower()).tolist()
        a_vecs = [infer(d) for d in a_docs]
        sort_i, sim = get_sim_index([q_vec], a_vecs)
        labels = [i for i, v in enumerate(df['Label']) if v == 1]
        rank = [i + 1 for i, v in enumerate(sort_i) if v in labels]
        _mrr = 1 / rank[0]
        _map = sum([1 / i for i in rank]) / len(rank)
        maps.append(_map)
        mrrs.append(_mrr)
        logger.info('{0}: MAP {1}, MRR {2}'.format(q_id, _map, _mrr))
    map_avg = sum(maps) / len(maps)
    mrr_avg = sum(mrrs) / len(mrrs)
    logger.info('MAP AVG {0} / MRR AVG {1}'.format(map_avg, mrr_avg))

    logger.info('end')
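`load_glove_vectors` is not shown. A minimal sketch, assuming the standard plain-text GloVe format where each line is a word followed by `d` floats:

import numpy as np

def load_glove_vectors(path, d=300):
    # Hypothetical loader: returns (embedding matrix, word -> row index map).
    vocab, rows = {}, []
    with open(path, encoding='utf-8') as f:
        for i, line in enumerate(f):
            parts = line.rstrip().split(' ')
            word, values = ' '.join(parts[:-d]), parts[-d:]
            vocab[word] = i
            rows.append(np.asarray(values, dtype=np.float32))
    return np.vstack(rows), vocab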
Example no. 7
    end_sample_date = user_input_config.end_sample_date

    # Decide which configuration
    with open(config_file, 'r') as stream:
        config = yaml.safe_load(stream)
    # pprint.pprint(config)
    tz = config.get('tz')
    datetime_standard_format = config.get('datetime_standard_format')
    analysis_type = config.get('analysis_type')

    spark_config = config.get('SPARK', "")
    mssql_config = config.get('MSSQL', "")
    filter_config = config.get('FILTER', "")
    chronic_config = config.get('CHRONIC', "")
    logger = get_logger(tz=tz,
                        identifier=str(fab) + analysis_type,
                        debug=verbose)

    logger.info("=" * 40 + " script starts " + "=" * 40)
    logger.info("Analysis type is " + analysis_type)

    if 'rda' in analysis_type:
        area_list = filter_config.get('area')
        folder_list = filter_config.get('folder')
        query_ooc_only_flag = filter_config.get('query_ooc_only_flag')
        vio_type_list_csv = filter_config.get('vio_type_list_csv')
        cutoff_hour = int(filter_config.get('cutoff_hour'))
        query_interval_in_seconds = int(
            filter_config.get('query_interval_in_seconds'))
        buffer_seconds = int(filter_config.get('buffer_seconds'))
        latest_ooc_min_count = int(filter_config.get('latest_ooc_min_count'))
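`get_logger` is only called here, not defined. A minimal sketch, assuming it builds a console logger whose timestamps use the configured timezone (the pytz dependency and formatting details are assumptions):

import logging
from datetime import datetime
import pytz

def get_logger(tz='UTC', identifier='script', debug=False):
    # Hypothetical factory: named console logger with timezone-aware timestamps.
    logger = logging.getLogger(identifier)
    logger.setLevel(logging.DEBUG if debug else logging.INFO)
    handler = logging.StreamHandler()
    formatter = logging.Formatter('%(asctime)s %(name)s %(levelname)s %(message)s')
    formatter.converter = lambda ts: datetime.fromtimestamp(ts, pytz.timezone(tz)).timetuple()
    handler.setFormatter(formatter)
    logger.addHandler(handler)
    return logger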
Example no. 8
                           action='store_true',
                           help='Verbose flag (boolean).',
                           default=False)

    # this could also be set on the same line
    my_parser.set_defaults(ip=config['DEFAULTS']['ELASTIC_IP'])
    my_parser.set_defaults(index=config['DEFAULTS']['ELASTIC_INDEX'])
    # myParser.print_help()
    return my_parser.parse_args()


if __name__ == '__main__':
    startTotal = timer()

    arg = create_arg()
    logger = functions.get_logger(arg.verbose, 'elk')

    e = Elastic(arg.ip, logger)

    if arg.mapping is not None:
        e.create_mapping(arg.index, arg.mapping)

    if arg.update:
        # Step 1: create the download JSON files and get the hash of each one
        e.create_json_downloads_pending(
            just_download=True)  # create JSON for wget/curl downloads that do not yet exist
        e.create_json_downloads_pending(
            just_download=False)  # create JSON for wget/curl downloads that do not yet exist

        # Step 2: get the danger level of each hash
        e.update_dangerous_files()
Example no. 9
    parser.add_argument('--config', action="store", dest='config', required=True, help='configuration file')
    parser.add_argument('--debug', action="store_true", dest='debug', required=False, default=False,
                        help='disable or enable debug mode in logging. ')

    user_input_config = parser.parse_args()
    fab = user_input_config.fab
    config_file = user_input_config.config
    verbose = user_input_config.debug


    # Decide which configuration
    with open(config_file, 'r') as stream:
        config = yaml.safe_load(stream)
    # pprint.pprint(config)
    tz = config.get('tz')
    logger = get_logger(tz, debug=verbose)
    logger.info("####################"*4)
    logger.info("=" * 40 + " script starts " + "=" * 40)
    datetime_standard_format = config.get('datetime_standard_format')

    mssql_config = config.get('MSSQL', "")
    teradata_config = config.get('TERADATA', "")
    teradata_server = teradata_config.get('server')
    teradata_user = teradata_config.get('user')
    teradata_password = base64.b64decode(teradata_config.get('password'))
    td = TeradataUtil(server=teradata_server, user=teradata_user, password=teradata_password)
    mssql_server = mssql_config.get('server')
    mssql_user = mssql_config.get('user')
    mssql_password = base64.b64decode(mssql_config.get('password'))
    mssql_database = mssql_config.get('database')
    mssql_port = mssql_config.get('port')
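The MSSQL values read at the end are presumably used to open a connection further down in the original script. A minimal sketch under that assumption, using pymssql (the actual script may use a different driver or wrapper):

import pymssql

# Hypothetical continuation: open the MSSQL connection from the config values.
conn = pymssql.connect(server=mssql_server, user=mssql_user,
                       password=mssql_password.decode(),  # b64decode returned bytes
                       database=mssql_database, port=str(mssql_port))
cursor = conn.cursor()
cursor.execute('SELECT 1')  # placeholder connectivity check
logger.info('MSSQL connection OK: {}'.format(cursor.fetchone()))
conn.close()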