Code example #1
    def train_models(self):
        MODELS_TO_TRAIN = [self.model_1_pre_process, self.model_2_pre_process]

        optimal_thetas = [[] for _ in xrange(len(MODELS_TO_TRAIN))]
        for i, model_preprocessor in enumerate(MODELS_TO_TRAIN):
            processor = DataProcessor()
            processor.load_input()
            processor.load_output()

            # add intercept term
            processor.input = np.insert(processor.input, 0, 1, 1)
            processor.input = processor.input.astype('float64')
            processor.input = model_preprocessor(processor.input)

            n = processor.input.shape[1] - 1

            theta_init = np.zeros(n + 1)
            theta_init = np.matrix(theta_init)
            theta_init = theta_init.transpose()
            processor.split_to_training_test(
                self.TRAINING_TEST_DATA_SPLIT_RATIO)

            X = processor.training_input
            for j in xrange(processor.num_labels):

                y = (processor.training_output == j)
                y = y.astype(int)

                model = LogisticRegression(theta_init, X, y)
                optimizer = GradientDescent(model)

                theta_optimal = optimizer.find_min()
                theta_optimal = theta_optimal.transpose()

                optimal_thetas[i].append(theta_optimal.tolist()[0])

        # theta indices i,j specify ith model and jth classification type
        with open(self.OUTPUT_FILE, "w") as f:
            for i, thetas in enumerate(optimal_thetas):
                for j, theta in enumerate(thetas):
                    theta_str = ','.join(map(str, theta))
                    f.write("theta_%d_%d=%s\n" % (i, j, theta_str))
Code example #2
    def __init__(self,
                 start_date,
                 stop_date,
                 file_path,
                 c_logger=None,
                 data_processor=None):
        """
        Init method of 'JsonReportGenerator' class.
        :param start_date: The start date.
        :param stop_date: The end data.
        :param file_path: Path of the generated file.
        """

        self.start_date = start_date.replace(" ", "")
        self.stop_date = stop_date.replace(" ", "")
        self.file_path = file_path
        self.c_logger = c_logger if c_logger else self.__set_up_default_logger(
        )
        self.data_processor = (data_processor if data_processor else
                               DataProcessor(c_logger=self.c_logger))
Code example #3
def TestVocabMapping():
    dataFile = "./dataset/samples/qa-dump-1460090355004_new.json"
    wordToIdFile = "./wordToId.json"
    idToWordFile = "./idToWord.json"
    dataProvider = DataProcessor(dataFile)
    dataProvider.BuildVocab()
    dataProvider.SaveVocab(wordToIdFile, idToWordFile)

    dataProvider.LoadVocab(wordToIdFile, idToWordFile)
    dataProvider.TranslateWordToIdPerArticle()
    data = dataProvider.data
    for title in data.keys():
        article = data[title]
        sentencesInId = article["textInSentencesInId"]
        sentencesInWordsFromId = dataProvider.TranslateIdToWord(sentencesInId)
        sentencesInWords = SentenceToWord(article["textInSentences"])
        for s0, s1 in zip(sentencesInWords, sentencesInWordsFromId):
            assert len(s0) == len(s1)
            for w0, w1 in zip(s0, s1):
                assert w0 == w1
    print "Vocab Mapping test passed!"
Code example #4
File: main.py Project: oleg-kazbeev/etl-test-task
def main():
    terminal_command = sys.argv[1:]

    terminal_parser = TerminalParser()
    terminal_parser.add_argument('-i', '--input', default=[], nargs='+')
    terminal_parser.add_argument('-o', '--output', default=[], nargs='+')

    input_files = terminal_parser.get_list_of_input_files(terminal_command)
    output_files = terminal_parser.get_list_of_output_files(terminal_command)

    data_processor = DataProcessor(input_files)
    file_with_min_col = data_processor.get_file_with_min_amount_of_columns()
    columns_of_result_file = data_processor.get_sorted_columns_of_result_file()

    data_composer = DataComposer(input_files, output_files)
    data_composer.record_first_file_content_into_basic_result_file(
        file_with_min_col, columns_of_result_file)
    data_composer.record_leftovers_files_into_basic_result(
        columns_of_result_file)
    data_composer.sort_basic_results_file_content()
    data_composer.record_advanced_results_based_on_basic()
Code example #5
    def __init__(self, config):
        self.config = config
        self.sess_model_list = []
        self.graph_list = []
        self.signature_def_list = []
        self._read_sessions(self.config.predict.model_dirs,
                            self.config.predict.model_tag)
        if self.config.predict.cascade_model_dirs and self.config.predict.use_cascade_model:
            self._read_sessions(self.config.predict.cascade_model_dirs,
                                self.config.predict.model_tag)

        self.model_weights = []
        for model_weight in self.config.predict.model_weights:
            self.model_weights.append(float(model_weight))
        assert len(self.model_weights) == len(self.sess_model_list)

        self.data_processor = DataProcessor(config)
        self.data_processor.load_all_dict()

        self.feature_debug_file = codecs.open("feature_debug.txt",
                                              "w",
                                              encoding=util.CHARSET)
Code example #6
File: train.py Project: aguai23/medical_qa
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)

    knowledge_tree = KnowledgeTree(FLAGS.graph_path)
    data_processor = DataProcessor(FLAGS.data_path, knowledge_tree,
                                   FLAGS.max_sequence, FLAGS.max_entity)
    question_feature, entity_feature, labels = data_processor.get_training_samples(
    )

    train_numbers = len(question_feature)
    training_steps = int(train_numbers / FLAGS.train_batch_size *
                         FLAGS.train_epoch)

    input_fn = input_fn_builder(question_feature, entity_feature, labels)

    valid_question, valid_entity, valid_label = data_processor.get_valid_samples(
    )
    valid_numbers = len(valid_question)
    valid_steps = int(valid_numbers / FLAGS.train_batch_size)
    evaluate_fn = input_fn_builder(valid_question, valid_entity, valid_label)

    model_fn = model_fn_builder(hidden_size=256, fc_size=100, num_labels=2)

    config = tf.estimator.RunConfig(save_checkpoints_steps=300,
                                    log_step_count_steps=10,
                                    save_summary_steps=10,
                                    keep_checkpoint_max=10)
    estimator = tf.estimator.Estimator(model_dir=FLAGS.output_dir,
                                       model_fn=model_fn,
                                       config=config)

    train_spec = tf.estimator.TrainSpec(input_fn=input_fn,
                                        max_steps=training_steps)

    eval_spec = tf.estimator.EvalSpec(input_fn=evaluate_fn,
                                      steps=valid_steps,
                                      throttle_secs=10)

    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
Code example #7
File: model.py Project: Billy-Liu-12/NBTNGMA4ED-1
def train():
    # dataset parameters
    batch_size = 20
    # sentence length
    step_num = 40
    data_processor = DataProcessor()
    train_data, test_data = data_processor.load_dataset(batch_size, step_num)
    # model parameters
    # word embedding
    word_embed_dim = 100
    # vocab size
    n_words = data_processor.n_words
    # tags num
    num_tag = data_processor.num_tags
    types_embed_dim = 20
    subtypes_embed_dim = 20
    embed_path = "./data/100.utf8"
    # define the model
    initial_embed = data_processor.load_word2vec(embed_path, 100)
    model = Model(n_words, num_tag, initial_embed=initial_embed)
    # define the optimizer
    opti = tf.keras.optimizers.Adam(learning_rate=0.0001)
    start = None
    for epoch in range(5):
        for idx, batch in enumerate(train_data):
            if start is None:
                start = time.time()
            loss_sum, loss_mean = train_step(opti, batch, model)
            if (idx + 1) % 100 == 0:
                ends = time.time()
                cost = ends - start
                start = time.time()
                weights = model.get_weights()
                with open("./model/model.pkl", "wb") as fw:
                    pickle.dump(weights, fw)
                print(idx + 1, "---->", loss_mean.numpy(), "---> time cost: ",
                      cost)
                test_when_train(data_processor, model, test_data)
Code example #8
	def _write(self, responces, write_clients):

		for client in write_clients:
			if client in responces:
				try:
					responce = responces[client]
					if responce != '':
						data_processor = DataProcessor()

						p_data = data_processor.make_parsed_data(responce)
						msg = ''
						msg = data_processor.form_message(p_data)
						if msg != '':
							client.send(msg)
						else:
							err = "ERROR: wrong client config or class description format"
					else:
						client.close()
						self._clients.remove(client)
				except:
					print('client %s %s disconnected.' % (client.fileno(), client.getpeername()))
					client.close()
					self._clients.remove(client)
Code example #9
 def test_form_message(self):
     test_data_processor = DataProcessor()
     test_parsed_data = {
         "Class": {
             "Name":
             "User",
             "Initialization": [{}],
             "Methods": [{
                 "Method": "get_apples",
                 "Attributes": ["apples"]
             }, {
                 "Method": "give_apples",
                 "Attributes": ["apples"]
             }],
             "Attributes": ["apples"]
         }
     }
     test_message = {
         "Class":
         "class User(object):\n",
         "Init":
         "	def __init__(self):\n		self._apples = None\n\n",
         "Methods": [{
             "Method": "	def get_apples(self, apples):\n		pass\n\n"
         }, {
             "Method": "	def give_apples(self, apples):\n		pass\n\n"
         }, {
             "Method":
             "	def set_apples(self, apples):\n		self._apples = apples\n\n"
         }, {
             "Method":
             "	def get_apples(self):\n		return self._apples\n\n"
         }]
     }
     test_message = json.dumps(test_message)
     self.assertEqual(test_data_processor.form_message(test_parsed_data),
                      test_message.encode('utf-8'))
Code example #10
    def __init__(
        self,
        main_window,
        c_logger=None,
        data_processor=None,
        graph_settings=None,
        graph_settings_file_path=None,
    ):
        """
        Init method of the 'MainWindow' class.
        :param main_window: Instance of the main Tk window.
        :param graph_settings: Instance of the graph settings parser.
        :param c_logger: Logger instance (ColoredLogger type is recommended).
                         Default is MAIN_LOGGER (Global variable.)
        :param data_processor: Instance of DataProcessor module.
        :param graph_settings_file_path: Path of the used graph settings config file.
        """

        self.c_logger = c_logger if c_logger else self.__set_up_default_logger(
        )
        self.main_window = main_window
        self.c_logger.info("Get main window: {}".format(main_window))

        self.c_logger.info("Creating DataProcessor instance.")
        self.data_processor = (data_processor if data_processor else
                               DataProcessor(c_logger=self.c_logger))
        self.c_logger.info("DataProcessor instance successfully created.")

        self.graph_settings_config_parser = graph_settings
        self.graph_settings_file_path = graph_settings_file_path

        self.graph_settings_top_level_window = None

        self.__create_new_record_gui_section()
        self.__create_visualisation_gui_section()

        self.__start_visualisation()
Code example #11
 def test_parse_attributes(self):
     data = {
         "Attribute keywords": ["have ", "must have ", "has "],
         "Method keywords": ["can ", "should "],
         "Initialization keywords": {
             "Attribute keywords":
             ["is initializing by setting ", "by default get "],
             "Attribute values keywords": [" as ", " equal to ", " = "]
         }
     }
     description_config = DescriptionConfig(data)
     test_data_processor = DataProcessor()
     test_line1 = "Client have apples"
     test_line2 = "have apples, oranges and bananas"
     test_output1 = ["apples"]
     test_output2 = ["apples", "oranges", "bananas"]
     self.assertEqual(
         test_data_processor.parse_attributes(test_line1,
                                              description_config),
         test_output1)
     self.assertEqual(
         test_data_processor.parse_attributes(test_line2,
                                              description_config),
         test_output2)
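For orientation only, a simplified keyword-based parser that satisfies the two assertions above could look like the sketch below; the project's actual DataProcessor.parse_attributes and DescriptionConfig may behave differently.

import re

def parse_attributes_sketch(line, attribute_keywords):
    # Find the first attribute keyword, then split what follows on "," and " and ".
    for keyword in attribute_keywords:
        idx = line.find(keyword)
        if idx != -1:
            rest = line[idx + len(keyword):]
            parts = re.split(r",| and ", rest)
            return [p.strip() for p in parts if p.strip()]
    return []

assert parse_attributes_sketch("Client have apples",
                               ["have ", "must have ", "has "]) == ["apples"]
assert parse_attributes_sketch("have apples, oranges and bananas",
                               ["have ", "must have ", "has "]) == ["apples", "oranges", "bananas"]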
Code example #12
    def __init__(
        self, main_window, c_logger=None, data_processor=None,
    ):
        """
        Init method of the 'MetricsTab' class.
        :param main_window: Instance of the main Tk window.
        :param c_logger: Logger instance (ColoredLogger type is recommended).
                         Default is MAIN_LOGGER (Global variable.)
        :param data_processor: Instance of DataProcessor module.
        """

        super(MetricsTab, self).__init__()

        self.c_logger = c_logger if c_logger else self.__set_up_default_logger()
        self.main_window = main_window
        self.c_logger.info("Get main window: {}".format(main_window))
        self.c_logger.info("Creating DataProcessor instance.")
        self.data_processor = (
            data_processor if data_processor else DataProcessor(c_logger=self.c_logger)
        )
        self.c_logger.info("DataProcessor instance successfully created.")

        self.__generate_complete_gui()
        self.date_range = None
Code example #13
    def generate_imgs(self, df, scales_path):
        global dict_images
        # Group share
        img_path = DataProcessor().get_group_share(df, df, fig_num)
        table_path = DataProcessor().get_group_share_table(df, df, fig_num)
        dict_images['group_share_chart'] = img_path
        dict_images['group_share_table'] = table_path

        # Client share
        img_path = DataProcessor().get_client_share(df, df, fig_num)
        table_path = DataProcessor().get_client_share_table(df, df, fig_num)
        dict_images['client_share_chart'] = img_path
        dict_images['client_share_table'] = table_path

        # Supplier share
        img_path = DataProcessor().get_supplier_share(
            df, df, fig_num)  #TODO change to supplier share
        table_path = DataProcessor().get_supplier_share_table(
            df, df, fig_num)  #TODO change to supplier table
        dict_images['supplier_share_chart'] = img_path
        dict_images['supplier_share_table'] = table_path
Code example #14
File: main.py Project: codebox/planetary-systems
import json
from data_source import DataSource
from data_processor import DataProcessor
from svg import Svg
from svg_wrapper import SvgWrapper
from field_names import *
from config import config

planet_data = DataSource().get()

star_data, maxima = DataProcessor(planet_data, config['star_count'],
                                  config['sort_order']).get_star_data()

svg_wrapper = SvgWrapper(Svg(), maxima)

# Save data used to generate SVG - for debugging purposes
if config['dump_data']:
    with open(config['dump_data_file'], 'w') as f:
        f.write(json.dumps(star_data, indent=4))

for star in star_data:
    svg_wrapper.add_star(star)

out_file = config['out_file']
svg_wrapper.save(out_file)
print('Render complete:', out_file)
Code example #15
 def run(self):
     """Run the Scrape TMT articles script"""
     behaviour_df, behaviour_matrix, df_articles, df_users = DataProcessor(
     ).generate_reading_behaviour_matrix()
     ModelFitter(behaviour_df, behaviour_matrix, df_articles,
                 df_users).fit_model()
Code example #16
def create_data_processor(measurement, options):
    """Factory function that can be used to switch to a specialized class
    depending on the options and the measurement(s) to be processed.
    """

    return DataProcessor(measurement, options)
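To illustrate how such a factory could branch, the sketch below returns a specialized subclass when an option flag is set; both the subclass and the 'fast' option are hypothetical and not part of the project.

class FastDataProcessor(DataProcessor):
    """Hypothetical specialized processor, e.g. one that skips expensive validation."""


def create_specialized_data_processor(measurement, options):
    # Dispatch on an assumed 'fast' attribute of the options object.
    if getattr(options, "fast", False):
        return FastDataProcessor(measurement, options)
    return DataProcessor(measurement, options)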
Code example #17
def train(config):

    # training and test configuration are basically the same
    config_test = copy.deepcopy(config)
    config_test.batch_size = 10
    config_test.seq_length = 1

    # process the training corpus (if not done yet) and return the training batches and other info
    train_data = DataProcessor(config.train_file,
                               config.batch_size,
                               config.seq_length,
                               True,
                               '<unk>',
                               history_size=1)
    test_data = DataProcessor(config_test.test_file,
                              config_test.batch_size,
                              config_test.seq_length,
                              False,
                              '<unk>',
                              history_size=1)

    config.vocab_size = train_data.vocab_size
    config_test.vocab_size = train_data.vocab_size

    # save the training configuration for future need
    if not os.path.isdir(config.save_dir):
        os.makedirs(config.save_dir)
    try:
        with open(os.path.join(config.save_dir, 'config.pkl'), 'wb') as f:
            cPickle.dump(config, f)
    except IOError:
        print("ERROR: Could not open and/or write the config file {}".format(
            os.path.join(config.save_dir, 'config.pkl')))

    with tf.Graph().as_default():

        # create the LM graph for training
        with tf.name_scope("Train"):
            with tf.variable_scope("Model", reuse=None):
                model_train = LM(config, True)

        # create the LM graph for testing with shared parameters
        with tf.name_scope("Test"):
            with tf.variable_scope("Model", reuse=True):
                model_test = LM(config_test, False)

        # run  the training/testing
        with tf.Session() as session:

            session.run(tf.global_variables_initializer())

            test_perplexity = model_test.run_model(session,
                                                   test_data,
                                                   eval_op=None,
                                                   verbosity=10000,
                                                   verbose=True)
            print("\n[INFO] Starting perplexity of test set: %.3f" %
                  test_perplexity)
            print('========================\n')

            # model saving manager
            saver = tf.train.Saver(tf.global_variables(), max_to_keep=1)

            # loop over all epochs
            for e in range(config.num_epochs):

                # we reset/define the epoch parameters
                lr_decay = config.decay_rate**max(e + 1 - config.max_epoch,
                                                  0.0)
                session.run(
                    tf.assign(model_train.lr, config.learning_rate * lr_decay))

                print("[INFO] Epoch: %d, Learning rate: %.3f \n" %
                      (e + 1, session.run(model_train.lr)))
                train_perplexity = model_train.run_model(
                    session,
                    train_data,
                    eval_op=model_train.train_op,
                    verbosity=50000,
                    verbose=True)

                test_perplexity = model_test.run_model(session, test_data)
                print(
                    "\n[SUMMARY] Epoch: {} | Train Perplexity: {:.3f} | Test Perplexity: {:.3f} \n"
                    .format(e + 1, train_perplexity, test_perplexity))
                print('========================')

                # save model after each epoch
                model_path = os.path.join(config.save_dir, 'model.ckpt')
                saver.save(session, model_path, global_step=(e + 1))

            # save the final model
            model_path = os.path.join(config.save_dir, 'model.ckpt')
            saver.save(session, model_path)
Code example #18
                output, state = decoder_cell(input_seq, state)
                output = tf.layers.dense(output,
                                         self.vocab_size,
                                         activation=None,
                                         reuse=tf.AUTO_REUSE,
                                         name="to_vector")
                output = tf.argmax(tf.nn.softmax(output, axis=-1), axis=-1)
                # append to output
                outputs.append(output)
                input_seq = tf.nn.embedding_lookup(self.vocab_embedding,
                                                   output)
            outputs = tf.convert_to_tensor(outputs, dtype=tf.float32)
            outputs = tf.transpose(outputs, [1, 0])
        return outputs

    def build_cost(self, outputs):
        # target_label = tf.one_hot(self.answer_label, depth=self.vocab_size, dtype=tf.float32)
        loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=outputs, labels=self.answer_label)
        loss = tf.multiply(self.answer_mask, loss)
        loss = tf.reduce_mean(loss)
        return loss


if __name__ == "__main__":
    data_processor = DataProcessor("./data/QA_data/varicocele/",
                                   "./data/QA_data/varicocele/varicocele.json",
                                   word2vec="./data/word2vec/varicocele")
    seq2seq = Seq2Seq(data_processor.start_token,
                      data_processor.vocab_embedding)
Code example #19
import os
from sqlalchemy.orm import sessionmaker
from sqlalchemy import create_engine
from sqlalchemy.exc import IntegrityError
from fact_models import FactArtistByYear, FactGenreByYear, FactSongByYear
from data_processor import DataProcessor
from config import Session, dataSource

session = Session()

dirname = os.path.dirname(__file__)
ds = DataProcessor(os.path.join(dirname, dataSource))
ds.process()

for i, row in ds.artistsByYear().iterrows():
    record = FactArtistByYear(year=row[0], artist=row[1], titles=row[2])
    try:
        session.add(record)
        session.commit()
    except IntegrityError:
        print('FactArtistByYear Record exists for year {}'.format(row[0]))
        session.rollback()
query = session.query(FactArtistByYear)
print('{} records exist in FactArtistByYear'.format(query.count()))

for i, row in ds.genreByYear().iterrows():
    record = FactGenreByYear(year=row[0], genre=row[1], titles=row[2])
    try:
        session.add(record)
        session.commit()
    except IntegrityError:
Code example #20
 def test_read_file_valid_file(self):
     file_path = "../input/Border_Crossing_Entry_Data.csv"
     dp = DataProcessor(input_file_name=file_path)
     self.assertIsNotNone(dp.data)
Code example #21
File: mlp.py Project: ovek/neural-network-finance
import tensorflow as tf
import prices as price
from data_processor import DataProcessor

start = "2003-01-01"
end = "2018-01-01"

price.get_price('AAPL', start, end)

process = DataProcessor("AAPL.csv", 0.9)
process.gen_test(10)
process.gen_train(10)

X_train = process.X_train / 200
Y_train = process.Y_train / 200

X_test = process.X_test / 200
Y_test = process.Y_test / 200

model = tf.keras.models.Sequential()
model.add(tf.layers.Dense(100, activation=tf.nn.relu))
model.add(tf.layers.Dense(100, activation=tf.nn.relu))
model.add(tf.layers.Dense(1, activation=tf.nn.relu))

model.compile(optimizer="adam", loss="mean_squared_error")

model.fit(X_train, Y_train, epochs=100)
print(model.evaluate(X_test, Y_test))
Code example #22
import os
from db_handler import DBHandler
from flask import Flask, request
from data_processor import DataProcessor
from flaskthreads import AppContextThread

weather_app = Flask(__name__)
env_config = os.getenv("APP_SETTINGS", "config.DevelopmentConfig")
weather_app.config.from_object(env_config)
data_processor = DataProcessor(weather_app)
db_handler = DBHandler.get_instance(weather_app)
weather_app.app_context().push()


@weather_app.route('/', methods=['GET'])
def welcome_to_service():
    return "Welcome to my weather service!"


@weather_app.route('/pre_process', methods=['GET'])
def pre_process():
    thread = AppContextThread(target=data_processor.process_files)
    thread.start()
    return "Pre-processing csv files..."


@weather_app.route('/weather/data', methods=['GET'])
def get_data_by_location():
    data = request.get_json()
    try:
        if 'lon' in data and 'lat' in data:
Code example #23
    with open(output_submit_file, "w") as writer:
        for i, pred in enumerate(predicts_all):
            json_d = {}
            json_d['id'] = i
            json_d['label'] = str(pred)
            writer.write(json.dumps(json_d) + '\n')
    print('inference over')


def parse_arguments(arg):
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name',
                        type=str,
                        default="sse",
                        help='Choose model to train.')
    return parser.parse_args()


if __name__ == '__main__':
    args = parse_arguments(sys.argv[1:])
    config = Config()
    print('Loading word vectors.....')
    data_processor = DataProcessor(config)
    executor = Executor(config)
    print('Starting training.....')
    model_name = args.model_name
    config.model_save_path = f'saveModel/{model_name}.pt'
    # train the model
    train(config, model_name, data_processor, executor)
    # predict results
    inference(config, model_name, data_processor, executor)
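A minimal sketch (not from the project) for reading the submission file written above; it assumes one JSON object per line, exactly as produced by the writer loop.

import json

def load_predictions(path):
    predictions = {}
    with open(path, encoding="utf-8") as f:
        for line in f:
            record = json.loads(line)
            predictions[record["id"]] = record["label"]
    return predictions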
Code example #24
def main(args):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()

    data_processor = DataProcessor("./data/conll04_train.json",
                                   "./data/conll04_dev.json",
                                   "./data/conll04_test.json")
    (train_examples,
     dev_examples), vocabulary = data_processor.get_conll_examples(
         do_training=True)
    logger.info("Example format test")
    logger.info("Orig id: %d" % train_examples[0].orig_id)
    logger.info("Tokens: %s" % (" ".join(train_examples[0].tokens)))
    logger.info("Label: %s" % (" ".join(train_examples[0].label)))

    train_features, train_label = data_processor.convert_example_to_features(
        train_examples, vocabulary)
    dev_features, dev_label = data_processor.convert_example_to_features(
        dev_examples, vocabulary)
    weight_matrix = data_processor.build_lookup_matrix(vocabulary)

    train_data = TensorDataset(train_features, train_label)
    train_dataloader = DataLoader(train_data,
                                  batch_size=args.batch_size,
                                  shuffle=True)
    train_batches = [batch for batch in train_dataloader]

    dev_data = TensorDataset(dev_features, dev_label)
    dev_dataloader = DataLoader(dev_data, batch_size=args.batch_size)

    eval_step = max(1, len(train_batches) // 5)

    if args.with_crf:
        logger.info("Running %s" % "BiLSTM+CRF")
        model = BiLSTM_CRF(weight_matrix, args.hidden_size,
                           data_processor.label_to_id, "<START>", "<STOP>")
    else:
        model = BiLSTM(weight_matrix, args.hidden_size, args.num_of_tags)
        logger.info("Running %s" % "BiLSTM")

    model.to(device)
    if n_gpu > 1 and not args.with_crf:
        model = torch.nn.DataParallel(model)

    optimizer = Adam(model.parameters(), lr=0.01, weight_decay=0.)

    tr_loss = 0
    tr_num_steps = 0
    max_score = 0.0
    start_time = time.time()
    for epoch in range(args.num_train_epochs):
        model.train()
        logger.info("Start epoch #{} (lr = {})...".format(epoch, 0.01))
        for step, batch in enumerate(train_batches):
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_label = batch

            if args.with_crf:
                loss = model.neg_log_likelihood(input_ids, input_label)
            else:
                outputs = model(input_ids)
                loss = loss_fn(outputs, input_label)

            if n_gpu > 1:
                loss = loss.mean()
            tr_loss += loss.item()
            tr_num_steps += 1

            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            if (step + 1) % eval_step == 0:
                logger.info(
                    'Epoch: {}, Step: {} / {}, used_time = {:.2f}s, loss = {:.6f}'
                    .format(epoch, step + 1, len(train_batches),
                            time.time() - start_time, tr_loss / tr_num_steps))
                save_model = False
                if args.do_eval:
                    score = evaluate(args, model, device, dev_label,
                                     dev_dataloader)
                    print("F1 score: %.6f" % score)
                    model.train()
                    if score > max_score:
                        max_score = score
                        save_model = True
                        logger.info("!!! Best dev %s (lr=%s, epoch=%d): %.6f" %
                                    ("F1", str(0.01), epoch, score))
                else:
                    save_model = True
                if save_model:
                    model_to_save = model.module if hasattr(
                        model, 'module') else model
                    output_model_file = os.path.join(args.output_dir,
                                                     "pytorch_model.bin")
                    torch.save(model_to_save.state_dict(), output_model_file)
                    if max_score:
                        with open(
                                os.path.join(args.output_dir,
                                             "eval_results.txt"),
                                "w") as writer:
                            writer.write("Best eval result: F1 = %.4f" %
                                         max_score)
    if args.do_eval:
        [test_examples
         ], _ = data_processor.get_conll_examples(do_training=False)
        test_features, test_label = data_processor.convert_example_to_features(
            test_examples, vocabulary)
        test_data = TensorDataset(test_features, test_label)
        test_dataloader = DataLoader(test_data, batch_size=args.batch_size)

        if args.with_crf:
            model = BiLSTM_CRF(weight_matrix, args.hidden_size,
                               data_processor.label_to_id, "<START>", "<STOP>")
        else:
            model = BiLSTM(weight_matrix, args.hidden_size, args.num_of_tags)

        model.load_state_dict(
            torch.load(os.path.join(args.output_dir, "pytorch_model.bin")))
        model.eval()
        model = model.to(device)

        eval_result_file = os.path.join(args.output_dir, "eval_results.txt")
        if os.path.isfile(eval_result_file):
            with open(eval_result_file) as f:
                line = f.readline()
            logger.info(line)

        test_score = evaluate(args, model, device, test_label, test_dataloader)
        result = "test result: F1 = %.6f" % test_score
        logger.info(result)
Code example #25
            print("----------epoch/epochs: {}/{}----------".format(
                epoch, epochs))
            print("Train Loss: {}, Train Acc: {}".format(
                train_loss, train_acc))
            val_acc = eval(model, loss_func, dev_loader)
            if val_acc >= best_val_acc:
                best_val_acc = val_acc
                best_model_params = copy.deepcopy(model.state_dict())

    model.load_state_dict(best_model_params)
    return model


if __name__ == "__main__":
    config = Config()
    processor = DataProcessor(config.data_path)
    train_examples = processor.get_train_examples(config.candidates_set_size)
    dev_examples = processor.get_dev_examples(config.candidates_set_size)

    train_dataset_tokens = processor.get_dataset_tokens(train_examples)
    dev_dataset_tokens = processor.get_dataset_tokens(dev_examples)

    if not os.path.exists(config.vocab_path) or config.update_vocab:
        processor.create_vocab(train_dataset_tokens, config.vocab_path)

    train_dataset_indices, vocab_size = processor.get_dataset_indices(
        train_dataset_tokens, config.vocab_path, config.vocab_size)
    dev_dataset_indices, _ = processor.get_dataset_indices(
        dev_dataset_tokens, config.vocab_path, config.vocab_size)
    config.vocab_size = vocab_size  # actual vocabulary size
Code example #26
            loss_val += loss.item() * datas.size(0)
            
            # get the index of the highest predicted probability
            preds = torch.argmax(preds, dim=1)
            labels = torch.argmax(labels, dim=1)
            corrects += torch.sum(preds == labels).item()
        train_loss = loss_val / len(train_loader.dataset)
        train_acc = corrects / len(train_loader.dataset)
        if(epoch % 2 == 0):
            print("Train Loss: {}, Train Acc: {}".format(train_loss, train_acc))
            test_acc = test(model, test_loader, loss_func)
            if(best_val_acc < test_acc):
                best_val_acc = test_acc
                best_model_params = copy.deepcopy(model.state_dict())
    model.load_state_dict(best_model_params)
    return model

processor = DataProcessor()
train_datasets, test_datasets = processor.get_datasets(vocab_size=vocab_size, embedding_size=embedding_size, max_len=sentence_max_len)
train_loader = torch.utils.data.DataLoader(train_datasets, batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_datasets, batch_size=batch_size, shuffle=True)

model = BiLSTMModel(embedding_size, hidden_size, num_layers, num_directions, num_classes)
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
loss_func = nn.BCELoss()
model = train(model, train_loader, test_loader, optimizer, loss_func, epochs)


Code example #27
def main(c_logger=None):

    if not c_logger:
        # Set-up the main logger instance.
        path_of_log_file = os.path.join(PATH_OF_FILE_DIR, "..", "..", "logs",
                                        "main_log.log")
        c_logger = ColoredLogger(os.path.basename(__file__),
                                 log_file_path=path_of_log_file)
    if TEST_RUNNING:
        data_processor_instance = DataProcessor(config=TEST_CONFIG_FILE,
                                                c_logger=c_logger)
        graph_config_parser = set_up_graph_settings_config_parser(
            c_logger=c_logger, config_file=TEST_GRAPH_CONFIG_FILE)
        user_info_parser = set_up_user_info_config_parser(
            c_logger=c_logger, config_file=TEST_USER_INFO_CONFIG_FILE)
    else:
        data_processor_instance = DataProcessor(c_logger=c_logger)
        graph_config_parser = set_up_graph_settings_config_parser(
            c_logger=c_logger)
        user_info_parser = set_up_user_info_config_parser(c_logger=c_logger)

    window = tk.Tk()
    window.iconphoto(False, tk.PhotoImage(file=PATH_OF_WINDOW_ICON))
    window.title("Time reporting")

    # change ttk theme to 'clam' to fix issue with downarrow button
    style = ttk.Style()

    style.theme_create(
        "MyStyle",
        parent="alt",
        settings={
            "TNotebook": {
                "configure": {
                    "tabmargins": [2, 5, 2, 0]
                }
            },
            "TNotebook.Tab": {
                "configure": {
                    "padding": [50, 2]
                }
            },
        },
    )

    style.theme_use("MyStyle")
    note = ttk.Notebook(window)

    main_tab = tk.Frame(note)
    report_config_tab = tk.Frame(note)
    user_config_tab = tk.Frame(note)
    metrics_tab = tk.Frame(note)

    note.add(main_tab, text="Main")
    note.add(report_config_tab, text="Report")
    note.add(user_config_tab, text="User Config")
    note.add(metrics_tab, text="Metrics")

    note.pack(expand=True, fill=tk.BOTH)

    main_exit_button = tk.Button(
        window,
        width=30,
        text="EXIT",
        bg="grey60",
        activebackground="red",
        font="Helvetica 12 bold",
        command=lambda: quit_from_app(window),
    )

    main_exit_button.pack(fill=tk.X)

    main_tab_module.MainWindow(
        main_tab,
        c_logger=c_logger,
        data_processor=data_processor_instance,
        graph_settings=graph_config_parser,
        graph_settings_file_path=GRAPH_CONFIG_FILE,
    )

    report_tab_module.ReportConfigTab(report_config_tab,
                                      c_logger=c_logger,
                                      data_processor=data_processor_instance)

    user_tab_module.UserConfigTab(
        user_config_tab,
        c_logger=c_logger,
        user_info_parser=user_info_parser,
        user_info_config_file_path=USER_INFO_CONFIG_FILE,
    )

    metrics_tab_module.MetricsTab(
        metrics_tab,
        c_logger=c_logger,
        data_processor=data_processor_instance,
    )

    window.protocol("WM_DELETE_WINDOW", lambda: quit_from_app(window))

    window.mainloop()
Code example #28
# This file is an interface to use the model to get recommendations.
# Run this file to get recommendations based on a tv show

import os
from data_processor import DataProcessor
import pandas as pd
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import pickle
from sys import stdin
import logging

show_data_processor = DataProcessor()


def scrape_data():
    # delete any csv files that currently exist
    if (os.path.exists("data/tv.csv")):
        os.remove("data/tv.csv")

    if (os.path.exists("logging/metacritic_scraper.log")):
        os.remove("logging/metacritic_scraper.log")

    print('Scraping metacritic')
    # run metacritic scraper
    import metacritic_scraper

    # delete related csv file
    if (os.path.exists("data/tv_shows_with_features.csv")):
        os.remove("data/tv_shows_with_features.csv")
Code example #29
def test_data_processor():
    num_obs = 2000
    data = pd.DataFrame(np.random.randn(num_obs).tolist(), columns=["Return"], index=[fake.date_time_between_dates(
        datetime_start=datetime(2020, 3, 13, 14, 58, 57), datetime_end=datetime(2020, 3, 20, 14, 58, 57), tzinfo=None)
        for x in range(num_obs)])
    # pp(data.Return['2020-03-13 19:55:49.743080':'2020-03-15 13:00:00.866140'])
    
    z = DataProcessor(data)(TimeFreqFilter(TimePeriod.MINUTE, 15))(rolling_mean, col_name="Return", n=5).data
    # pp(z.Return['2020-03-13 19:55:49.743080':'2020-03-15 13:00:00.866140'])
    
    z2 = DataProcessor(data)(TimeFreqFilter(TimePeriod.HOUR, 1))("between_time", '08:30', '16:30')(
        lambda x: x.rename(columns={"Return": "RETURN"})).data
    # pp(z2.head(5))
    # pp(z2.tail(5))
    
    z3 = DataProcessor(data)("between_time", '15:59', '16:30')(TimeFreqFilter(TimePeriod.BUSINESS_DAY))(
        lambda x: x[x.Return > 0.0])
    # pp(z3.head(5))
    # pp(z3.tail(5))
    
    z2 = DataProcessor(data).time_freq(TimePeriod.HOUR, 1). \
        between_time('08:30', '16:30').data
    # pp(z2.Return['2020-03-13 19:55:49.743080':'2020-03-15 13:00:00.866140'])
    
    z2 = DataProcessor(data) \
        (partial(lambda x, y, z: z.loc[x:y], '2020-03-13 08:00', '2020-03-17 08:00')) \
        ("between_time", '08:15', '16:30') \
        (lambda x: x[x.Return > 0.0]) \
        [TimeFreqFilter(TimePeriod.MINUTE, 5, starting=datetime(2017, 6, 1, 8, 15, 0)),
         [DataProcessor.first, np.max, np.min, DataProcessor.last, np.median, np.mean, np.std], "Return"] \
        (lambda x: x.rename(columns={'amax': 'HIGH', 'amin': 'LOW', 'mean': 'MEAN',
                                     'median': 'MEDIAN', 'first': 'OPEN', 'last': 'CLOSE', 'std': 'STD'})).data

    # pp(z2['2020-03-13 12:00':'2020-03-16 13:00'])
    # pp(z2.head(5).HIGH - z2.head(5).LOW)
    # pp(z2.columns.values)
    
    z3 = DataProcessor(data).between_time('11:30', '14:00').shift_to_new_column("L1_LOG_RET", "Return", 1).data
    # pp(z3.tail(5))
    
    z3 = DataProcessor(data).between_time('08:01', '18:30').time_freq(TimePeriod.BUSINESS_DAY).positive_column(
        value_column="Return").data
    # pp(z3.tail(5))
    
    z3 = DataProcessor(data).index('2020-03-13 19:55:49.743080', '2020-03-15 13:00:00.866140'). \
        between_time('08:15', '16:30').positive_column(value_column="Return"). \
        summarize_intervals(TimeFreqFilter(TimePeriod.MINUTE, 5, starting=datetime(2020, 3, 13, 19, 0, 0)),
                            [DataProcessor.first, np.max, np.min, DataProcessor.last, np.median, np.mean, np.std],
                            "Return"). \
        rename_columns(['amax', 'amin', 'mean', 'median', 'first', 'last', 'std'],
                       ['HIGH', 'LOW', 'MEAN', 'MEDIAN', 'OPEN', 'CLOSE', 'STD']).data
    
    # pp(z3.HIGH - z3.LOW)
    # pp(z3.tail(5))
    
    z2 = DataProcessor(data).index('2020-03-13 19:55', '2020-03-15 13:00'). \
        between_time('08:15', '16:30').positive_column(value_column="Return"). \
        summarize_intervals(TimeFreqFilter(TimePeriod.MINUTE, 30, starting=datetime(2020, 3, 14, 8, 0, 0)),
                            [DataProcessor.first, np.max, np.min, DataProcessor.last, np.median, np.mean, np.std],
                            "Return"). \
        rename_columns(['amax', 'amin', 'mean', 'median', 'first', 'last', 'std'],
                       ['HIGH', 'LOW', 'MEAN', 'MEDIAN', 'OPEN', 'CLOSE', 'STD'])(lambda x: x[~np.isnan(x.STD)]).data
    # pp(z2.tail(5))

    z2 = DataProcessor(data) \
        (partial(lambda x, y, z: z.loc[x:y], '2020-03-13 19:55', '2020-03-15 13:00')) \
        ("between_time", '08:15', '16:30') \
        (lambda x: x[x.Return > 0.0]) \
        [TimeFreqFilter(TimePeriod.MINUTE, 30, starting=datetime(2020, 3, 14, 8, 0, 0)),
         [DataProcessor.first, np.max, np.min, DataProcessor.last, np.median, np.mean, np.std], "Return"] \
        (lambda x: x.rename(columns={'amax': 'HIGH', 'amin': 'LOW', 'mean': 'MEAN',
                                     'median': 'MEDIAN', 'first': 'OPEN', 'last': 'CLOSE', 'std': 'STD'})) \
        (partial(duplicate_col, "MEAN", "LogReturn_MEAN")) \
        (partial(duplicate_col, "STD", "LogReturn_STD")) \
        (partial(shift_colname, 'LogReturn_MEAN', -1)) \
        (partial(shift_colname, 'LogReturn_STD', -1)) \
        (lambda x: x[~np.isnan(x.LogReturn_STD) & ~np.isnan(x.STD) & ~np.isnan(x.LogReturn_STD_F1)]).data
Code example #30
from datetime import datetime, timedelta

utils = utilities()

logger = utils.formatLogger("BEGIN ETL PROCESS")

logger.info("BEGINNING ETL PROCESS")

end_of_period = None

if len(sys.argv) > 3:
    end_of_period = datetime.strptime(sys.argv[1], '%Y-%m-%d')
    logger.info("SETTING END OF PERIOD DATE - " + str(end_of_period))
    time.sleep(2)
    raw_file_path = sys.argv[2]
    logger.info("SETTING RAW DATA FILE PATH TO - " + str(raw_file_path))
    time.sleep(2)
    processed_file_path = sys.argv[3]
    logger.info("SETTING PROCESSED DATA FILE PATH TO - " +
                str(processed_file_path))
    time.sleep(2)
else:
    logger.error("ENTER END-OF-PERIOD & DATA FILE PATH")
    exit()

data_processor = DataProcessor(end_of_period, raw_file_path,
                               processed_file_path)

data_processor.process_staging_data()