import os

import helper.utils as UTIL


def main(args):
    ################ CONFIGURATIONS #################
    source_file = os.path.join(args.data_path, args.from_file_name)
    # base name of the source file, without directory or extension
    source_root_file_name = args.from_file_name.rpartition(os.path.sep)[-1].rpartition('.')[0]
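    # For illustration (hypothetical values): with data_path='data' and
    # from_file_name='dev-v1.1.json', source_file is 'data/dev-v1.1.json' and
    # source_root_file_name is 'dev-v1.1'.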
    ################ ALGOS #################

    """
    ******************************************************************************************************************
    START: PARSING FILE
    ******************************************************************************************************************
    """
    tokenized_questions, tokenized_paragraphs, questions_nontokenized, paragraphs_nontokenized = UTIL.prepare_squad_objects(source_file, source_root_file_name)
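    # prepare_squad_objects is a project helper (not shown here); presumably it parses the
    # SQuAD-style JSON file and returns both tokenized and raw question/paragraph lists.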

    """
    ******************************************************************************************************************
    END: PARSING FILE
    ******************************************************************************************************************
    """

    """
    ******************************************************************************************************************
    START: SLIDING WINDOW
    ******************************************************************************************************************
    """
    get_slideed_tokenizations_and_dump(tokenized_questions, UTIL.create_dir(os.path.join(args.data_path, 'questions_windowed')), args.truncate_length, args.window_length)

    get_slideed_tokenizations_and_dump(tokenized_paragraphs, UTIL.create_dir(os.path.join(args.data_path, 'paragraphs_windowed')), args.truncate_length, args.window_length)


    """

# Example #2
def main(args):
    path = UTIL.create_dir(os.path.join(args.embedding_path, 'splitted_train_test'))
    test_question_embeddings, test_paragraph_embeddings, test_labels = load_data(args.embedding_path, args.label_path, 'test')
    dump_splitted_train_test(test_question_embeddings, test_paragraph_embeddings, test_labels, 'test', path, args.partition_size)
    print('Test data is ready')
    train_question_embeddings, train_paragraph_embeddings, train_labels = load_data(args.embedding_path, args.label_path, 'train')
    dump_splitted_train_test(train_question_embeddings, train_paragraph_embeddings, train_labels, 'train', path, args.partition_size)
    print('Train data is ready')
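    # load_data and dump_splitted_train_test are project helpers (not shown here);
    # presumably they load the precomputed question/paragraph embeddings with their labels
    # and write them back out split into partitions of partition_size.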
    "batch_question": 250,
    "batch_paragraph": 20,
}

resource = titanX

is_dump_during_execution = False
is_inject_idf = True
is_filtered_by_answers_from_rnet = False

################ CONFIGURATIONS #################

_basepath = os.path.abspath(__file__).rpartition(os.sep)[0]
datadir = os.path.join(_basepath, dataset_type)

paragraphs_dir = UTIL.create_dir(os.path.join(datadir, 'ELMO', 'paragraphs'))
questions_dir = UTIL.create_dir(os.path.join(datadir, 'ELMO', 'questions'))

_paragraphs_file_name = '{}_paragraphs.txt'
paragraphs_file = os.path.join(paragraphs_dir, _paragraphs_file_name)

_questions_file_name = '{}_questions.txt'
questions_file = os.path.join(questions_dir, _questions_file_name)

_mapping_file_name = '{}_q_to_p_mappings.csv'
mapping_file = os.path.join(questions_dir, _mapping_file_name)

_paragraph_embeddings_file_name = '{}[email protected]'.format(dataset_type)
paragraph_embedding_file = os.path.join(paragraphs_dir, _paragraph_embeddings_file_name)
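
# For illustration (hypothetical value): with dataset_type = 'dev', paragraphs_file.format(dataset_type)
# resolves to '<datadir>/ELMO/paragraphs/dev_paragraphs.txt' and questions_file.format(dataset_type)
# to '<datadir>/ELMO/questions/dev_questions.txt'.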


import pandas as pd
from sagemaker import KMeans
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler


def cluster_helper(role, sagemaker_session, bucket, local_data_folder, prefix, ticker):
  A_df = pd.read_pickle(local_data_folder + ticker + '.pkl')
  A_df.dropna(inplace=True)
  A_df.drop(columns=["Date"], inplace=True)

  # Normalize
  scaler = MinMaxScaler()

  Y_df = pd.DataFrame(A_df["Label"]).astype('float64')
  X_df = A_df.drop(columns=["Label"]).astype('float64')

  X = scaler.fit_transform(X_df)
  Y = scaler.fit_transform(Y_df)

  # split data
  print("Splitting data")
  x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=.33, random_state=1, shuffle=True)

  # clustering
  s3_output_folder = "s3://{}/{}/output".format(bucket, prefix)
  print("Clustering")
  kmeans = KMeans(role=role,
                  sagemaker_session=sagemaker_session,
                  train_instance_count=1,
                  train_instance_type="ml.m4.xlarge",
                  output_path=s3_output_folder,
                  k=3)

  # record_set uploads the array in protobuf recordIO format and returns a RecordSet for
  # the built-in SageMaker KMeans algorithm
  kmeans.fit(kmeans.record_set(x_train.astype('float32')))

  # deploy
  print("Deploying model", kmeans.model_data)
  kmeans_predictor = kmeans.deploy(initial_instance_count=1, instance_type="ml.m4.xlarge")


  create_dir('{}s3/{}'.format(local_data_folder, ticker))

  # upload train and test data to S3
  dataset_with_cluster = pd.concat([pd.DataFrame(y_train, columns=["label"]).astype("float32"),
                                    pd.DataFrame(x_train).astype("float32"),
                                    clustering(x_train, kmeans_predictor)],
                                   axis=1)
  dataset_with_cluster.to_csv('{}s3/{}/all-train.csv'.format(local_data_folder, ticker), header=False, index=False)
  # prepare cluster data sets
  create_dir('{}s3/{}/train'.format(local_data_folder, ticker))
  save_data(dataset_with_cluster[dataset_with_cluster["cat"] == 0], "{}/train/cluster-0".format(ticker), True, local_data_folder)
  save_data(dataset_with_cluster[dataset_with_cluster["cat"] == 1], "{}/train/cluster-1".format(ticker), True, local_data_folder)
  save_data(dataset_with_cluster[dataset_with_cluster["cat"] == 2], "{}/train/cluster-2".format(ticker), True, local_data_folder)
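  # save_data is a project helper (not shown here); presumably it writes each per-cluster
  # frame to CSV under local_data_folder and uploads it to the S3 prefix.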

  # Predict clusters for the test set as well, so the assignments can be used when testing the next model
  dataset_with_cluster = pd.concat([pd.DataFrame(y_test, columns=["label"]).astype("float32"),
                                    pd.DataFrame(x_test).astype("float32"),
                                    clustering(x_test, kmeans_predictor)],
                                   axis=1)
  dataset_with_cluster.to_csv('{}s3/{}/all-test.csv'.format(local_data_folder, ticker), header=False, index=False)
  # # prepare cluster data sets
  # create_dir('{}s3/{}/test'.format(local_data_folder, ticker))
  # save_data(dataset_with_cluster[dataset_with_cluster["cat"] == 0], "{}/test/cluster-0".format(ticker), False, local_data_folder)
  # save_data(dataset_with_cluster[dataset_with_cluster["cat"] == 1], "{}/test/cluster-1".format(ticker), False, local_data_folder)
  # save_data(dataset_with_cluster[dataset_with_cluster["cat"] == 2], "{}/test/cluster-2".format(ticker), False, local_data_folder)

  # delete endpoint
  kmeans_predictor.delete_endpoint()

  print('Completed clustering for', ticker)
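
# The clustering() helper used above is not included in this snippet. A minimal sketch of
# what it presumably does, assuming a deployed SageMaker KMeansPredictor whose prediction
# records carry a 'closest_cluster' label (the function name and return format here are
# assumptions, not the original code):
def clustering_sketch(x, predictor):
  records = predictor.predict(x.astype('float32'))
  clusters = [r.label['closest_cluster'].float32_tensor.values[0] for r in records]
  return pd.DataFrame(clusters, columns=['cat']).astype('float32')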

# Example #5
import os
import sys

sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
import helper.utils as UTIL
TRAIN = 'train'
DEV = 'dev'

################ CONFIGURATIONS #################
dataset_type = DEV
is_dump_during_execution = False
is_inject_idf = False
is_tf_style = False
################ CONFIGURATIONS #################

_basepath = os.path.abspath(__file__).rpartition(os.sep)[0]
datadir = os.path.join(_basepath, dataset_type)

pre_trained_dir = UTIL.create_dir(os.path.join(_basepath, 'GLOVE', 'data'))
paragraphs_dir = UTIL.create_dir(os.path.join(datadir, 'GLOVE', 'paragraphs'))
questions_dir = UTIL.create_dir(os.path.join(datadir, 'GLOVE', 'questions'))

_paragraphs_file_name = '{}_paragraphs.txt'
paragraphs_file = os.path.join(paragraphs_dir, _paragraphs_file_name)

_questions_file_name = '{}_questions.txt'
questions_file = os.path.join(questions_dir, _questions_file_name)

_mapping_file_name = '{}_q_to_p_mappings.csv'
mapping_file = os.path.join(questions_dir, _mapping_file_name)

_paragraph_embeddings_file_name = '{}[email protected]'.format(dataset_type)
paragraph_embedding_file = os.path.join(paragraphs_dir, _paragraph_embeddings_file_name)
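
# For illustration: with dataset_type = DEV ('dev'), questions_file.format(dataset_type)
# resolves to '<datadir>/GLOVE/questions/dev_questions.txt' and mapping_file to
# '<datadir>/GLOVE/questions/dev_q_to_p_mappings.csv'.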