def inscrawler_to_file():
    print("Starting process...")
    result = threaded_crawler()
    print("Inscrawler Done...")
    total, ignored, processed_result = preprocess(result)
    print("Processed Results...")
    print(str(total) + " total results, " + str(ignored) + " ignored results.")
    generate_folders(processed_result)
    print("Generated Folders for Influencers...")
    f = open("user.txt", "w+")
    f.write(json.dumps(processed_result))
    print("Saved Images to txt file")
    save_images(processed_result)
    print("Downloaded Images...")
    print("Done!")
Example #2
    def __init__(self,
                 epochs: int = 500,
                 lr: float = 0.01,
                 dropout_chance: float = 0.5,
                 hiddensize: int = 32):
        self.epochs = epochs
        self.lr = lr
        self.dropout_chance = dropout_chance
        self.hiddensize = hiddensize

        self.train_x, self.test_x, self.train_y, self.test_y = preprocess()

        # Weights initialized uniformly in [-1, 1): syn0 maps the 4 input
        # features to the hidden layer, syn1 maps the hidden layer to the
        # 3 output units.
        self.syn0 = 2 * np.random.random((self.hiddensize, 4)) - 1
        self.syn1 = 2 * np.random.random((3, self.hiddensize)) - 1
        self.b = 1  # shared bias term

        self.losses = []
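A minimal sketch of a forward pass consistent with those shapes; the sigmoid activation and the method name forward are our assumptions, not shown in the source:

    def sigmoid(self, z):
        return 1.0 / (1.0 + np.exp(-z))

    def forward(self, x):
        # x: vector of 4 input features.
        hidden = self.sigmoid(self.syn0 @ x + self.b)     # shape (hiddensize,)
        return self.sigmoid(self.syn1 @ hidden + self.b)  # shape (3,)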
Example #3
def open_and_extract():
    data = []
    files = [os.path.join(DIRECTORY, file) for file in os.listdir(DIRECTORY)]
    gt_file = pd.read_csv(GROUND_TRUTH_FILE, sep=',')
    for file in files:
        # Derive the participant id from the file name alone; basename is
        # safer than splitting on "/" by hand.
        participant_id = os.path.basename(file).split("_")[0]
        y_train = get_gt_value(gt_file, participant_id)
        if y_train != -1:  # All the participants whose GT is Absent will not be considered.
            x_train = ""
            df = pd.read_csv(file, sep='\t')
            for index, row in df.iterrows():
                if row['speaker'] == PARTICIPANT:
                    value = row['value']
                    x_train = x_train + " " + value
            print("Data found Participant Id:" + participant_id)
            data.append([preprocess(x_train), y_train])
    data_df = pd.DataFrame(data, columns=[Participant_ID, PHQ8_Score])
    data_df.to_csv("Dev-Data", sep=',', index=False)
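The loop relies on get_gt_value returning -1 for absent participants; a hypothetical implementation consistent with the column constants used above:

def get_gt_value(gt_file, participant_id):
    # Hypothetical lookup: return the participant's PHQ8 score,
    # or -1 when the participant is missing from the ground-truth file.
    match = gt_file.loc[gt_file[Participant_ID] == int(participant_id)]
    return match[PHQ8_Score].iloc[0] if not match.empty else -1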
Example #4
import numpy as np
import pandas as pd
from importlib import reload
import data_preprocessing as dp
import prepare_for_model as pfm
import other_functions as of
import explanatory_analysis as ea
import sklearn.linear_model as lr
from sklearn.metrics import log_loss, confusion_matrix
# sklearn.cross_validation was removed in scikit-learn 0.20; model_selection
# provides the same train_test_split.
from sklearn import model_selection as cv

reload(of)
reload(pfm)
reload(ea)
reload(dp)
data = pd.read_csv('train.csv')
dp.preprocess(data, is_prediction=False, path='values_to_keep')
data2 = pfm.prepare(data)
y = data.OutcomeType
timeInc = True
model_col = [c for c in data2.columns
             if 'AnimalType_' not in c
             and (timeInc or ('Year_' not in c and
                              'Month_' not in c and
                              'Day_' not in c and
                              'DayOfWeek_' not in c and
                              'Hour_' not in c))]
## prepare model for dogs ###########################################
# test_size=0.0 keeps every dog row in the training split (the test split is empty)
X_train, X_test, y_train, y_test = cv.train_test_split(
    data2.loc[data2.AnimalType_Dog == 1, model_col],
    y[data2.AnimalType_Dog == 1],
    test_size=0.0, random_state=13)
# test of C parameters
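A minimal sketch of what the C-parameter test could look like with the imports above (the grid values and the use of lr.LogisticRegression are our assumptions):

for C in [0.01, 0.1, 1.0, 10.0]:
    model = lr.LogisticRegression(C=C)
    model.fit(X_train, y_train)
    # log_loss expects class probabilities rather than hard predictions.
    print("C={}: train log loss {:.4f}".format(
        C, log_loss(y_train, model.predict_proba(X_train))))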
Example #5
            adjM[len(movies) + len(directors) + actor_idx, movie_idx] = 1
        if row['actor_2_name'] in actors:
            actor_idx = actors.index(row['actor_2_name'])
            adjM[movie_idx, len(movies) + len(directors) + actor_idx] = 1
            adjM[len(movies) + len(directors) + actor_idx, movie_idx] = 1
        if row['actor_3_name'] in actors:
            actor_idx = actors.index(row['actor_3_name'])
            adjM[movie_idx, len(movies) + len(directors) + actor_idx] = 1
            adjM[len(movies) + len(directors) + actor_idx, movie_idx] = 1

    # print(adjM)
    # print(adjM.shape)
    # import scipy.sparse
    # import os
    # scipy.sparse.save_npz(os.path.join("data/preprocess/IMDB/", "adjM.npz"), scipy.sparse.csr_matrix(adjM))
    # exit()

    metapath_class_num = 3
    metapath_class_list = [[(0, 1, 0), (0, 2, 0)],
                           [(1, 0, 1), (1, 0, 2, 0, 1)],
                           [(2, 0, 2), (2, 0, 1, 0, 2)]]

    savepath = "data/preprocess/IMDB/"
    preprocess(adjM=adjM,
               num_ntypes=metapath_class_num,
               type_mask=type_mask,
               expected_metapaths=metapath_class_list,
               dataoutput=savepath)
    end = time.time()
    print(end - begin)
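The three per-actor blocks above differ only in the column name; a behavior-preserving sketch of the same logic as a loop, assuming the surrounding variables keep their names:

for col in ('actor_1_name', 'actor_2_name', 'actor_3_name'):
    if row[col] in actors:
        actor_idx = actors.index(row[col])
        offset = len(movies) + len(directors)
        # Set both directions so the adjacency matrix stays symmetric.
        adjM[movie_idx, offset + actor_idx] = 1
        adjM[offset + actor_idx, movie_idx] = 1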
Example #6
product_quantity['product_month'] = product_quantity['product_date'].apply(
    lambda x: x[:7])
train_month = product_quantity.groupby(
    ['product_id', 'product_month']).sum()['ciiquantity'].unstack()
# indices of products whose values are completely missing
total_null_index = set(product_info.index) - set(train_month.index)

# compute the similarity matrix
# data_sim = clt.CalculateSim(product_info)  # slow to compute; datasim can be loaded directly into the Variable Explorer instead
datasim = pd.read_csv("datasim.csv", index_col='Index')  # read in directly, used to fill missing values
# missing-value imputation (mean imputation): products with more than 12 months
# of missing values are filled by similarity (topK=30), the rest are filled with 140
train_month = prep.fillnan(train_month, 1, 30, 140, datasim)
# preprocess product_info
product_info = prep.preprocess(product_info)
# compute the average price of each product
product_quantity['price'].replace(to_replace=-1, method='backfill', inplace=True)
product_quantity['price'].replace(to_replace=-1, method='ffill', inplace=True)
product_price = product_quantity.groupby(['product_id']).mean()['price']

# train on multiple IDs at once, using xgboost
result_for_some = pd.DataFrame()
result_for_some, feature_importance = trainf.train_for_some(
    train_month, product_info, train_month.index, product_price, 'xgboost')
result_for_some = prep.to_positive(result_for_some.iloc[:, 23:])

# mean-based approach
average = pd.DataFrame(train_month.iloc[:, 14:23].mean(axis=1))
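A sketch of how that mean column could serve as a flat forecast aligned with the xgboost output (the tiling and alignment are our assumptions, not shown in the source):

import numpy as np
horizon = result_for_some.shape[1]  # match the width of the xgboost predictions
mean_forecast = pd.DataFrame(np.tile(average.values, (1, horizon)),
                             index=average.index)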
Example #7
    def __init__(self, K: int = 5):
        self.K = K
        self.train_x, self.test_x, self.train_y, self.test_y = preprocess(
            test_size=0.2)
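A minimal sketch of what the rest of the class could look like, using scikit-learn's KNeighborsClassifier (our choice; the source shows only the constructor):

    def evaluate(self):
        from sklearn.neighbors import KNeighborsClassifier
        # Fit a K-nearest-neighbours model on the training split and
        # report accuracy on the held-out 20%.
        model = KNeighborsClassifier(n_neighbors=self.K)
        model.fit(self.train_x, self.train_y)
        return model.score(self.test_x, self.test_y)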
Example #8
import sqlite3
import pandas as pd

table_name = 'os_reading_'
# 'AUB', 'Reception', 'Meeting', '4thFloor'
# AUB first date: 2018-02-02
# Reception first date: 2018-02-27
# Meeting Room first date: 2018-02-27

# Env Data first date: 2018-02-01

project = 'AUB'

osdp = Opensensors('2018-02-27', '2018-05-05', table_name + project, project)
data = osdp.data

# SQLite database to pandas dataframe
conn = sqlite3.connect(table_name + project + ".sqlite")
data = preprocess(pd.read_sql_query("SELECT * FROM " + project, conn))
conn.close()

#heat = data.iloc[:, 5:]
out = Outliers(data)
out.plot(5)

g = General(data)
g.plot_comparison_bars()

g.period_plot('month', 'circulation')
g.period_plot('month', 'exhibition')
g.period_plot('month', 'ai')
g.period_plot('month', 'code')
g.period_plot('month', 'vr')
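The five period_plot calls differ only in the label; equivalently, as a loop:

for label in ('circulation', 'exhibition', 'ai', 'code', 'vr'):
    g.period_plot('month', label)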
Example #9
import pandas as pd
import numpy as np
import sklearn.neural_network  # import the public package; multilayer_perceptron is a private module
import os
import data_preprocessing as dp

df_dict = dp.rd.read_data_to_dict()
train_df = dp.preprocess(df_dict)

y = train_df.pop('Score')
X = train_df
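A sketch of fitting a network on that data (MLPRegressor and its parameters are our choice; the source stops before a model is built):

from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
model = MLPRegressor(hidden_layer_sizes=(64,), max_iter=500)
model.fit(X_train, y_train)
print("held-out R^2:", model.score(X_test, y_test))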
Example #10
import data_preprocessing
import numpy as np
import tensorflow as tf

# data processing
data1 = data_preprocessing.preprocess('three_samples.txt')
data2 = data_preprocessing.preprocess('full_samples.txt')
data3 = data_preprocessing.preprocess('null_samples.txt')
data = data1 + data2 + data3
data = np.array(data)
shuffle_indices = np.random.permutation(np.arange(len(data)))
shuffled_data = data[shuffle_indices]
train_sample_index = int(0.8 * len(data))
training_data, test_data = shuffled_data[:train_sample_index], shuffled_data[train_sample_index:]
print('Number of training data samples: {}'.format(len(training_data)))
print('Number of test data samples: {}'.format(len(test_data)))
test_x, test_y = zip(*test_data)
test_x = np.array(test_x)
test_y = np.array(test_y)

# build graph
x = tf.placeholder(tf.float32, shape=[None, 49])
y_ = tf.placeholder(tf.float32, shape=[None, 77])

def weight_variable(shape):
    initial = tf.truncated_normal(shape, stddev=0.1)
    return tf.Variable(initial)

def bias_variable(shape):
    initial = tf.constant(0.1, shape=shape)
    return tf.Variable(initial)
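The snippet defines placeholders and initializers but no layers; a sketch of wiring a single hidden layer with these helpers (the layer width, activation, and optimizer are our assumptions):

# Hypothetical network: 49 inputs -> 128 hidden units -> 77 outputs.
W1 = weight_variable([49, 128])
b1 = bias_variable([128])
h1 = tf.nn.relu(tf.matmul(x, W1) + b1)

W2 = weight_variable([128, 77])
b2 = bias_variable([77])
logits = tf.matmul(h1, W2) + b2

loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits_v2(labels=y_, logits=logits))
train_step = tf.train.AdamOptimizer(1e-3).minimize(loss)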
Example #11
def transform_view():
    print("* Requesting data -- API")
    f = request.files['data_file']

    if not f:
        return ("No file selected. Please choose a CSV file and try again.")

    stream = io.StringIO(f.stream.read().decode("UTF8"), newline=None)
    print("stream", stream)
    csv_input = csv.reader(stream)
    print("csv input", csv_input)
    print("* Processing csv_input -- API")
    df = pd.DataFrame(csv_input, index=None, columns=None)

    data, df4, df4_column_names, df_normalized, df_normalized_w_target, X_test_new, y_test_new = preprocess(
        df)
    print('* Data Preprocessing Complete Flask -- API')

    print('* Joblib model loaded -- API')

    # X_train, X_test, y_train, y_test = sample_data(df_normalized_w_target)
    # print('* Data Sampled')
    # X_train, X_test_new, y_train, y_test_new = initialize_sample(df_normalized_w_target, X_test, y_test)
    # print("* Data Initialized for First Pickle")

    rfc_test_acc, y_pred, class_rept, conf_mat = hyper_param_rf_pickle(
        X_test_new, y_test_new, model)
    print("HyperParam Pickle Model", model)
    print("* Hyperparameter search complete -- API")
    # DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() replaces it.
    y_test_new = y_test_new.to_numpy().astype(int)
    print("Y Test New Type:", type(y_test_new))
    print(y_test_new)
    print("Y Pred:", type(y_pred))
    print(y_pred)
    # Recompute the matrix and report from the integer-cast labels.
    conf_mat = confusion_matrix(y_test_new, y_pred)
    class_rept = classification_report(y_test_new, y_pred)
    print("* Conf Mat and Class Rept Defined -- API")

    print("* Saving results in an image....")

    # return('* CSV File Submitted -- Running API')
    # full_filename = os.path.join(app.config['UPLOAD_FOLDER'], 'confusion_matrix.png')
    return render_template("page.html",
                           matrix_image='./Static/confusion_matrix.png')
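A hypothetical client call for this endpoint, assuming the view is routed at /transform and the app runs locally (neither is shown in the source; the data_file field name matches request.files above):

import requests

with open("sample.csv", "rb") as f:
    resp = requests.post("http://localhost:5000/transform",
                         files={"data_file": f})
print(resp.status_code)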