Code Example #1
 def load_from_raw_json(self,
                        single_ticker=None,
                        force_update=False,
                        save=True):
     """
     :param single_ticker: str, ticker of the stock you need
     :param force_update: bool, force to fetch from remote
     :param save: bool whether to save to json file
     :return:
     """
     if isinstance(single_ticker, str):
         self._single_ticker_call = True
         # Accept either "aapl" or "aapl.json"; strip the extension if present
         if single_ticker.endswith(".json"):
             single_ticker = single_ticker[:-5]
         _, df = get_data(ticker=single_ticker,
                          force_update=force_update,
                          save=save)
         self._dataframe = [df]
         return self
     # Glob every cached json file in the cache folder
     json_path = os.path.join(os.getcwd(),
                              DataPreprocessor._cache_folder_name,
                              "*.json")
     self._dataframe = []
     for path in glob.glob(pathname=json_path, recursive=False):
         # Derive the ticker from the filename, dropping the ".json" extension
         ticker = os.path.split(path)[-1][:-5].lower()
         _, df = get_data(ticker=ticker,
                          force_update=force_update,
                          save=save)
         if df is None:
             continue
         self._dataframe.append(df)
     return self
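
A hedged usage sketch for this loader (the zero-argument DataPreprocessor() constructor is an assumption; it does not appear in these snippets):

preprocessor = DataPreprocessor()
# Load one cached ticker; a trailing ".json" is stripped automatically
preprocessor.load_from_raw_json(single_ticker="AAPL.json")
# Or, with no ticker, load every *.json file found in the cache folder
preprocessor.load_from_raw_json(force_update=False, save=True)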
Code Example #2
def main():
    import time
    from data_extractor import get_data, corpus
    data = get_data(1)
    start = time.time()
    run_crawler(corpus, data)
    end = time.time()
    print()
    print("Time taken to compute all features: " + str((end - start) / 60.0) + " minutes")
Code Example #3
def create_dataset(start_year=2007, end_year=2018):
    """
  Creates a dataset with the following properties
  x is a data matrix of size nxm where m is the number of features: feature_years * n_features
  n is the number of data vectors: n_municipalities * (n_years-feature_years+1)
  y is a vector of length n with the reported number of crimes predict_years after the end of a data vector
  """

    #Getting municipality data from start_year until end_year
    data = get_data(aarstal=[str(x) for x in range(start_year, end_year + 1)],
                    noegletal=FEATURES)
    data = _fix_nans(data)
    data_target = get_data(
        aarstal=[str(x) for x in range(start_year, end_year + 1)],
        noegletal=[TARGET_FEATURE])
    data_target = _fix_nans(data_target).ravel()

    # Median split: the boundary is the middle value of the sorted targets
    ordered = data_target.copy()
    ordered.sort()
    N = len(data_target)
    boundary = ordered[N // 2]

    #Create model data
    X = []
    y = []
    for i in range(len(data)):
        for j in range(data.shape[1] - 1):
            X.append(data[i, j].ravel())
    for i in range(len(X)):
        y.append(data_target[i] <= boundary)

    #Standardize
    X = np.array(X)
    y = np.array(y, dtype=bool)  # np.bool is deprecated; use the builtin bool
    mean_X = X.mean(axis=0)
    std_X = X.std(axis=0) + EPS
    X = (X - mean_X) / std_X

    return X, y
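
A hypothetical call, to illustrate the design choice: because the boundary is the median of the targets, the boolean labels come out roughly balanced, avoiding a skewed classification problem.

X, y = create_dataset(start_year=2007, end_year=2018)
print(X.shape, y.mean())  # y.mean() should land near 0.5 for a median split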
Code Example #4
 def load_from_csv(self, csv_path: str):
     """
     :param csv_path: csv is a column of ticker strings
     :return:
     """
     self._dataframe = []
     with open(csv_path, "r") as fp:
         cr = csv.reader(fp)
         for line in cr:
             ticker = line[0]
             _, df = get_data(ticker=ticker, force_update=False, save=True)
             if df is None:
                 continue
             self._dataframe.append(df)
     return self
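
For illustration, a file in the format this loader (and the pop_stk.csv script in Code Example #9) expects holds one ticker per row, first column only; these tickers are placeholders:

AAPL
MSFT
GOOG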
Code Example #5
File: logistic_ann.py  Project: wdmdev/ML_proj_2
def create_data_with_classes():
    '''
    Transforms the dataset, attaching a string risk-category label to each record.

    Output: X and y, where y is a collection of string labels
    '''

    #Features to use for the models
    feature_idcs = np.arange(len(FEATURES))
    target_feature_idx = FEATURES.index(TARGET_FEATURE)
    feature_idcs = np.delete(feature_idcs, target_feature_idx)

    #Getting municipality data from 2007 until 2018
    data = get_data(aarstal=[str(x) for x in range(2007, 2018 + 1)])
    data = fix_nans(data)

    #Create model data
    X = []
    y = []
    for i in range(len(data)):
        for j in range(len(data[0])):
            X.append(data[i, j, feature_idcs].ravel())
            y.append(data[i, j, target_feature_idx].ravel())

    #Standardize
    X = np.array(X)
    y = np.array(y)
    mean_X = X.mean(axis=0)
    std_X = X.std(axis=0) + EPS
    X = (X - mean_X) / std_X
    mean_y = y.mean()
    print('Mean of y: {}'.format(mean_y))
    std_y = y.std() + EPS
    print('Standard deviation of y: {}\n'.format(std_y))
    y = (y - mean_y) / std_y

    #Transform intervals into labels for categorization
    class_y = np.array([to_risk_cat(target) for target in y])
    classes, count = np.unique(class_y, return_counts=True)
    print('\nClass count after transformation:')
    for i in range(len(classes)):
        print('{0} risk: {1}'.format(classes[i], count[i]))

    print('\n')

    return Data(X, class_y, mean_X, mean_y, std_X, std_y)
Code Example #6
File: ann.py  Project: wdmdev/ML_proj_2
def create_dataset(feature_years=4,
                   predict_years=5,
                   start_year=2007,
                   end_year=2018):
    """
  Creates a dataset with the following properties
  x is a data matrix of size nxm where m is the number of features: feature_years * n_features
  n is the number of data vectors: n_municipalities * (n_years-feature_years+1)
  y is a vector of length n with the reported number of crimes predict_years after the end of a data vector
  """

    target_feature_idx = FEATURES.index(TARGET_FEATURE)
    feature_idcs = np.arange(len(FEATURES))
    # Comment out this line to include previous crime rates in features
    # feature_idcs = feature_idcs[feature_idcs!=target_feature_idx]

    data = get_data(aarstal=[str(x) for x in range(start_year, end_year + 1)])
    log("Estimating nans")
    data = fix_nans(data)
    x_total = data[:, :-predict_years, feature_idcs]
    y_total = data[:, predict_years:, target_feature_idx]

    x_new = []
    y_new = []
    # Loops through all years that are at the start of an observation
    for i in range(x_total.shape[1] - feature_years + 1):
        # Loops through all municipalities
        for j in range(x_total.shape[0]):
            x_new.append(x_total[j, i:i + feature_years, :].ravel())
            y_new.append(y_total[j, i + feature_years - 1])

    x = np.array(x_new)
    mean_x = x.mean(axis=0)
    std_x = x.std(axis=0) + EPS
    x = (x - mean_x) / std_x
    y = np.array(y_new)
    mean_y = y.mean()
    std_y = y.std() + EPS
    y = (y - mean_y) / std_y

    return Data(x, y, mean_x, mean_y, std_x, std_y)
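
A quick sanity check of the shape arithmetic, using made-up counts (10 municipalities and 8 features are illustrative assumptions, not values from the project):

n_years = 2018 - 2007 + 1             # 12 year columns
feature_years, predict_years = 4, 5
usable = n_years - predict_years      # 7 year slots left for feature windows
windows = usable - feature_years + 1  # 4 windows per municipality
n = 10 * windows                      # 40 data vectors
m = feature_years * 8                 # 32 features per vector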
Code Example #7
import logging

import data_extractor
import defines
import naive_bayes

# Set logger

logging.basicConfig(filename='output.log', level=logging.DEBUG)

#logging.info('This is an info log')
#logging.warning('This is a warning log')
#logging.error('This is an error log')

#*******************************************************************************
# DATA EXTRACTION
TRAIN_DATA = data_extractor.get_data(defines.DATA_TRAIN_CSV_FILE)
TEST_DATA = data_extractor.get_data(defines.DATA_TEST_CSV_FILE)

#*******************************************************************************
# STOP WORDS FILTER
#logging.info('Prepare Data')
#nb_lib.nb_lib_prepare(TRAIN_DATA)

#*******************************************************************************
# FEATURE SELECTION
logging.info('Feature selection')
#feature_select.get_selected_features(TRAIN_DATA)
#*******************************************************************************
# DERIVATION OF NAIVE BAYES CLASSIFIER 
logging.info('Naive Bayes Classifier')
Code Example #8
        all_ops.update(set(op))

    d = dict()
    for op in all_ops:
        d[op] = len(d)

    return d

def transform(ops, tr):
    # Replace every op token with its integer index from the lookup table tr
    ret = []
    for op in ops:
        ret.append([tr[x] for x in op])
    return ret

if True:
    train_data = get_nice_data(get_data('reviews.json'))
    train_data = [np.array(x) for x in train_data]

    scores = []
    # Legacy sklearn.cross_validation.KFold API (pre-0.18): the first
    # argument is the number of samples, not the data itself
    for train_idx, test_idx in KFold(len(train_data[0]), n_folds=7,
                                     shuffle=True):
        X_train = train_data[0][train_idx]
        Y_train = train_data[1][train_idx]

        X_test, Y_test = Solution._remove_differencies(
            (train_data[0][test_idx], train_data[1][test_idx]), True)

        sol = Solution(True)
        sol.train((X_train, Y_train))

        # sometimes it says "AttributeError: '_ConstantPredictor'
Code Example #9
import csv
from data_extractor import get_data

# Get S&P 500 data and store one json file per ticker in the working directory;
# tickers whose json file already exists are skipped
if __name__ == "__main__":
    tickers = set()
    with open("pop_stk.csv", 'r') as csv_file:
        csv_reader = csv.reader(csv_file)
        for line in csv_reader:
            tickers.add(line[0])

    # with open("index_list.csv", 'r') as csv_file:
    #     csv_reader = csv.reader(csv_file)
    #     for line in csv_reader:
    #         tickers.add(line[0])

    for ticker in tickers:
        get_data(ticker=ticker, force_update=False, save=True)
Code Example #10
 def test_get_data(self):
     ticker, df = get_data(ticker="AAPL", force_update=True, save=False)
     self.assertIsNotNone(df)
     self.assertIsNotNone(ticker)
     self.assertNotEqual(len(df), 0)
Code Example #11
 def test_cache_write_and_read(self):
     ticker1, df1 = get_data(ticker="AAPL", force_update=True, save=True)
     ticker2, df2 = get_data(ticker="AAPL", force_update=False, save=False)
     self.assertEqual(ticker1, ticker2)
     self.assertEqual(len(df1), len(df2))
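
Taken together, these snippets pin down the contract that get_data must satisfy: it returns a (ticker, dataframe) pair, force_update=True bypasses the json cache, and save=True writes to it. Below is a minimal sketch of a function meeting that contract, assuming pandas for I/O; the cache folder name and the fetch_remote helper are hypothetical stand-ins:

import os

import pandas as pd

CACHE_DIR = "data_cache"  # hypothetical cache folder name

def get_data(ticker, force_update=False, save=True):
    # Serve from the json cache unless a remote refresh is forced
    path = os.path.join(os.getcwd(), CACHE_DIR, ticker.lower() + ".json")
    if not force_update and os.path.exists(path):
        return ticker, pd.read_json(path)
    df = fetch_remote(ticker)  # hypothetical remote fetcher; may return None
    if df is not None and save:
        os.makedirs(os.path.dirname(path), exist_ok=True)
        df.to_json(path)
    return ticker, df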