Example #1
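 # Build one API call per generated query, parse each response with DataExtractor,
 # and collect the maximum precipitation value reported for each file.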
 def getData(self):
     file_names, params = self.gen_query(self.lat, self.lon, self.date)
     file_names.reverse()
     params.reverse()
     # print("F: " ,file_names)
     weather = []
     for file_name, param in zip(file_names, params):
         api_call = self.api_format + param
         resp = requests.get(api_call)
         d = DataExtractor(self.oID, resp.content)
         # print(d.maxPrecip(), ":", resp.content)
         # print(d.maxPrecip(), end = ' ')
         weather.append(str(d.maxPrecip()))
         # weather.extend([d.maxPrecip(), d.maxTemp(),    d.maxAirPressure(), d.maxHumidity(), d.maxWind()])
     # print()
     return weather
Example #2
 def extract_class_data(self, raw_data):
     extracted_data = []
     for a_class in raw_data:
         a_class_data = DataExtractor(a_class)
         if a_class_data.class_name is not None:
             extracted_data.append(a_class_data)
     return extracted_data
Example #3
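    # Decode one Softstrip image: parse the header, extract the rows with either the
    # CNN-based or the algorithmic extractor, decode them, then extract the payload
    # and report whether its checksum is valid.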
    def decode(self, img, first_strip=False):
        softstrip_matrix = SoftstripMatrix(img, self.gray_img)
        header_extractor = HeaderExtractor(softstrip_matrix)
        header_extractor.parse_header()
        vertical_sync_start = header_extractor.vertical_sync_start
        self.bits_count = header_extractor.get_bits_per_row()

        if self.config['row_extractor'] == CNN_ROW_EXTRACTOR:
            row_extractor = CnnRowExtractor(softstrip_matrix.grayscale_matrix, softstrip_matrix.binary_matrix, self.bits_count)
            gray_grouped_matrix, grouped_matrix = row_extractor.extract_rows()
        else:
            row_extractor = AlgorithmicRowExtractor(softstrip_matrix, self.bits_count)
            grouped_matrix, gray_grouped_matrix = row_extractor.extract_rows()

        if self.config['row_decoder'] == CNN_ROW_DECODER:
            row_decoder = CnnRowDecoder(gray_grouped_matrix, self.start_time, self.bits_count, self.config['timeout'], vertical_sync_start)
            reduced_pixel_matrix = row_decoder.decode_rows()
        else:
            row_decoder = AlgorithmicRowDecoder(grouped_matrix, self.bits_count, self.start_time, self.config['timeout'])
            reduced_pixel_matrix = row_decoder.decode_rows()

        if len(reduced_pixel_matrix) == 0:
            print('[ERROR] ' + self.path + ' is invalid!')
        else:
            data_extractor = DataExtractor(self.config['timeout'])
            data_extractor.extract_data(reduced_pixel_matrix, first_strip, self.start_time)

            self.data += data_extractor.data

            
            if data_extractor.valid:
                print('Checksum valid!')
                if first_strip:
                    self.strip_meta_info = data_extractor.file_header
                    print(self.strip_meta_info)
            else:
                print('Checksum invalid!')
Example #4
    aws_client = AWSClient(aws_access_key_id=aws_access_key_id,
                           aws_secret_access_key=aws_secret_access_key)

    #Constants for Data Extraction
    LOGIN_URL = 'https://www.lendingclub.com/auth/login'
    DOWNLOAD_URL = 'https://www.lendingclub.com/info/download-data.action'
    DIR_PATH = r'Data\DOWNLOAD_LOAN_DATA'

    #Parameters for lending club data scraping
    fileTag = "loanStatsFileNamesJS"
    # email=  os.environ['LENDING_CLUB_EMAIL']
    # password= os.environ['LENDING_CLUB_PASSWORD']

    print('Downloading Files...')
    #Extract Data from Lending Club URL
    de = DataExtractor(email, password)
    de.extractData(LOGIN_URL=LOGIN_URL,
                   DOWNLOAD_URL=DOWNLOAD_URL,
                   fileTag=fileTag)

    print('Ingesting Data...')
    #Ingest Data into Pipeline
    di = DataIngestor(aws_client)

    #Create Landing and Processed Buckets
    LANDING_BUCKET = 'lending-club-landing-data'
    PROCESSED_BUCKET = 'lending-club-processed-data'

    print('Creating Buckets...')
    di.createS3Bucket(LANDING_BUCKET)
    di.createS3Bucket(PROCESSED_BUCKET)
Example #5
def get_arguments(argument_list):
    short_options = "d:"
    long_options = ["document="]
    try:
        document_file = ''
        arguments, values = getopt.getopt(argument_list, short_options,
                                          long_options)
        print(arguments)
        if len(arguments) < 1:
            print("Invalid arguments")
            sys.exit(2)
        for t in arguments:
            if t[0] in ("-d", "--document"):
                document_file = t[1]
                print(document_file)
        return document_file
    except getopt.error as err:
        print(str(err))
        sys.exit(2)


if __name__ == '__main__':
    wiki_10_file = get_arguments(sys.argv[1:])
    inverted_index = InvertedIndex()
    data_extractor = DataExtractor()
    structure_file_name = data_extractor.extract_data(wiki_10_file)
    index_file, vector_file = inverted_index.build_term_index(
        structure_file_name, wiki_10_file)
    print("Index file name: ", index_file)
    print("Vector file name: ", vector_file)
Example #6
from DataExtractor import DataExtractor
from TextProcessing import TextProcessing
from TextClassifier import TextClassifier
import pandas as pd
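# Load the training and test sets, hold out a validation split, clean the text, then
# fit the classifier, evaluate it on the validation data, and write the test
# predictions to output.csv.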

train, target = DataExtractor('data.csv').data_producer('train')
test = DataExtractor('test.csv').data_producer('test')

text_classifier = TextClassifier()

(x_train, y_train), (x_validation,
                     y_validation) = text_classifier.split_validation_data(
                         train, target)

cleaned_x_train = TextProcessing(x_train).clean_text()
cleaned_x_validation = TextProcessing(x_validation).clean_text()
cleaned_x_test = TextProcessing(test).clean_text()

text_classifier.fit(cleaned_x_train, y_train)

text_classifier.evaluate(cleaned_x_validation, y_validation)

text_classifier.confusion(y_validation, cleaned_x_validation)

result = text_classifier.predict(cleaned_x_test)
pd.DataFrame(result, columns=['category']).to_csv('output.csv',
                                                  index=True,
                                                  index_label='index')
Example #7
def main():

    total_start_time = time.time()

    # ------------------------------------------------------------------------ #
    # 0. PARSE INPUT ARGUMENTS
    # ------------------------------------------------------------------------ #

    data_file_name = "Crime_Weather_Cleaned_2017.csv"
    # data_file_name = "Crime20161718.csv"

    data_file_path = os.path.join(FOLDER_PATH, data_file_name)

    # ------------------------------------------------------------------------ #
    # 1. ESTABLISH DATABASE CONNECTION
    # ------------------------------------------------------------------------ #

    print("\n\n\t\t **** 1. DATABASE CONNECTION **** ")

    # host = 'localhost'
    # database = 'crime_star'
    # user = '******'
    # password = '******'
    # port = '3306'

    port = '3306'

    data_loader = DataLoader()

    ret = data_loader.connect(host=DB_IP,
                              database=DB,
                              user=DB_UNAME,
                              password=DB_PWD,
                              port=port)

    if ret != 1:
        print(" Connection not established. Try again")
        print(" Check internet connectivity")
        return ret

    # ------------------------------------------------------------------------ #
    # 2. DATA EXTRACTION PHASE
    # ------------------------------------------------------------------------ #

    print("\n\n\t\t **** 2. DATA EXTRACTION **** ")

    data_extractor = DataExtractor()
    data_frame = data_extractor.read_csv(fpath=data_file_path,
                                         nrows_to_read=-1)

    # ------------------------------------------------------------------------ #
    # 3. DATA LOADING PHASE
    # ------------------------------------------------------------------------ #

    print("\n\n\t\t **** 2. DATA LOADING **** ")

    ret = data_loader.load_full_table(data_frame, table_name=RAW_TABLE_NAME)

    if ret == -1:
        print(" Could not upload to database ")
        data_loader.disconnect()
        return

    print("Successfully populated database")

    # ------------------------------------------------------------------------ #
    # 4. DISCONNECT THE DATABASE AND CLEAN UP MEMORY
    # ------------------------------------------------------------------------ #

    data_loader.disconnect()

    # ------------------------------------------------------------------------ #
    # 5. SEND A MESSAGE TO THE DATA HUB AS AN UPDATE
    # ------------------------------------------------------------------------ #

    print(" Sending message to data hub for update....", end="")

    messenger = Messenger()
    # Connect to the data hub
    messenger.connect(host=DATA_HUB_IP, uname=DATA_HUB_UNAME, pwd=DATA_HUB_PWD)

    # Connect to the exchange
    messenger.connect_to_exchange(ex_name=EX_NAME)

    # Send update
    message = "Database updated with latest rows"
    messenger.send_message_to_exchange(ex_name=EX_NAME,
                                       message=message,
                                       topic=TOPIC)

    print("sent")

    total_end_time = time.time()

    print(" Total time taken :", total_end_time - total_start_time)
Example #8
#!/usr/bin/python

import sys
from DataExtractor import DataExtractor

deExtractor = DataExtractor()
deExtractor.extractData(sys.argv[1])
            "dim_date": dim_date,
            "dim_weather": dim_weather,
            "dim_location": dim_location,
            "dim_crime": dim_crime,
        }
        return star_tables


if __name__ == "__main__":

    folder_path = "C:\\Users\\SSrih\\OneDrive\\UChicago\\DEP\\Project\\data" \
                  "\\Crime and Weather\\"
    # data_file_name = "CrimeWeather2010.csv"
    # data_file_name = "Crime2010Raw.csv"
    data_file_name = "Crime20161718.csv"

    data_file_path = os.path.join(folder_path, data_file_name)
    data_extractor = DataExtractor()
    data_frame = data_extractor.read_csv(fpath=data_file_path,
                                         nrows_to_read=5000)

    # print(data_frame.head())

    data_worker = DataWorker()

    print(data_frame.isnull().sum().sum())

    data_worker.process_pipeline(data_frame)

    print(data_frame.isnull().sum().sum())
Example #10
#!/usr/bin/python

import sys
from DataExtractor import DataExtractor


deExtractor = DataExtractor()
deExtractor.extractData(sys.argv[1])
Example #11
 def run(self):
     try:
         logInfo(self.userId, 'start crawling')
         de = DataExtractor()
         de.getServerId(self.userId)
         de.getWeightHistory(self.userId)
         de.getDietHistory(self.userId)
         de.getGroup(self.userId)
         de.getChallenge(self.userId)
         de.getBuddy(self.userId)
         logInfo(self.userId, 'Done crawling')
     except Exception as e:
         logException(self.userId, self.run.__name__, e)
Example #12
from DataExtractor import DataExtractor
from Geolocation import Location
from DataFilter import DataFilter

import json

dataExObj = DataExtractor(dir='./dataset')
allFiles = dataExObj.FileList(ext='.json')

jsonFileDict = dataExObj.parse(allFiles)

#Data Filtering

#Step 1. Join reviews and business by business id's
datafilter = DataFilter()
data = datafilter.JoinByAttribute(jsonFileDict['business'],
                                  jsonFileDict['review'], 'business_id')

#Step 2: Get the location.

loc = Location()

for key, value in data.items():
    latitude = data[key]['latitude']
    longitude = data[key]['longitude']
    location = loc.GetLocation(latitude, longitude)
    data[key] = datafilter.merge_dicts(data[key], {'location': location})
    print(data[key]['location'])

#TODO Step 3
#Step 3: Filter out locations outside the United States.
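# The TODO above leaves Step 3 unimplemented. A minimal, hypothetical sketch of that
# filtering step, assuming each merged 'location' value is (or stringifies to) text
# that contains the country name:
us_data = {
    key: value
    for key, value in data.items()
    if 'United States' in str(value.get('location', ''))
}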
Example #13
from DataExtractor import DataExtractor

if __name__ == '__main__':
    Extractor = DataExtractor()
    R = Extractor.Extract("realDonaldTrump")
    print(R)
Example #14
import os
from DataExtractor import DataExtractor
from StridesIdentifier import StridesIdentifier
from FeaturesExtractor import FeaturesExtractor
from ClassifiersEvaluator import ClassifiersEvaluator
from Train import Train
from InfluxCalculator import InfluxCalculator
from Classifier import Classifier

models_dir = '../models'
# if the trained models folder is empty
if len(os.listdir(models_dir)) == 0:
    # train phase
    #function to extract frames from videos and to create txt files with data of interest from each video
    de = DataExtractor('train')
    videos_list = de.extractor()
    #function to identify strides from the data (in txt files) for each video and to store the stride information in txt files
    si = StridesIdentifier(videos_list)
    si.identifier()
    #function to extract all features to analyze for each person
    fe = FeaturesExtractor(videos_list)
    people_list = fe.extractor()
    #function to evaluate classifiers
    ce = ClassifiersEvaluator(people_list)
    ce.evaluator()
    #function to train classifiers
    t = Train(people_list)
    t.training()
#test phase
#function to extract frames from videos and to create txt files with data of interest from each video
de = DataExtractor('test')
Example #15
# Clock to time the running of the program
start_time = time.time()

# Setting up directory
mypath = os.getcwd() + '/data/seminar_testdata/test_untagged/'
trainingPath = 'data/training'
directory = os.fsencode(mypath)

# Runs the ontology classification
print('Running Ontology Classification: \n')
ontology = Ontology()
ontology.run(mypath)

# Begins tagging
print("\nTagging progress beginning. Get a brew, it'll take a while... \n")
extractor = DataExtractor()

# Trains our model
extractor.train(trainingPath)
tagger = Tagger()

# Tags all emails in the directory given
tagger.tag_seminar(mypath, directory, extractor)

# Calculates how long the program took
seconds = time.time() - start_time
m, s = divmod(seconds, 60)
print("The program has been running for {0} minutes and {1} seconds \n".format(
    round(m), round(s)))

# Evaluates results
Example #16
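# Aggregate feature dictionaries from DataExtractor(5) through DataExtractor(9) for
# training and from DataExtractor(10) through DataExtractor(12) for testing, scale
# the training features, and fit an SVM classifier.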
from sklearn.linear_model import SGDClassifier
from sklearn import svm, preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from DataExtractor import DataExtractor

trainExtractor = DataExtractor(5)
trainData = trainExtractor.featureDictionary
for i in range(6, 10):
    trainData.update(DataExtractor(i).featureDictionary)

trainInput = list()
trainOutput = list()

for value in trainData.values():
    inputData = value[0]
    trainInput.append(list(inputData[0].values()) + list(inputData[1].values()))
    trainOutput.append(1 if value[1] == 1 else 0)

#clf = SGDClassifier(loss="log", penalty="elasticnet")
#clf = SGDClassifier(loss="hinge")
clf = svm.SVC()
#clf = GradientBoostingClassifier(n_estimators=30, max_depth=3, subsample=.7)
#clf = KNeighborsClassifier(n_neighbors=3)
scaledTrainInput = preprocessing.scale(trainInput)
clf.fit(scaledTrainInput, trainOutput)

testExtractor = DataExtractor(10)
testData = testExtractor.featureDictionary
for i in range(11, 13):
    testData.update(DataExtractor(i).featureDictionary)
Example #17
 def __init__(self, filepath):
     # self.x_train, self.y_train = DataExtractor(filepath).split_labels()
     self.x_train, self.x_test, self.y_train, self.y_test = DataExtractor(
         filepath).split_validation()
Example #18
from DataExtractor import DataExtractor
import sys
username = sys.argv[1]
KEY = sys.argv[2]
SECRET = sys.argv[3]
filters = ""
Extractor = DataExtractor()
Extractor.Extract(username, filters, KEY, SECRET)
Example #19
from DataExtractor import DataExtractor

Extractor = DataExtractor()
Extractor.Extract()