Code example #1
File: Collector.py Project: KanishkGar/GFFS
 def getData(self):
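     # hit the API once per generated parameter set and collect the maximum precipitation value for each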
     file_names, params = self.gen_query(self.lat, self.lon, self.date)
     file_names.reverse()
     params.reverse()
     # print("F: " ,file_names)
     weather = []
     for file_name, param in zip(file_names, params):
         api_call = self.api_format + param
         resp = requests.get(api_call)
         d = DataExtractor(self.oID, resp.content)
         # print(d.maxPrecip(), ":", resp.content)
         # print(d.maxPrecip(), end = ' ')
         weather.append(str(d.maxPrecip()))
         # weather.extend([d.maxPrecip(), d.maxTemp(),    d.maxAirPressure(), d.maxHumidity(), d.maxWind()])
     # print()
     return weather
Code example #2
 def extract_class_data(self, raw_data):
     extracted_data = []
     for a_class in raw_data:
         a_class_data = DataExtractor(a_class)
         if a_class_data.class_name is not None:
             extracted_data.append(a_class_data)
     return extracted_data
Code example #3
    def decode(self, img, first_strip=False):
        softstrip_matrix = SoftstripMatrix(img, self.gray_img)
        header_extractor = HeaderExtractor(softstrip_matrix)
        header_extractor.parse_header()
        vertical_sync_start = header_extractor.vertical_sync_start
        self.bits_count = header_extractor.get_bits_per_row()

        if self.config['row_extractor'] == CNN_ROW_EXTRACTOR:
            row_extractor = CnnRowExtractor(softstrip_matrix.grayscale_matrix, softstrip_matrix.binary_matrix, self.bits_count)
            gray_grouped_matrix, grouped_matrix = row_extractor.extract_rows()
        else:
            row_extractor = AlgorithmicRowExtractor(softstrip_matrix, self.bits_count)
            grouped_matrix, gray_grouped_matrix = row_extractor.extract_rows()

        if self.config['row_decoder'] == CNN_ROW_DECODER:
            row_decoder = CnnRowDecoder(gray_grouped_matrix, self.start_time, self.bits_count, self.config['timeout'], vertical_sync_start)
            reduced_pixel_matrix = row_decoder.decode_rows()
        else:
            row_decoder = AlgorithmicRowDecoder(grouped_matrix, self.bits_count, self.start_time, self.config['timeout'])
            reduced_pixel_matrix = row_decoder.decode_rows()

        if len(reduced_pixel_matrix) == 0:
            print('[ERROR] ' + self.path + ' is invalid!')
        else:
            data_extractor = DataExtractor(self.config['timeout'])
            data_extractor.extract_data(reduced_pixel_matrix, first_strip, self.start_time)

            self.data += data_extractor.data

            
            if data_extractor.valid:
                print('Checksum valid!')
                if first_strip:
                    self.strip_meta_info = data_extractor.file_header
                    print(self.strip_meta_info)
            else:
                print('Checksum invalid!')
Code example #4
    aws_client = AWSClient(aws_access_key_id=aws_access_key_id,
                           aws_secret_access_key=aws_secret_access_key)

    #Constants for Data Extraction
    LOGIN_URL = 'https://www.lendingclub.com/auth/login'
    DOWNLOAD_URL = 'https://www.lendingclub.com/info/download-data.action'
    DIR_PATH = r'Data\DOWNLOAD_LOAN_DATA'

    #Parameters for lending club data scraping
    fileTag = "loanStatsFileNamesJS"
    email = os.environ['LENDING_CLUB_EMAIL']
    password = os.environ['LENDING_CLUB_PASSWORD']

    print('Downloading Files...')
    #Extract Data from Lending Club URL
    de = DataExtractor(email, password)
    de.extractData(LOGIN_URL=LOGIN_URL,
                   DOWNLOAD_URL=DOWNLOAD_URL,
                   fileTag=fileTag)

    print('Ingesting Data...')
    #Ingest Data into Pipeline
    di = DataIngestor(aws_client)

    #Create Landing and Processed Buckets
    LANDING_BUCKET = 'lending-club-landing-data'
    PROCESSED_BUCKET = 'lending-club-processed-data'

    print('Creating Buckets...')
    di.createS3Bucket(LANDING_BUCKET)
    di.createS3Bucket(PROCESSED_BUCKET)
Code example #5
def get_arguments(argument_list):
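    # parse the -d/--document option and return the supplied document file path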
    short_options = "d:"
    long_options = ["document="]
    try:
        document_file = ''
        arguments, values = getopt.getopt(argument_list, short_options,
                                          long_options)
        print(arguments)
        if len(arguments) < 1:
            print("Invalid arguments")
            sys.exit(2)
        for t in arguments:
            if t[0] in ("-d", "--document"):
                document_file = t[1]
                print(document_file)
        return document_file
    except getopt.error as err:
        print(str(err))
        sys.exit(2)


if __name__ == '__main__':
    wiki_10_file = get_arguments(sys.argv[1:])
    inverted_index = InvertedIndex()
    data_extractor = DataExtractor()
    structure_file_name = data_extractor.extract_data(wiki_10_file)
    index_file, vector_file = inverted_index.build_term_index(
        structure_file_name, wiki_10_file)
    print("Index file name: ", index_file)
    print("Vector file name: ", vector_file)
Code example #6
from DataExtractor import DataExtractor
from TextProcessing import TextProcessing
from TextClassifier import TextClassifier
import pandas as pd

train, target = DataExtractor('data.csv').data_producer('train')
test = DataExtractor('test.csv').data_producer('test')

text_classifier = TextClassifier()

(x_train, y_train), (x_validation,
                     y_validation) = text_classifier.split_validation_data(
                         train, target)

cleaned_x_train = TextProcessing(x_train).clean_text()
cleaned_x_validation = TextProcessing(x_validation).clean_text()
cleaned_x_test = TextProcessing(test).clean_text()

text_classifier.fit(cleaned_x_train, y_train)

text_classifier.evaluate(cleaned_x_validation, y_validation)

text_classifier.confusion(y_validation, cleaned_x_validation)

result = text_classifier.predict(cleaned_x_test)
pd.DataFrame(result, columns=['category']).to_csv('output.csv',
                                                  index=True,
                                                  index_label='index')
Code example #7
File: Data_EL.py Project: swathysujit/Chicago-Crimes
def main():

    total_start_time = time.time()

    # ------------------------------------------------------------------------ #
    # 0. PARSE INPUT ARGUMENTS
    # ------------------------------------------------------------------------ #

    data_file_name = "Crime_Weather_Cleaned_2017.csv"
    # data_file_name = "Crime20161718.csv"

    data_file_path = os.path.join(FOLDER_PATH, data_file_name)

    # ------------------------------------------------------------------------ #
    # 1. ESTABLISH DATABASE CONNECTION
    # ------------------------------------------------------------------------ #

    print("\n\n\t\t **** 1. DATABASE CONNECTION **** ")

    # host = 'localhost'
    # database = 'crime_star'
    # user = '******'
    # password = '******'
    # port = '3306'

    port = '3306'

    data_loader = DataLoader()

    ret = data_loader.connect(host=DB_IP,
                              database=DB,
                              user=DB_UNAME,
                              password=DB_PWD,
                              port=port)

    if ret != 1:
        print(" Connection not established. Try again")
        print(" Check internet connectivity")
        return ret

    # ------------------------------------------------------------------------ #
    # 2. DATA EXTRACTION PHASE
    # ------------------------------------------------------------------------ #

    print("\n\n\t\t **** 2. DATA EXTRACTION **** ")

    data_extractor = DataExtractor()
    data_frame = data_extractor.read_csv(fpath=data_file_path,
                                         nrows_to_read=-1)

    # ------------------------------------------------------------------------ #
    # 3. DATA LOADING PHASE
    # ------------------------------------------------------------------------ #

    print("\n\n\t\t **** 3. DATA LOADING **** ")

    ret = data_loader.load_full_table(data_frame, table_name=RAW_TABLE_NAME)

    if ret == -1:
        print(" Could not upload to database ")
        data_loader.disconnect()
        return

    print("Successfully populated database")

    # ------------------------------------------------------------------------ #
    # 4. DISCONNECT THE DATABASE AND CLEAN UP MEMORY
    # ------------------------------------------------------------------------ #

    data_loader.disconnect()

    # ------------------------------------------------------------------------ #
    # 5. SEND A MESSAGE TO THE DATA HUB AS AN UPDATE
    # ------------------------------------------------------------------------ #

    print(" Sending message to data hub for update....", end="")

    messenger = Messenger()
    # Connect to the data hub
    messenger.connect(host=DATA_HUB_IP, uname=DATA_HUB_UNAME, pwd=DATA_HUB_PWD)

    # Connect to the exchange
    messenger.connect_to_exchange(ex_name=EX_NAME)

    # Send update
    message = "Database updated with latest rows"
    messenger.send_message_to_exchange(ex_name=EX_NAME,
                                       message=message,
                                       topic=TOPIC)

    print("sent")

    total_end_time = time.time()

    print(" Total time taken :", total_end_time - total_start_time)
Code example #8
#!/usr/bin/python

import sys
from DataExtractor import DataExtractor

deExtractor = DataExtractor()
deExtractor.extractData(sys.argv[1])
Code example #9
            "dim_date": dim_date,
            "dim_weather": dim_weather,
            "dim_location": dim_location,
            "dim_crime": dim_crime,
        }
        return star_tables


if __name__ == "__main__":

    folder_path = "C:\\Users\\SSrih\\OneDrive\\UChicago\\DEP\\Project\\data" \
                  "\\Crime and Weather\\"
    # data_file_name = "CrimeWeather2010.csv"
    # data_file_name = "Crime2010Raw.csv"
    data_file_name = "Crime20161718.csv"

    data_file_path = os.path.join(folder_path, data_file_name)
    data_extractor = DataExtractor()
    data_frame = data_extractor.read_csv(fpath=data_file_path,
                                         nrows_to_read=5000)

    # print(data_frame.head())

    data_worker = DataWorker()

    print(data_frame.isnull().sum().sum())

    data_worker.process_pipeline(data_frame)

    print(data_frame.isnull().sum().sum())
Code example #10
File: extract_data.py Project: SUTURO/euroc_ml
#!/usr/bin/python

import sys
from DataExtractor import DataExtractor


deExtractor = DataExtractor()
deExtractor.extractData(sys.argv[1])
Code example #11
 def run(self):
     try:
         logInfo(self.userId, 'start crawling')
         de = DataExtractor()
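         # resolve the server id, then pull weight, diet, group, challenge and buddy data for this user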
         de.getServerId(self.userId)
         de.getWeightHistory(self.userId)
         de.getDietHistory(self.userId)
         de.getGroup(self.userId)
         de.getChallenge(self.userId)
         de.getBuddy(self.userId)
         logInfo(self.userId, 'Done crawling')
     except Exception as e:
         logException(self.userId, self.run.__name__, e)
Code example #12
from DataExtractor import DataExtractor
from Geolocation import Location
from DataFilter import DataFilter

import json

dataExObj = DataExtractor(dir='./dataset')
allFiles = dataExObj.FileList(ext='.json')

jsonFileDict = dataExObj.parse(allFiles)

#Data Filtering

# Step 1: Join reviews and businesses by business ID
datafilter = DataFilter()
data = datafilter.JoinByAttribute(jsonFileDict['business'],
                                  jsonFileDict['review'], 'business_id')

#Step 2: Get the location.

loc = Location()

for key, value in data.items():
    latitude = data[key]['latitude']
    longitude = data[key]['longitude']
    location = loc.GetLocation(latitude, longitude)
    data[key] = datafilter.merge_dicts(data[key], {'location': location})
    print(data[key]['location'])

#TODO Step 3
# Step 3: Filter out locations outside the United States.
Code example #13
from DataExtractor import DataExtractor

if __name__ == '__main__':
    Extractor = DataExtractor()
    R = Extractor.Extract("realDonaldTrump")
    print(R)
Code example #14
import os
from DataExtractor import DataExtractor
from StridesIdentifier import StridesIdentifier
from FeaturesExtractor import FeaturesExtractor
from ClassifiersEvaluator import ClassifiersEvaluator
from Train import Train
from InfluxCalculator import InfluxCalculator
from Classifier import Classifier

models_dir = '../models'
# if the trained-models folder is empty
if len(os.listdir(models_dir)) == 0:
    # train phase
    #function to extract frames from videos and to create txt files with data of interest from each video
    de = DataExtractor('train')
    videos_list = de.extractor()
    # function to identify strides from the data (in txt files) for each video and to store stride information in txt files
    si = StridesIdentifier(videos_list)
    si.identifier()
    # function to extract all the features to analyze for each person
    fe = FeaturesExtractor(videos_list)
    people_list = fe.extractor()
    #function to evaluate classifiers
    ce = ClassifiersEvaluator(people_list)
    ce.evaluator()
    #function to train classifiers
    t = Train(people_list)
    t.training()
#test phase
#function to extract frames from videos and to create txt files with data of interest from each video
de = DataExtractor('test')
Code example #15
# Clock to time the running of the program
start_time = time.time()

# Setting up directory
mypath = os.getcwd() + '/data/seminar_testdata/test_untagged/'
trainingPath = 'data/training'
directory = os.fsencode(mypath)

# Runs the ontology classification
print('Running Ontology Classification: \n')
ontology = Ontology()
ontology.run(mypath)

# Begins tagging
print("\nTagging progress beginning. Get a brew, it'll take a while... \n")
extractor = DataExtractor()

# Trains our model
extractor.train(trainingPath)
tagger = Tagger()

# Tags all emails in the directory given
tagger.tag_seminar(mypath, directory, extractor)

# Calculates how long the program took
seconds = time.time() - start_time
m, s = divmod(seconds, 60)
print("The program has been running for {0} minutes and {1} seconds \n".format(
    round(m), round(s)))

# Evaluates results
Code example #16
from sklearn.linear_model import SGDClassifier
from sklearn import svm, preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from DataExtractor import DataExtractor

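# build the training set by merging the feature dictionaries produced for indices 5 through 9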
trainExtractor = DataExtractor(5)
trainData = trainExtractor.featureDictionary
for i in range(6, 10):
    trainData.update(DataExtractor(i).featureDictionary)

trainInput = list()
trainOutput = list()

for value in trainData.values():
    inputData = value[0]
    trainInput.append(list(inputData[0].values()) + list(inputData[1].values()))
    trainOutput.append(1 if value[1] == 1 else 0)

#clf = SGDClassifier(loss="log", penalty="elasticnet")
#clf = SGDClassifier(loss="hinge")
clf = svm.SVC()
#clf = GradientBoostingClassifier(n_estimators=30, max_depth=3, subsample=.7)
#clf = KNeighborsClassifier(n_neighbors=3)
scaledTrainInput = preprocessing.scale(trainInput)
clf.fit(scaledTrainInput, trainOutput)

testExtractor = DataExtractor(10)
testData = testExtractor.featureDictionary
for i in range(11, 13):
    testData.update(DataExtractor(i).featureDictionary)
Code example #17
 def __init__(self, filepath):
     # self.x_train, self.y_train = DataExtractor(filepath).split_labels()
     self.x_train, self.x_test, self.y_train, self.y_test = DataExtractor(
         filepath).split_validation()
Code example #18
from DataExtractor import DataExtractor
import sys
username = sys.argv[1]
KEY = sys.argv[2]
SECRET = sys.argv[3]
filters = ""
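# run the extraction for the given username with an empty filter string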
Extractor = DataExtractor()
Extractor.Extract(username, filters, KEY, SECRET)
Code example #19
from DataExtractor import DataExtractor

Extractor = DataExtractor()
Extractor.Extract()