def create_citation(citer_document, cited_document):
    """Record a citation edge from a citing document to a cited one.

    Args:
        citer_document: dict for the citing document; keys used:
            'doc_number', 'country', 'kind', 'datepublished', 'filedate',
            'issuedate', 'prioritydate'.
        cited_document: dict for the cited document; only 'doc_number' is read.
    """
    citer, cited = citer_document, cited_document
    print('creating citation: ' + citer['doc_number'] + '->' + cited['doc_number'])
    DataImport.add_citation(
        citer['doc_number'],
        cited['doc_number'],
        citer['country'],
        citer['kind'],
        citer['datepublished'],
        citer['filedate'],
        citer['issuedate'],
        citer['prioritydate'])
def get_lr_model_test_accuracy() -> float:
    """Score the trained logistic-regression model on the Titanic test set.

    Side effect: the predictions are written to CSV via save_results_in_csv.

    Returns:
        Accuracy of the predictions against the known test outcomes.
    """
    test_set = di.get_clean_test_data()
    features = test_set[COLUMN_NAMES]
    truth = di.get_titanic_test_results()['Survived']
    model = get_lr_model()
    predicted = model.predict(features)
    save_results_in_csv(predicted, test_set)
    return accuracy_score(truth, predicted)
def plot_survival_by_age_category():
    """Bar-plot the Titanic survival rate per age category and save the figure."""
    frame = di.filter_age(di.get_titanic_data())
    pivot = frame.pivot_table(index="Age_category", values="Survived")
    pivot.plot.bar()
    plt.title("Graph to Show Survival Rate by Age Category")
    plt.xlabel("Age Group")
    plt.ylabel("Survival Rate")
    plt.savefig("Graph to Show Survival Rate by Age Category")
def to_database(metadata):
    """Persist a parsed patent document plus its citations, classifications
    and assignees via DataImport.

    Args:
        metadata: parser output dict; keys read: 'dnum', 'doc-number', 'kind',
            'date-publ', 'status', 'country', 'abstract', 'date-file',
            'date-issue', 'date-priority', 'citations', 'classifications',
            'assignees'.
    """
    print('------------------------------------------------')
    pprint(metadata)
    doc_number = metadata['doc-number']
    # The same four dates are forwarded to every DataImport call below.
    dates = (metadata['date-publ'], metadata['date-file'],
             metadata['date-issue'], metadata['date-priority'])
    DataImport.create_document(
        'FullText', '?', metadata['dnum'], doc_number, metadata['kind'],
        dates[0], metadata['status'], metadata['country'], '',
        metadata['abstract'], dates[1], dates[2], dates[3])
    for cit in metadata['citations']:
        DataImport.add_citation(
            doc_number, cit.get('doc-number', ''), cit.get('country', ''),
            cit.get('kind', ''), *dates)
    for cls in metadata['classifications']:
        DataImport.add_classification(
            doc_number, cls.get('section', ''), cls.get('class', ''),
            cls.get('subclass', ''), cls.get('main-group', ''),
            cls.get('subgroup', ''), *dates)
    for asg in metadata['assignees']:
        DataImport.add_assignee(
            doc_number, asg.get('name', ''), asg.get('epo-number', ''),
            asg.get('reference', ''), asg.get('cross-reference', ''), *dates)
    print('------------------------------------------------')
    print('\n')
def get_train_test_comment_classes():
    """Split the toxic-comment category labels into train and test parts.

    Returns:
        (train_classes, test_classes): label frames selected by the shared
        split indices, in that order.
    """
    full_set = di.load_train_data()
    train_idx, test_idx = split_data(full_set)
    labels = full_set[di.CATEGORIES]
    return labels.loc[train_idx], labels.loc[test_idx]
def main():
    """Data import and analysis is performed using the two kernels.

    The chosen kernel (sys.argv[2], "twed" or anything else for GERP) is run
    over six time splits and the optimal model parameters are appended to a
    text file: TWED rows carry five fields (C, S, acc, Nu, Lambda), GERP rows
    three (C, S, acc).

    Fixes over the original: the output file is managed with a `with` block so
    it is closed even if model selection raises, and the two near-identical
    kernel branches are collapsed into one loop parameterized by field count.
    """
    wb = xl.open_workbook(sys.argv[1])  # retrieve dataset from the excel file
    S = wb.sheet_by_index(0)  # dataset sheet
    allData = DI.get_patient_ts(S, 7, 6)  # convert imported data to our data structure
    allData = DI.remove_missing_tp(allData, S, 7, 6)  # drop missing time points
    normalizeValues(allData)  # normalize data in place
    kernel = sys.argv[2]
    if kernel == "twed":
        out_name, n_fields = "analysisTWED.txt", 5
    else:
        out_name, n_fields = "analysisGERP.txt", 3
    Analysis = {}  # keyed "<kernel>T<i>" -> tuple of optimal parameters
    # NOTE: "r+" requires the file to already exist and overwrites from the
    # start — kept as-is for backward compatibility with the original.
    with open(out_name, "r+") as f:
        for i in xrange(6):
            data = cv.gen_time_split_data(allData, i)  # split the time series data
            key = kernel + "T" + str(i)
            Analysis[key] = chooseOptimalModel(data, kernel)  # model selection
            fields = " ".join(str(Analysis[key][j]) for j in range(n_fields))
            f.write(key + ": " + fields + "\n")
def plot_survival_by_class():
    """Bar-plot the Titanic survival rate per passenger class and save it."""
    frame = di.get_titanic_data()
    by_class = frame.pivot_table(index="Pclass", values="Survived")
    by_class.plot.bar()
    plt.title("Graph to Show Survival Rate by Class")
    plt.xlabel("Class")
    plt.ylabel("Survival Rate")
    plt.savefig("Graph to Show Survival Rate by Class")
def get_train_test_comment_vectors():
    """Vectorize the toxic comments and split the vectors into train/test.

    Returns:
        (train_vectors, test_vectors) selected by the shared split indices.
    """
    full_set = di.load_train_data()
    vectors = vc.get_comment_vectors(full_set)
    train_idx, test_idx = split_data(full_set)
    return vectors[train_idx], vectors[test_idx]
def plot_survival_by_gender():
    """Bar-plot the Titanic survival rate by gender and save the figure."""
    frame = di.get_titanic_data()
    by_sex = frame.pivot_table(index="Sex", values="Survived")
    by_sex.plot.bar()
    plt.title("Graph to Show Survival Rate by Gender")
    plt.xlabel("Gender")
    plt.ylabel("Survival Rate")
    plt.savefig("Graph to Show Survival Rate by Gender")
def plot_feature_heatmap():
    """Draw and save a Spearman-correlation heatmap of the Titanic features."""
    frame = di.get_titanic_data()
    plt.figure(figsize=(14, 12))
    spearman = frame.corr(method='spearman')
    sns.heatmap(spearman, square=True, annot=True)
    plt.title("Heatmap of Correlation of Different Features", size=24)
    plt.xlabel("Feature", size=20)
    plt.ylabel("Feature", size=20)
    plt.savefig("Heatmap of Correlation of Different Features")
def create_document():
    """Create a test document in the database and return its node properties.

    Document number, kind and country come from the generator helpers; every
    date field uses get_date().
    """
    kind = get_kind()
    number = get_doc_number()
    country = get_country()
    # dnum is the composite identifier: country code + number + kind code.
    dnum = country + number + kind
    print('creating document: ' + number)
    record = DataImport.create_document('Test', '?', dnum, number, kind,
                                        get_date(), 'n', country, '', '',
                                        get_date(), get_date(), get_date())
    return record.data()[0]['doc'].properties
def plot_survival_by_age():
    """Overlay age histograms of survivors (green) and casualties (red), save."""
    frame = di.get_titanic_data()
    lived = frame[frame["Survived"] == 1]
    lost = frame[frame["Survived"] == 0]
    lived["Age"].plot.hist(alpha=0.5, color="green", bins=50)
    lost["Age"].plot.hist(alpha=0.5, color="red", bins=50)
    plt.title("Graph to Show Survival Rate by Age")
    plt.xlabel("Age (years)")
    plt.ylabel("Survival Rate")
    plt.legend(["Survived", "Died"])
    plt.savefig("Graph to Show Survival Rate by Age")
def plot_survival_by_fare():
    """KDE plot of fares for survivors vs. non-survivors, saved to disk."""
    frame = di.get_titanic_data()
    plt.figure(figsize=(15, 8))
    lost_fares = frame.loc[(frame["Survived"] == 0), "Fare"]
    lived_fares = frame.loc[(frame["Survived"] == 1), "Fare"]
    sns.kdeplot(lost_fares, color="gray", shade=True, label="Not Survived")
    sns.kdeplot(lived_fares, color="g", shade=True, label="Survived")
    plt.title("Graph to Show Survival Rate by Fare")
    plt.xlabel("Fare")
    plt.ylabel("Frequency of Passengers")
    plt.savefig("Graph to Show Survival Rate by Fare")
def main(): wb = xl.open_workbook("Dataset_S1.xls") S = wb.sheet_by_index(0) allData = DI.get_patient_ts(S, 7, 6) #for (label, time, patient,ts) in allData: # if(patient == str(1185163.0)): # print ts allData = DI.remove_missing_tp(allData, S, 7, 6) #for (label, time, patient,ts) in allData: #if(patient == str(1185163.0)): #print ts #print allData[0] seg_data = cv.gen_time_split_data(allData, 4, 2, 5) for val in seg_data: (train, test) = val print("This is new iteration") for (label, time, patient, ts) in train: print label, time, patient, len(ts) print("This is new dataset") for (label, time, patient, ts) in test: print label, time, patient, len(ts)
def main(): wb = xl.open_workbook("Dataset_S1.xls") S = wb.sheet_by_index(0) allData = DI.get_patient_ts(S, 7, 6) #for (label, time, patient,ts) in allData: # if(patient == str(1185163.0)): # print ts allData = DI.remove_missing_tp(allData, S, 7, 6) #for (label, time, patient,ts) in allData: #if(patient == str(1185163.0)): #print ts #print allData[0] seg_data = cv.gen_time_split_data(allData, 4, 2, 5) for val in seg_data: (train, test) = val print ("This is new iteration") for(label, time, patient, ts) in train: print label, time, patient, len(ts) print("This is new dataset") for(label, time, patient, ts) in test: print label, time, patient, len(ts)
def main():
    """Data import and analysis is performed using the two kernels.

    The chosen kernel (sys.argv[2], "twed" or anything else for GERP) is run
    over six time splits and the optimal model parameters are written to a
    text file: TWED rows carry five fields (C, S, acc, Nu, Lambda), GERP rows
    three (C, S, acc).

    Fixes over the original: the output file is managed with a `with` block so
    it is closed even if model selection raises, and the two near-identical
    kernel branches are collapsed into one loop parameterized by field count.
    """
    wb = xl.open_workbook(sys.argv[1])  # retrieve dataset from the excel file
    S = wb.sheet_by_index(0)  # dataset sheet
    allData = DI.get_patient_ts(S, 7, 6)  # convert imported data to our data structure
    allData = DI.remove_missing_tp(allData, S, 7, 6)  # drop missing time points
    normalizeValues(allData)  # normalize data in place
    kernel = sys.argv[2]
    if kernel == "twed":
        out_name, n_fields = "analysisTWED.txt", 5
    else:
        out_name, n_fields = "analysisGERP.txt", 3
    Analysis = {}  # keyed "<kernel>T<i>" -> tuple of optimal parameters
    # NOTE: "r+" requires the file to already exist and overwrites from the
    # start — kept as-is for backward compatibility with the original.
    with open(out_name, "r+") as f:
        for i in xrange(6):
            data = cv.gen_time_split_data(allData, i)  # split the time series data
            key = kernel + "T" + str(i)
            Analysis[key] = chooseOptimalModel(data, kernel)  # model selection
            fields = " ".join(str(Analysis[key][j]) for j in range(n_fields))
            f.write(key + ": " + fields + "\n")
def create_unit(self):
    """Build a generic DataImport.Unit from the dialog's current widget state.

    Returns:
        The newly constructed unit.
    """
    info = {'faction': str(self.faction_select.currentText())}
    faction = self.unit_data.factions[info['faction']]
    if faction:
        # Display fields are derived from the selected faction when present.
        info['name'] = faction.unit_name
        info['faction_icon'] = faction.faction_icon
        info['desc'] = faction.desc
    info['level'] = int(self.level.value())
    info['gender'] = int(self.gender.value())
    info['klass'] = str(self.class_box.currentText())
    info['items'] = self.getItems()
    info['ai'] = self.get_ai()
    info['ai_group'] = str(self.ai_group.text())
    info['team'] = str(self.team_box.currentText())
    info['generic'] = True
    info['mode'] = self.get_modes()
    return DataImport.Unit(info)
def create_unit(self):
    """Build a generic reinforcement DataImport.Unit from the dialog state.

    The event_id is auto-assigned as the next free id within the chosen
    reinforcement pack.

    Returns:
        The newly constructed unit.
    """
    info = {'faction': str(self.faction_select.currentText())}
    faction = self.unit_data.factions[info['faction']]
    if faction:
        # Display fields are derived from the selected faction when present.
        info['name'] = faction.unit_name
        info['faction_icon'] = faction.faction_icon
        info['desc'] = faction.desc
    info['level'] = int(self.level.value())
    info['gender'] = int(self.gender.value())
    info['klass'] = str(self.class_box.currentText())
    info['items'] = self.getItems()
    info['ai'] = self.get_ai()
    info['ai_group'] = str(self.ai_group.text())
    info['pack'] = str(self.pack.text())
    same_pack = [rein for rein in self.unit_data.reinforcements
                 if rein.pack == info['pack']]
    info['event_id'] = EditorUtilities.next_available_event_id(same_pack)
    info['team'] = str(self.team_box.currentText())
    info['generic'] = True
    info['mode'] = self.get_modes()
    return DataImport.Unit(info)
#!/usr/bin/env python # -*- coding: utf-8 -*- import DataCleaning import DataExploration import DataImport import Prediction # Carreguem les dades test_ds = DataImport.import_test_data() train_ds = DataImport.import_train_data() # Explorem la relació de les columnes amb la variable Survived # DataExploration.show_survive_relation_by_feature(train_ds, 'Sex') # DataExploration.show_survive_relation_by_feature(train_ds, 'Age') # DataExploration.show_scatter_plot_by_features(train_ds, 'Fare', 'Age') combined_ds = DataImport.combine_datasets(train_ds, test_ds) # Fem una exploració de les dades per a veure si tenim valors nulls DataExploration.explore_dataset(combined_ds) combined_ds = DataExploration.get_titles(combined_ds) # Netejem les dades dataCleaningObj = DataCleaning.DataCleaning(combined_ds) combined_ds = dataCleaningObj.clean() # Test Anderson DataExploration.anderson_darling_test(combined_ds['Age']) DataExploration.anderson_darling_test(combined_ds['Fare']) DataExploration.anderson_darling_test(combined_ds['Sex']) DataExploration.anderson_darling_test(combined_ds['Family'])
def test_headDataFrame(self):
    """headDataFrame() must return a pandas DataFrame after conversion.

    Bug fix: the original asserted
    assertIsInstance(type(result), type(pandas.core.frame.DataFrame)),
    i.e. "the type of the result is an instance of the metaclass" — true for
    any result whatsoever, so the test could never fail. Now the returned
    value itself is checked against the DataFrame class.
    """
    self.fileType = 'excel'
    importer = DI.DataImport(self.testFile, self.fileType)
    importer.convertToDataframe()
    self.assertIsInstance(importer.headDataFrame(), pandas.core.frame.DataFrame)
def test_correctFileImporter(self):
    """convertToDataframe() returns None (success) for a supported file type."""
    self.fileType = 'excel'
    worker = DI.DataImport(self.testFile, self.fileType)
    self.assertIsNone(worker.convertToDataframe())
def test_failingFileImporter(self):
    """An unrecognized file type makes convertToDataframe() raise."""
    worker = DI.DataImport(self.testFile, 'bla_filetype.plz')
    with self.assertRaises(Exception):
        worker.convertToDataframe()
def test_fullDataPath(self):
    """fullDataPath() yields the resolved path as a string."""
    worker = DI.DataImport(self.testFile)
    self.assertIsInstance(worker.fullDataPath(), str)
def test_checkFileExists(self):
    """checkIFFileExists() reports True for the fixture file."""
    worker = DI.DataImport(self.testFile)
    self.assertTrue(worker.checkIFFileExists())
def test_importFileName(self):
    """importFile() returns the imported file name as a string."""
    worker = DI.DataImport(self.testFile)
    self.assertIsInstance(worker.importFile(), str)
def _to_database(typeof, metadata):
    """Store a parsed US patent document under both its publication and
    application reference numbers, along with citations, assignees and
    classifications.

    Args:
        typeof: document collection/type label forwarded to DataImport.
        metadata: parser output dict; see the key reads below for the schema.
    """
    print('------------------------------------------------')
    pprint(metadata)
    # Read every required key up front so a malformed record fails before
    # any database writes happen.
    date_publ = metadata['date-publ']
    status = metadata['status']
    title = metadata['title']
    abstract = metadata['abstract']
    claims = metadata['claims']
    citations = metadata['citations']
    assignees = metadata['assignees']
    classifications = metadata['classifications']
    description = metadata['description']
    pub_ref = metadata['publication-reference']
    app_ref = metadata['application-reference']
    pub_num = pub_ref.get('doc-number', '')
    app_num = app_ref.get('doc-number', '')
    DataImport.create_us_document(
        typeof, 'publication', '', pub_num, pub_ref.get('kind', ''),
        date_publ, status, pub_ref.get('country', ''), title, abstract,
        '', '', '', claims, description)
    # Each relation is recorded twice: once per reference number.
    for citation in citations:
        for num in (pub_num, app_num):
            DataImport.add_citation(
                num, citation.get('doc-number', ''), citation.get('country', ''),
                citation.get('kind', ''), date_publ, '', '', '')
    # TODO: can we rely on assignees alone, or should we test and use
    # parties/applicants if needed?
    for assignee in assignees:
        for num in (pub_num, app_num):
            DataImport.add_assignee(num, assignee.get('orgname', ''),
                                    '', '', '', date_publ, '', '', '')
    for classification in classifications:
        for num in (pub_num, app_num):
            DataImport.add_classification(
                num,
                classification.get('section', ''),
                classification.get('class', ''),
                classification.get('subclass', ''),
                classification.get('main-group', ''),
                classification.get('subgroup', ''),
                date_publ, '', '', '')
    print('------------------------------------------------')
    print('\n')
def import_halo(self, filez):
    """Build a di.Halo from the given file list and store it as self.halo."""
    self.halo = di.Halo(filez=filez)
def build_train_set() -> list:
    """Assemble the Titanic training features and labels.

    Returns:
        A two-element list [all_X, all_y]: the feature frame (COLUMN_NAMES
        columns) and the 'Survived' label series.
    """
    cleaned = di.get_clean_train_data()
    return [cleaned[COLUMN_NAMES], cleaned['Survived']]
def create_assignee(assigned_document):
    """Attach a test assignee record to the given document and return it.

    The assignee name comes from get_assignee(); the EPO number and the two
    reference fields are placeholders, and every date uses get_date().
    """
    doc_number = assigned_document['doc_number']
    DataImport.add_assignee(doc_number, get_assignee(), '12345', '?', '?',
                            get_date(), get_date(), get_date(), get_date())
    return assigned_document
def create_unit_from_legend(self, legend, mode):
    """Instantiate a generic DataImport.Unit from a raw level-legend row.

    Args:
        legend: dict of raw string fields from the level file; keys read:
            'team', 'event_id', 'class', 'level', 'position', 'faction',
            'items', 'ai', 'extra_status'.
        mode: game-mode tag stored on the unit.

    Side effects: increments GC.U_ID and appends the new unit to either
    self.reinforcements (event_id != '0') or self.units.
    """
    GC.U_ID += 1
    u_i = {}
    u_i['id'] = GC.U_ID
    u_i['team'] = legend['team']
    # event_id encodes "<pack>_<index>"; a bare non-'0' value is a pack with
    # index 1, and '0' means the unit starts on the board (no pack).
    if '_' in legend['event_id']:
        u_i['pack'], u_i['event_id'] = legend['event_id'].split('_')
        u_i['event_id'] = int(u_i['event_id'])
    elif legend['event_id'] != '0':
        u_i['pack'] = legend['event_id']
        u_i['event_id'] = 1
    else:
        u_i['pack'] = None
        u_i['event_id'] = 0
    # A trailing 'F' on the class name marks a female unit.
    if legend['class'].endswith('F'):
        legend['class'] = legend['class'][:-1]  # strip off the F
        u_i['gender'] = 5  # Default female gender is 5
    else:
        u_i['gender'] = 0  # Default male gender is 0
    # Promotion chain is comma-separated; the last entry is the current class.
    classes = legend['class'].split(',')
    u_i['klass'] = classes[-1]
    # Give default previous class
    # default_previous_classes(u_i['klass'], classes, class_dict)
    if legend['level'].startswith('f'):
        legend['level'] = legend['level'][1:]  # Remove f at the beginning
    u_i['level'] = int(legend['level'])  # Doesn't need force_fixed since it is fixed by default in LevelEditor
    # Position is "x,y"; anything without a comma means "not placed".
    u_i['position'] = tuple([int(num) for num in legend['position'].split(',')]) if ',' in legend['position'] else None
    u_i['faction'] = legend['faction']
    faction = self.factions[u_i['faction']]
    u_i['name'] = faction.unit_name
    u_i['faction_icon'] = faction.faction_icon
    u_i['desc'] = faction.desc
    stats, u_i['growths'], u_i['growth_points'], u_i['items'], u_i['wexp'] = \
        self.get_unit_info(Data.class_dict, u_i['klass'], u_i['level'], legend['items'])
    u_i['stats'] = self.build_stat_dict(stats)
    u_i['tags'] = Data.class_dict[u_i['klass']]['tags']
    # The ai field may carry an optional "_<group>" suffix.
    if '_' in legend['ai']:
        u_i['ai'], u_i['ai_group'] = legend['ai'].split('_')
    else:
        u_i['ai'], u_i['ai_group'] = legend['ai'], None
    u_i['movement_group'] = Data.class_dict[u_i['klass']]['movement_group']
    u_i['skills'] = []
    u_i['generic'] = True
    u_i['mode'] = mode
    cur_unit = DataImport.Unit(u_i)
    # Reposition units
    cur_unit.position = u_i['position']
    if legend['event_id'] != '0':
        # Unit does not start on board
        self.reinforcements.append(cur_unit)
    else:
        # Unit does start on board
        self.units.append(cur_unit)
    # Status Effects and Skills
    # get_skills(class_dict, cur_unit, classes, u_i['level'], gameStateObj, feat=False)
    # Extra Skills
    cur_unit.extra_statuses = legend['extra_status']
def import_tree(self, fname, filetype='default', keyword='', maxsnap=0, filez=None):
    """Load a merger tree via di.MergerTree and store it as self.tree.

    Args:
        fname: path of the tree file to read.
        filetype: reader variant understood by di.MergerTree.
        keyword: optional selection keyword forwarded to the reader.
        maxsnap: snapshot limit forwarded to the reader (0 presumably means
            the reader's default — TODO confirm against di.MergerTree).
        filez: optional explicit file list forwarded to the reader
            (presumably overrides fname-based discovery; verify).
    """
    self.tree = di.MergerTree(fname, filetype=filetype, keyword=keyword, maxsnap=maxsnap, filez=filez)
def load_train_data():
    """Thin wrapper: delegate to di.load_train_data() and return its result."""
    return di.load_train_data()
# NOTE(review): this chunk references core_gbk_folder, mongo_init,
# reset_database, DataImport and KvasirBlast, which are defined/imported
# earlier in the script (outside this view). Python 2 print statements.
other_gbk_folder = os.path.abspath('data/input/other/')
# Each run gets a timestamped experiment name used as the database name.
exp_name = datetime.datetime.now().strftime('%Y%m%d%H%M')
mongo_init(exp_name)
reset_database(exp_name)
# All subsequent output is written under data/output/<exp_name>/.
new_folder = 'data/output/{0}/'.format(exp_name)
if not os.path.isdir(new_folder):
    os.makedirs(new_folder)
os.chdir(new_folder)
print 'Checking and importing core genomes...'
for the_file in os.listdir(core_gbk_folder):
    path_to_file = '{0}/{1}'.format(core_gbk_folder, the_file)
    DataImport.import_file(path_to_file, 'core')
print 'Checking and importing other genomes...'
for the_file in os.listdir(other_gbk_folder):
    path_to_file = '{0}/{1}'.format(other_gbk_folder, the_file)
    DataImport.import_file(path_to_file, 'other')
# Build BLAST databases for both genome sets, then search at 90% identity
# and load the hits into the database (other identity cutoffs left disabled).
KvasirBlast.make_blast_db('core')
KvasirBlast.make_blast_db('other')
KvasirBlast.core_hgt_blast(perc_identity='90')
# KvasirBlast.core_hgt_blast(perc_identity='95')
# KvasirBlast.core_hgt_blast(perc_identity='99')
KvasirBlast.blast_to_db(perc_identity='90')
# KvasirBlast.blast_to_db(perc_identity='95')