Example #1
def create_citation(citer_document, cited_document):
    print('creating citation: ' + citer_document['doc_number'] + '->' +
          cited_document['doc_number'])
    DataImport.add_citation(
        citer_document['doc_number'], cited_document['doc_number'],
        citer_document['country'], citer_document['kind'],
        citer_document['datepublished'], citer_document['filedate'],
        citer_document['issuedate'], citer_document['prioritydate'])
Example #2
def get_lr_model_test_accuracy() -> float:
    titanic_test_set = di.get_clean_test_data()
    titanic_test_X = titanic_test_set[COLUMN_NAMES]
    titanic_test_y = di.get_titanic_test_results()['Survived']
    lr_model = get_lr_model()
    predictions = lr_model.predict(titanic_test_X)
    save_results_in_csv(predictions, titanic_test_set)
    return accuracy_score(titanic_test_y, predictions)
Example #3
def plot_survival_by_age_category():
    train_data = di.get_titanic_data()
    train_data = di.filter_age(train_data)
    age_categories_pivot = train_data.pivot_table(index="Age_category",
                                                  values="Survived")
    age_categories_pivot.plot.bar()
    plt.title("Graph to Show Survival Rate by Age Category")
    plt.xlabel("Age Group")
    plt.ylabel("Survival Rate")
    plt.savefig("Graph to Show Survival Rate by Age Category")
Example #4
def to_database(metadata):
    print('------------------------------------------------')
    pprint(metadata)

    dnum = metadata['dnum']
    doc_number = metadata['doc-number']
    kind = metadata['kind']
    date_publ = metadata['date-publ']
    status = metadata['status']
    country = metadata['country']
    abstract = metadata['abstract']
    date_file = metadata['date-file']
    date_issue = metadata['date-issue']
    date_priority = metadata['date-priority']
    citations = metadata['citations']
    classifications = metadata['classifications']
    assignees = metadata['assignees']

    DataImport.create_document('FullText', '?', dnum, doc_number, kind,
                               date_publ, status, country, '', abstract,
                               date_file, date_issue, date_priority)

    for citation in citations:
        DataImport.add_citation(doc_number, citation.get('doc-number', ''),
                                citation.get('country', ''),
                                citation.get('kind', ''),
                                date_publ, date_file, date_issue, date_priority)

    for classification in classifications:
        DataImport.add_classification(doc_number,
                                      classification.get('section', ''),
                                      classification.get('class', ''),
                                      classification.get('subclass', ''),
                                      classification.get('main-group', ''),
                                      classification.get('subgroup', ''),
                                      date_publ, date_file, date_issue,
                                      date_priority)

    for assignee in assignees:
        DataImport.add_assignee(doc_number, assignee.get('name', ''),
                                assignee.get('epo-number', ''),
                                assignee.get('reference', ''),
                                assignee.get('cross-reference', ''),
                                date_publ, date_file, date_issue, date_priority)

    print('------------------------------------------------')
    print('\n')
Example #5
def get_train_test_comment_classes():
    toxic_comments_train_set = di.load_train_data()
    train_idx, test_idx = split_data(toxic_comments_train_set)

    train_classes = toxic_comments_train_set[di.CATEGORIES].loc[train_idx]
    test_classes = toxic_comments_train_set[di.CATEGORIES].loc[test_idx]
    return train_classes, test_classes
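Examples #5 and #8 rely on a split_data helper that is not shown. A minimal sketch of what it might do, assuming it simply splits the training frame's index into train and test indices (the real helper may differ), is:

from sklearn.model_selection import train_test_split

def split_data(df, test_size=0.2, seed=42):
    # Hypothetical sketch: return (train_idx, test_idx) over the frame's index.
    return train_test_split(df.index, test_size=test_size, random_state=seed)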
Example #6
def main():
    """Data import and analysis are performed using the two kernels. The
    analysis is stored in a text file."""

    wb = xl.open_workbook(sys.argv[1])  # retrieve the dataset from the Excel file
    S = wb.sheet_by_index(0)  # dataset sheet
    allData = DI.get_patient_ts(S, 7, 6)  # convert the imported data into our time-series structure
    allData = DI.remove_missing_tp(allData, S, 7, 6)  # remove missing time points

    normalizeValues(allData)  # normalize the data

    kernel = sys.argv[2]
    if kernel == "twed":
        # Analysis for the TWED kernel. Keys have the form "twedT1"; each value
        # is the most optimal parameters as a tuple of (C, S, acc, Nu, Lambda).
        Analysis = {}
        f = open("analysisTWED.txt", "r+")
        for i in range(6):
            data = cv.gen_time_split_data(allData, i)  # split the time series data
            key = kernel + "T" + str(i)
            Analysis[key] = chooseOptimalModel(data, kernel)  # perform model selection
            f.write(key + ":   " + "   ".join(str(v) for v in Analysis[key][:5]) + "\n")
        f.close()
    else:
        # Analysis for the GERP kernel. Each value is the most optimal
        # parameters as a tuple of (C, S, acc).
        Analysis = {}
        f = open("analysisGERP.txt", "r+")
        for i in range(6):
            data = cv.gen_time_split_data(allData, i)
            key = kernel + "T" + str(i)
            Analysis[key] = chooseOptimalModel(data, kernel)  # perform model selection
            f.write(key + ":   " + "   ".join(str(v) for v in Analysis[key][:3]) + "\n")
        f.close()
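One caveat in the snippet above: open(..., "r+") raises an error if the analysis file does not already exist, and it never truncates leftover content from earlier runs. If create-or-overwrite semantics are acceptable, a context manager is the safer pattern (a hypothetical variant of the same write loop):

with open("analysisTWED.txt", "w") as f:
    for key, params in Analysis.items():
        f.write(key + ":   " + "   ".join(str(v) for v in params) + "\n")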
Example #7
def plot_survival_by_class():
    train_data = di.get_titanic_data()
    pclass_table = train_data.pivot_table(index="Pclass", values="Survived")
    pclass_table.plot.bar()
    plt.title("Graph to Show Survival Rate by Class")
    plt.xlabel("Class")
    plt.ylabel("Survival Rate")
    plt.savefig("Graph to Show Survival Rate by Class")
Example #8
def get_train_test_comment_vectors():
    toxic_comments_train_set = di.load_train_data()
    comment_vectors = vc.get_comment_vectors(toxic_comments_train_set)
    train_idx, test_idx = split_data(toxic_comments_train_set)

    train_vectors = comment_vectors[train_idx]
    test_vectors = comment_vectors[test_idx]
    return train_vectors, test_vectors
Example #9
def plot_survival_by_gender():
    train_data = di.get_titanic_data()
    gender_table = train_data.pivot_table(index="Sex", values="Survived")
    gender_table.plot.bar()
    plt.title("Graph to Show Survival Rate by Gender")
    plt.xlabel("Gender")
    plt.ylabel("Survival Rate")
    plt.savefig("Graph to Show Survival Rate by Gender")
Example #10
def plot_feature_heatmap():
    train_data = di.get_titanic_data()
    plt.figure(figsize=(14, 12))
    correlation_matrix = train_data.corr(method='spearman')
    sns.heatmap(correlation_matrix, square=True, annot=True)
    plt.title("Heatmap of Correlation of Different Features", size=24)
    plt.xlabel("Feature", size=20)
    plt.ylabel("Feature", size=20)
    plt.savefig("Heatmap of Correlation of Different Features")
Example #11
def create_document():
    kind = get_kind()
    doc_number = get_doc_number()
    country = get_country()
    dnum = country + doc_number + kind
    print('creating document: ' + doc_number)
    document = DataImport.create_document('Test', '?', dnum, doc_number, kind,
                                          get_date(), 'n', country, '', '',
                                          get_date(), get_date(), get_date())
    return document.data()[0]['doc'].properties
Example #12
def plot_survival_by_age():
    train_data = di.get_titanic_data()
    survived = train_data[train_data["Survived"] == 1]
    died = train_data[train_data["Survived"] == 0]
    survived["Age"].plot.hist(alpha=0.5, color="green", bins=50)
    died["Age"].plot.hist(alpha=0.5, color="red", bins=50)
    plt.title("Graph to Show Survival Rate by Age")
    plt.xlabel("Age (years)")
    plt.ylabel("Survival Rate")
    plt.legend(["Survived", "Died"])
    plt.savefig("Graph to Show Survival Rate by Age")
Example #13
def plot_survival_by_fare():
    train_data = di.get_titanic_data()
    plt.figure(figsize=(15, 8))
    died_fare = train_data.loc[(train_data["Survived"] == 0), "Fare"]
    survived_fare = train_data.loc[(train_data["Survived"] == 1), "Fare"]
    sns.kdeplot(died_fare, color="gray", shade=True, label="Not Survived")
    sns.kdeplot(survived_fare, color="g", shade=True, label="Survived")
    plt.title("Graph to Show Survival Rate by Fare")
    plt.xlabel("Fare")
    plt.ylabel("Frequency of Passengers")
    plt.savefig("Graph to Show Survival Rate by Fare")
Example #14
def main():
    wb = xl.open_workbook("Dataset_S1.xls")
    S = wb.sheet_by_index(0)
    allData = DI.get_patient_ts(S, 7, 6)
    #for (label, time, patient, ts) in allData:
    #    if patient == str(1185163.0):
    #        print(ts)
    allData = DI.remove_missing_tp(allData, S, 7, 6)
    #print(allData[0])
    seg_data = cv.gen_time_split_data(allData, 4, 2, 5)
    for (train, test) in seg_data:
        print("This is a new iteration")
        for (label, time, patient, ts) in train:
            print(label, time, patient, len(ts))
        print("This is a new dataset")
        for (label, time, patient, ts) in test:
            print(label, time, patient, len(ts))
Example #17
 def create_unit(self):
     info = {}
     info['faction'] = str(self.faction_select.currentText())
     faction = self.unit_data.factions[info['faction']]
     if faction:
         info['name'] = faction.unit_name
         info['faction_icon'] = faction.faction_icon
         info['desc'] = faction.desc
     info['level'] = int(self.level.value())
     info['gender'] = int(self.gender.value())
     info['klass'] = str(self.class_box.currentText())
     info['items'] = self.getItems()
     info['ai'] = self.get_ai()
     info['ai_group'] = str(self.ai_group.text())
     info['team'] = str(self.team_box.currentText())
     info['generic'] = True
     info['mode'] = self.get_modes()
     created_unit = DataImport.Unit(info)
     return created_unit
Example #18
 def create_unit(self):
     info = {}
     info['faction'] = str(self.faction_select.currentText())
     faction = self.unit_data.factions[info['faction']]
     if faction:
         info['name'] = faction.unit_name
         info['faction_icon'] = faction.faction_icon
         info['desc'] = faction.desc
     info['level'] = int(self.level.value())
     info['gender'] = int(self.gender.value())
     info['klass'] = str(self.class_box.currentText())
     info['items'] = self.getItems()
     info['ai'] = self.get_ai()
     info['ai_group'] = str(self.ai_group.text())
     info['pack'] = str(self.pack.text())
     info['event_id'] = EditorUtilities.next_available_event_id([rein for rein in self.unit_data.reinforcements if rein.pack == info['pack']])
     info['team'] = str(self.team_box.currentText())
     info['generic'] = True
     info['mode'] = self.get_modes()
     created_unit = DataImport.Unit(info)
     return created_unit
Example #19
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import DataCleaning
import DataExploration
import DataImport
import Prediction

# Load the data
test_ds = DataImport.import_test_data()
train_ds = DataImport.import_train_data()

# Explore how each column relates to the Survived variable
# DataExploration.show_survive_relation_by_feature(train_ds, 'Sex')
# DataExploration.show_survive_relation_by_feature(train_ds, 'Age')
# DataExploration.show_scatter_plot_by_features(train_ds, 'Fare', 'Age')
combined_ds = DataImport.combine_datasets(train_ds, test_ds)

# Explore the data to check for null values
DataExploration.explore_dataset(combined_ds)
combined_ds = DataExploration.get_titles(combined_ds)

# Clean the data
dataCleaningObj = DataCleaning.DataCleaning(combined_ds)
combined_ds = dataCleaningObj.clean()

# Anderson-Darling tests
DataExploration.anderson_darling_test(combined_ds['Age'])
DataExploration.anderson_darling_test(combined_ds['Fare'])
DataExploration.anderson_darling_test(combined_ds['Sex'])
DataExploration.anderson_darling_test(combined_ds['Family'])
Example #20
 def test_headDataFrame(self):
     self.fileType = 'excel'
     importer = DI.DataImport(self.testFile, self.fileType)
     importer.convertToDataframe()
     # Assert on the object itself; the original wrapped both sides in type(),
     # which made the assertion vacuous (type(x) is always an instance of type).
     self.assertIsInstance(importer.headDataFrame(),
                           pandas.core.frame.DataFrame)
Example #21
 def test_correctFileImporter(self):
     self.fileType = 'excel'
     importer = DI.DataImport(self.testFile, self.fileType)
     self.assertIsNone(importer.convertToDataframe())
Example #22
 def test_failingFileImporter(self):
     importer = DI.DataImport(self.testFile, 'bla_filetype.plz')
     with self.assertRaises(Exception):
         importer.convertToDataframe()
Example #23
 def test_fullDataPath(self):
     importer = DI.DataImport(self.testFile)
     self.assertIsInstance(importer.fullDataPath(), str)
Example #24
 def test_checkFileExists(self):
     importer = DI.DataImport(self.testFile)
     self.assertTrue(importer.checkIFFileExists())
Example #25
 def test_importFileName(self):
     importer = DI.DataImport(self.testFile)
     self.assertIsInstance(importer.importFile(), str)
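Examples #20 through #25 reference a self.testFile fixture that is never shown, presumably assigned in the suite's setUp. A minimal sketch, with a hypothetical path, would be:

 def setUp(self):
     # Hypothetical fixture; the real suite points this at a sample workbook.
     self.testFile = 'testdata/sample.xlsx'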
Example #26
def _to_database(typeof, metadata):
    print('------------------------------------------------')
    pprint(metadata)

    date_publ = metadata['date-publ']
    status = metadata['status']
    title = metadata['title']
    abstract = metadata['abstract']
    claims = metadata['claims']
    citations = metadata['citations']
    assignees = metadata['assignees']
    classifications = metadata['classifications']

    description = metadata['description']

    publication_reference = metadata['publication-reference']
    application_reference = metadata['application-reference']

    DataImport.create_us_document(typeof, 'publication', '',
                                  publication_reference.get('doc-number', ''),
                                  publication_reference.get('kind', ''),
                                  date_publ, status,
                                  publication_reference.get('country', ''),
                                  title, abstract, '', '', '', claims,
                                  description)
    #DataImport.create_document(typeof, 'application', '', application_reference.get('doc-number', ''), '', date_publ, status, application_reference.get('country', ''), title, abstract, '', '', '')

    for citation in citations:
        DataImport.add_citation(publication_reference.get('doc-number', ''),
                                citation.get('doc-number', ''),
                                citation.get('country', ''),
                                citation.get('kind', ''), date_publ, '', '',
                                '')
        DataImport.add_citation(application_reference.get('doc-number', ''),
                                citation.get('doc-number', ''),
                                citation.get('country', ''),
                                citation.get('kind', ''), date_publ, '', '',
                                '')

    for assignee in assignees:
        # TODO: can we rely on assignees alone, or should we test and use parties/applicants if needed?
        DataImport.add_assignee(publication_reference.get('doc-number', ''),
                                assignee.get('orgname', ''), '', '', '',
                                date_publ, '', '', '')
        DataImport.add_assignee(application_reference.get('doc-number', ''),
                                assignee.get('orgname', ''), '', '', '',
                                date_publ, '', '', '')

    for classification in classifications:
        DataImport.add_classification(
            publication_reference.get('doc-number', ''),
            classification.get('section', ''), classification.get('class', ''),
            classification.get('subclass', ''),
            classification.get('main-group', ''),
            classification.get('subgroup', ''), date_publ, '', '', '')
        DataImport.add_classification(
            application_reference.get('doc-number', ''),
            classification.get('section', ''), classification.get('class', ''),
            classification.get('subclass', ''),
            classification.get('main-group', ''),
            classification.get('subgroup', ''), date_publ, '', '', '')

    print('------------------------------------------------')
    print('\n')
Example #27
 def import_halo(self, filez):
     self.halo = di.Halo(filez=filez)
Example #28
def build_train_set() -> list:
    titanic_train_data = di.get_clean_train_data()
    all_X = titanic_train_data[COLUMN_NAMES]
    all_y = titanic_train_data['Survived']
    return [all_X, all_y]
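A hypothetical way to consume build_train_set, pairing it with the scikit-learn classifier that Example #2's get_lr_model presumably wraps:

from sklearn.linear_model import LogisticRegression  # assumed model family

all_X, all_y = build_train_set()
lr_model = LogisticRegression(max_iter=1000).fit(all_X, all_y)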
Example #29
def create_assignee(assigned_document):
    DataImport.add_assignee(assigned_document['doc_number'], get_assignee(),
                            '12345', '?', '?', get_date(), get_date(),
                            get_date(), get_date())
    return assigned_document
Example #30
    def create_unit_from_legend(self, legend, mode):
        GC.U_ID += 1

        u_i = {}
        u_i['id'] = GC.U_ID
        u_i['team'] = legend['team']
        if '_' in legend['event_id']:
            u_i['pack'], u_i['event_id'] = legend['event_id'].split('_')
            u_i['event_id'] = int(u_i['event_id'])
        elif legend['event_id'] != '0':
            u_i['pack'] = legend['event_id']
            u_i['event_id'] = 1
        else:
            u_i['pack'] = None
            u_i['event_id'] = 0
        if legend['class'].endswith('F'):
            legend['class'] = legend['class'][:-1]  # strip off the F
            u_i['gender'] = 5  # Default female gender is 5
        else:
            u_i['gender'] = 0  # Default male gender is 0
        classes = legend['class'].split(',')
        u_i['klass'] = classes[-1]
        # Give default previous class
        # default_previous_classes(u_i['klass'], classes, class_dict)

        if legend['level'].startswith('f'):
            legend['level'] = legend['level'][1:]  # Remove f at the beginning
        u_i['level'] = int(legend['level'])  # doesn't need force_fixed since it is fixed by default in LevelEditor
        u_i['position'] = tuple(int(num) for num in legend['position'].split(',')) \
            if ',' in legend['position'] else None

        u_i['faction'] = legend['faction']
        faction = self.factions[u_i['faction']]
        u_i['name'] = faction.unit_name
        u_i['faction_icon'] = faction.faction_icon
        u_i['desc'] = faction.desc

        stats, u_i['growths'], u_i['growth_points'], u_i['items'], u_i['wexp'] = \
            self.get_unit_info(Data.class_dict, u_i['klass'], u_i['level'], legend['items'])
        u_i['stats'] = self.build_stat_dict(stats)

        u_i['tags'] = Data.class_dict[u_i['klass']]['tags']
        if '_' in legend['ai']:
            u_i['ai'], u_i['ai_group'] = legend['ai'].split('_')
        else:
            u_i['ai'], u_i['ai_group'] = legend['ai'], None
        u_i['movement_group'] = Data.class_dict[u_i['klass']]['movement_group']
        u_i['skills'] = []
        u_i['generic'] = True
        u_i['mode'] = mode

        cur_unit = DataImport.Unit(u_i)

        # Reposition units
        cur_unit.position = u_i['position']
        if legend['event_id'] != '0':  # Unit does not start on board
            self.reinforcements.append(cur_unit)
        else:  # Unit does start on board
            self.units.append(cur_unit)

        # Status Effects and Skills
        # get_skills(class_dict, cur_unit, classes, u_i['level'], gameStateObj, feat=False)

        # Extra Skills
        cur_unit.extra_statuses = legend['extra_status']
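For reference, a legend dict that this parser accepts could look like the following; every value is hypothetical, inferred from the parsing logic above:

legend = {'team': 'enemy', 'event_id': 'wave_2', 'class': 'Myrmidon,SwordmasterF',
          'level': 'f5', 'position': '3,12', 'faction': 'Bandits',
          'items': 'Iron Sword', 'ai': 'Attack_1', 'extra_status': []}
# -> pack 'wave', event_id 2, a female level-5 Swordmaster placed at (3, 12)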
Example #31
 def import_tree(self, fname, filetype='default', keyword='', maxsnap=0,
                 filez=None):
     self.tree = di.MergerTree(fname, filetype=filetype, keyword=keyword,
                               maxsnap=maxsnap, filez=filez)
Example #32
def load_train_data():
    return di.load_train_data()
Example #33
core_gbk_folder = os.path.abspath('data/input/core/')  # assumed: the snippet starts mid-script; defined by analogy with the "other" folder below
other_gbk_folder = os.path.abspath('data/input/other/')
exp_name = datetime.datetime.now().strftime('%Y%m%d%H%M')

mongo_init(exp_name)
reset_database(exp_name)

new_folder = 'data/output/{0}/'.format(exp_name)
if not os.path.isdir(new_folder):
    os.makedirs(new_folder)
os.chdir(new_folder)

print('Checking and importing core genomes...')

for the_file in os.listdir(core_gbk_folder):
    path_to_file = '{0}/{1}'.format(core_gbk_folder, the_file)
    DataImport.import_file(path_to_file, 'core')

print('Checking and importing other genomes...')

for the_file in os.listdir(other_gbk_folder):
    path_to_file = '{0}/{1}'.format(other_gbk_folder, the_file)
    DataImport.import_file(path_to_file, 'other')

KvasirBlast.make_blast_db('core')
KvasirBlast.make_blast_db('other')
KvasirBlast.core_hgt_blast(perc_identity='90')
# KvasirBlast.core_hgt_blast(perc_identity='95')
# KvasirBlast.core_hgt_blast(perc_identity='99')
KvasirBlast.blast_to_db(perc_identity='90')
# KvasirBlast.blast_to_db(perc_identity='95')