def create_citation(citer_document, cited_document):
    """Record a citation edge from a citing document to a cited one.

    Args:
        citer_document: dict for the citing document; keys used:
            'doc_number', 'country', 'kind', 'datepublished', 'filedate',
            'issuedate', 'prioritydate'.
        cited_document: dict for the cited document; only 'doc_number' is read.
    """
    citer, cited = citer_document, cited_document
    print('creating citation: ' + citer['doc_number'] + '->' + cited['doc_number'])
    DataImport.add_citation(
        citer['doc_number'],
        cited['doc_number'],
        citer['country'],
        citer['kind'],
        citer['datepublished'],
        citer['filedate'],
        citer['issuedate'],
        citer['prioritydate'])
def get_lr_model_test_accuracy() -> float:
    """Score the trained logistic-regression model on the Titanic test set.

    Side effect: the predictions are written to CSV via save_results_in_csv.

    Returns:
        Accuracy of the predictions against the known test outcomes.
    """
    test_set = di.get_clean_test_data()
    features = test_set[COLUMN_NAMES]
    truth = di.get_titanic_test_results()['Survived']
    model = get_lr_model()
    predicted = model.predict(features)
    save_results_in_csv(predicted, test_set)
    return accuracy_score(truth, predicted)
def plot_survival_by_age_category():
    """Bar-plot the Titanic survival rate per age category and save the figure."""
    frame = di.filter_age(di.get_titanic_data())
    pivot = frame.pivot_table(index="Age_category", values="Survived")
    pivot.plot.bar()
    plt.title("Graph to Show Survival Rate by Age Category")
    plt.xlabel("Age Group")
    plt.ylabel("Survival Rate")
    plt.savefig("Graph to Show Survival Rate by Age Category")
def to_database(metadata):
    """Persist a parsed patent document plus its citations, classifications
    and assignees via DataImport.

    Args:
        metadata: parser output dict; keys read: 'dnum', 'doc-number', 'kind',
            'date-publ', 'status', 'country', 'abstract', 'date-file',
            'date-issue', 'date-priority', 'citations', 'classifications',
            'assignees'.
    """
    print('------------------------------------------------')
    pprint(metadata)
    doc_number = metadata['doc-number']
    # The same four dates are forwarded to every DataImport call below.
    dates = (metadata['date-publ'], metadata['date-file'],
             metadata['date-issue'], metadata['date-priority'])
    DataImport.create_document(
        'FullText', '?', metadata['dnum'], doc_number, metadata['kind'],
        dates[0], metadata['status'], metadata['country'], '',
        metadata['abstract'], dates[1], dates[2], dates[3])
    for cit in metadata['citations']:
        DataImport.add_citation(
            doc_number, cit.get('doc-number', ''), cit.get('country', ''),
            cit.get('kind', ''), *dates)
    for cls in metadata['classifications']:
        DataImport.add_classification(
            doc_number, cls.get('section', ''), cls.get('class', ''),
            cls.get('subclass', ''), cls.get('main-group', ''),
            cls.get('subgroup', ''), *dates)
    for asg in metadata['assignees']:
        DataImport.add_assignee(
            doc_number, asg.get('name', ''), asg.get('epo-number', ''),
            asg.get('reference', ''), asg.get('cross-reference', ''), *dates)
    print('------------------------------------------------')
    print('\n')
def get_train_test_comment_classes():
    """Split the toxic-comment category labels into train and test parts.

    Returns:
        (train_classes, test_classes): label frames selected by the shared
        split indices, in that order.
    """
    full_set = di.load_train_data()
    train_idx, test_idx = split_data(full_set)
    labels = full_set[di.CATEGORIES]
    return labels.loc[train_idx], labels.loc[test_idx]
def main():
    """Data import and analysis is performed using the two kernels.

    The chosen kernel (sys.argv[2], "twed" or anything else for GERP) is run
    over six time splits and the optimal model parameters are appended to a
    text file: TWED rows carry five fields (C, S, acc, Nu, Lambda), GERP rows
    three (C, S, acc).

    Fixes over the original: the output file is managed with a `with` block so
    it is closed even if model selection raises, and the two near-identical
    kernel branches are collapsed into one loop parameterized by field count.
    """
    wb = xl.open_workbook(sys.argv[1])  # retrieve dataset from the excel file
    S = wb.sheet_by_index(0)  # dataset sheet
    allData = DI.get_patient_ts(S, 7, 6)  # convert imported data to our data structure
    allData = DI.remove_missing_tp(allData, S, 7, 6)  # drop missing time points
    normalizeValues(allData)  # normalize data in place
    kernel = sys.argv[2]
    if kernel == "twed":
        out_name, n_fields = "analysisTWED.txt", 5
    else:
        out_name, n_fields = "analysisGERP.txt", 3
    Analysis = {}  # keyed "<kernel>T<i>" -> tuple of optimal parameters
    # NOTE: "r+" requires the file to already exist and overwrites from the
    # start — kept as-is for backward compatibility with the original.
    with open(out_name, "r+") as f:
        for i in xrange(6):
            data = cv.gen_time_split_data(allData, i)  # split the time series data
            key = kernel + "T" + str(i)
            Analysis[key] = chooseOptimalModel(data, kernel)  # model selection
            fields = " ".join(str(Analysis[key][j]) for j in range(n_fields))
            f.write(key + ": " + fields + "\n")
def plot_survival_by_class():
    """Bar-plot the Titanic survival rate per passenger class and save it."""
    frame = di.get_titanic_data()
    by_class = frame.pivot_table(index="Pclass", values="Survived")
    by_class.plot.bar()
    plt.title("Graph to Show Survival Rate by Class")
    plt.xlabel("Class")
    plt.ylabel("Survival Rate")
    plt.savefig("Graph to Show Survival Rate by Class")
def get_train_test_comment_vectors():
    """Vectorize the toxic comments and split the vectors into train/test.

    Returns:
        (train_vectors, test_vectors) selected by the shared split indices.
    """
    full_set = di.load_train_data()
    vectors = vc.get_comment_vectors(full_set)
    train_idx, test_idx = split_data(full_set)
    return vectors[train_idx], vectors[test_idx]
def plot_survival_by_gender():
    """Bar-plot the Titanic survival rate by gender and save the figure."""
    frame = di.get_titanic_data()
    by_sex = frame.pivot_table(index="Sex", values="Survived")
    by_sex.plot.bar()
    plt.title("Graph to Show Survival Rate by Gender")
    plt.xlabel("Gender")
    plt.ylabel("Survival Rate")
    plt.savefig("Graph to Show Survival Rate by Gender")
def plot_feature_heatmap():
    """Draw and save a Spearman-correlation heatmap of the Titanic features."""
    frame = di.get_titanic_data()
    plt.figure(figsize=(14, 12))
    spearman = frame.corr(method='spearman')
    sns.heatmap(spearman, square=True, annot=True)
    plt.title("Heatmap of Correlation of Different Features", size=24)
    plt.xlabel("Feature", size=20)
    plt.ylabel("Feature", size=20)
    plt.savefig("Heatmap of Correlation of Different Features")
def create_document():
    """Create a test document in the database and return its node properties.

    Document number, kind and country come from the generator helpers; every
    date field uses get_date().
    """
    kind = get_kind()
    number = get_doc_number()
    country = get_country()
    # dnum is the composite identifier: country code + number + kind code.
    dnum = country + number + kind
    print('creating document: ' + number)
    record = DataImport.create_document('Test', '?', dnum, number, kind,
                                        get_date(), 'n', country, '', '',
                                        get_date(), get_date(), get_date())
    return record.data()[0]['doc'].properties
def plot_survival_by_age():
    """Overlay age histograms of survivors (green) and casualties (red), save."""
    frame = di.get_titanic_data()
    lived = frame[frame["Survived"] == 1]
    lost = frame[frame["Survived"] == 0]
    lived["Age"].plot.hist(alpha=0.5, color="green", bins=50)
    lost["Age"].plot.hist(alpha=0.5, color="red", bins=50)
    plt.title("Graph to Show Survival Rate by Age")
    plt.xlabel("Age (years)")
    plt.ylabel("Survival Rate")
    plt.legend(["Survived", "Died"])
    plt.savefig("Graph to Show Survival Rate by Age")
def plot_survival_by_fare():
    """KDE plot of fares for survivors vs. non-survivors, saved to disk."""
    frame = di.get_titanic_data()
    plt.figure(figsize=(15, 8))
    lost_fares = frame.loc[(frame["Survived"] == 0), "Fare"]
    lived_fares = frame.loc[(frame["Survived"] == 1), "Fare"]
    sns.kdeplot(lost_fares, color="gray", shade=True, label="Not Survived")
    sns.kdeplot(lived_fares, color="g", shade=True, label="Survived")
    plt.title("Graph to Show Survival Rate by Fare")
    plt.xlabel("Fare")
    plt.ylabel("Frequency of Passengers")
    plt.savefig("Graph to Show Survival Rate by Fare")
def main(): wb = xl.open_workbook("Dataset_S1.xls") S = wb.sheet_by_index(0) allData = DI.get_patient_ts(S, 7, 6) #for (label, time, patient,ts) in allData: # if(patient == str(1185163.0)): # print ts allData = DI.remove_missing_tp(allData, S, 7, 6) #for (label, time, patient,ts) in allData: #if(patient == str(1185163.0)): #print ts #print allData[0] seg_data = cv.gen_time_split_data(allData, 4, 2, 5) for val in seg_data: (train, test) = val print("This is new iteration") for (label, time, patient, ts) in train: print label, time, patient, len(ts) print("This is new dataset") for (label, time, patient, ts) in test: print label, time, patient, len(ts)
def main(): wb = xl.open_workbook("Dataset_S1.xls") S = wb.sheet_by_index(0) allData = DI.get_patient_ts(S, 7, 6) #for (label, time, patient,ts) in allData: # if(patient == str(1185163.0)): # print ts allData = DI.remove_missing_tp(allData, S, 7, 6) #for (label, time, patient,ts) in allData: #if(patient == str(1185163.0)): #print ts #print allData[0] seg_data = cv.gen_time_split_data(allData, 4, 2, 5) for val in seg_data: (train, test) = val print ("This is new iteration") for(label, time, patient, ts) in train: print label, time, patient, len(ts) print("This is new dataset") for(label, time, patient, ts) in test: print label, time, patient, len(ts)
def main():
    """Data import and analysis is performed using the two kernels.

    The chosen kernel (sys.argv[2], "twed" or anything else for GERP) is run
    over six time splits and the optimal model parameters are written to a
    text file: TWED rows carry five fields (C, S, acc, Nu, Lambda), GERP rows
    three (C, S, acc).

    Fixes over the original: the output file is managed with a `with` block so
    it is closed even if model selection raises, and the two near-identical
    kernel branches are collapsed into one loop parameterized by field count.
    """
    wb = xl.open_workbook(sys.argv[1])  # retrieve dataset from the excel file
    S = wb.sheet_by_index(0)  # dataset sheet
    allData = DI.get_patient_ts(S, 7, 6)  # convert imported data to our data structure
    allData = DI.remove_missing_tp(allData, S, 7, 6)  # drop missing time points
    normalizeValues(allData)  # normalize data in place
    kernel = sys.argv[2]
    if kernel == "twed":
        out_name, n_fields = "analysisTWED.txt", 5
    else:
        out_name, n_fields = "analysisGERP.txt", 3
    Analysis = {}  # keyed "<kernel>T<i>" -> tuple of optimal parameters
    # NOTE: "r+" requires the file to already exist and overwrites from the
    # start — kept as-is for backward compatibility with the original.
    with open(out_name, "r+") as f:
        for i in xrange(6):
            data = cv.gen_time_split_data(allData, i)  # split the time series data
            key = kernel + "T" + str(i)
            Analysis[key] = chooseOptimalModel(data, kernel)  # model selection
            fields = " ".join(str(Analysis[key][j]) for j in range(n_fields))
            f.write(key + ": " + fields + "\n")
def create_unit(self):
    """Build a generic DataImport.Unit from the dialog's current widget state.

    Returns:
        The newly constructed unit.
    """
    info = {'faction': str(self.faction_select.currentText())}
    faction = self.unit_data.factions[info['faction']]
    if faction:
        # Display fields are derived from the selected faction when present.
        info['name'] = faction.unit_name
        info['faction_icon'] = faction.faction_icon
        info['desc'] = faction.desc
    info['level'] = int(self.level.value())
    info['gender'] = int(self.gender.value())
    info['klass'] = str(self.class_box.currentText())
    info['items'] = self.getItems()
    info['ai'] = self.get_ai()
    info['ai_group'] = str(self.ai_group.text())
    info['team'] = str(self.team_box.currentText())
    info['generic'] = True
    info['mode'] = self.get_modes()
    return DataImport.Unit(info)
def create_unit(self):
    """Build a generic reinforcement DataImport.Unit from the dialog state.

    The event_id is auto-assigned as the next free id within the chosen
    reinforcement pack.

    Returns:
        The newly constructed unit.
    """
    info = {'faction': str(self.faction_select.currentText())}
    faction = self.unit_data.factions[info['faction']]
    if faction:
        # Display fields are derived from the selected faction when present.
        info['name'] = faction.unit_name
        info['faction_icon'] = faction.faction_icon
        info['desc'] = faction.desc
    info['level'] = int(self.level.value())
    info['gender'] = int(self.gender.value())
    info['klass'] = str(self.class_box.currentText())
    info['items'] = self.getItems()
    info['ai'] = self.get_ai()
    info['ai_group'] = str(self.ai_group.text())
    info['pack'] = str(self.pack.text())
    same_pack = [rein for rein in self.unit_data.reinforcements
                 if rein.pack == info['pack']]
    info['event_id'] = EditorUtilities.next_available_event_id(same_pack)
    info['team'] = str(self.team_box.currentText())
    info['generic'] = True
    info['mode'] = self.get_modes()
    return DataImport.Unit(info)
#!/usr/bin/env python # -*- coding: utf-8 -*- import DataCleaning import DataExploration import DataImport import Prediction # Carreguem les dades test_ds = DataImport.import_test_data() train_ds = DataImport.import_train_data() # Explorem la relació de les columnes amb la variable Survived # DataExploration.show_survive_relation_by_feature(train_ds, 'Sex') # DataExploration.show_survive_relation_by_feature(train_ds, 'Age') # DataExploration.show_scatter_plot_by_features(train_ds, 'Fare', 'Age') combined_ds = DataImport.combine_datasets(train_ds, test_ds) # Fem una exploració de les dades per a veure si tenim valors nulls DataExploration.explore_dataset(combined_ds) combined_ds = DataExploration.get_titles(combined_ds) # Netejem les dades dataCleaningObj = DataCleaning.DataCleaning(combined_ds) combined_ds = dataCleaningObj.clean() # Test Anderson DataExploration.anderson_darling_test(combined_ds['Age']) DataExploration.anderson_darling_test(combined_ds['Fare']) DataExploration.anderson_darling_test(combined_ds['Sex']) DataExploration.anderson_darling_test(combined_ds['Family'])
def test_headDataFrame(self):
    """headDataFrame() must return a pandas DataFrame after conversion.

    Bug fix: the original asserted
    assertIsInstance(type(result), type(pandas.core.frame.DataFrame)),
    i.e. "the type of the result is an instance of the metaclass" — true for
    any result whatsoever, so the test could never fail. Now the returned
    value itself is checked against the DataFrame class.
    """
    self.fileType = 'excel'
    importer = DI.DataImport(self.testFile, self.fileType)
    importer.convertToDataframe()
    self.assertIsInstance(importer.headDataFrame(), pandas.core.frame.DataFrame)
def test_correctFileImporter(self):
    """convertToDataframe() returns None (success) for a supported file type."""
    self.fileType = 'excel'
    worker = DI.DataImport(self.testFile, self.fileType)
    self.assertIsNone(worker.convertToDataframe())
def test_failingFileImporter(self):
    """An unrecognized file type makes convertToDataframe() raise."""
    worker = DI.DataImport(self.testFile, 'bla_filetype.plz')
    with self.assertRaises(Exception):
        worker.convertToDataframe()
def test_fullDataPath(self):
    """fullDataPath() yields the resolved path as a string."""
    worker = DI.DataImport(self.testFile)
    self.assertIsInstance(worker.fullDataPath(), str)
def test_checkFileExists(self):
    """checkIFFileExists() reports True for the fixture file."""
    worker = DI.DataImport(self.testFile)
    self.assertTrue(worker.checkIFFileExists())
def test_importFileName(self):
    """importFile() returns the imported file name as a string."""
    worker = DI.DataImport(self.testFile)
    self.assertIsInstance(worker.importFile(), str)
def _to_database(typeof, metadata):
    """Store a parsed US patent document under both its publication and
    application reference numbers, along with citations, assignees and
    classifications.

    Args:
        typeof: document collection/type label forwarded to DataImport.
        metadata: parser output dict; see the key reads below for the schema.
    """
    print('------------------------------------------------')
    pprint(metadata)
    # Read every required key up front so a malformed record fails before
    # any database writes happen.
    date_publ = metadata['date-publ']
    status = metadata['status']
    title = metadata['title']
    abstract = metadata['abstract']
    claims = metadata['claims']
    citations = metadata['citations']
    assignees = metadata['assignees']
    classifications = metadata['classifications']
    description = metadata['description']
    pub_ref = metadata['publication-reference']
    app_ref = metadata['application-reference']
    pub_num = pub_ref.get('doc-number', '')
    app_num = app_ref.get('doc-number', '')
    DataImport.create_us_document(
        typeof, 'publication', '', pub_num, pub_ref.get('kind', ''),
        date_publ, status, pub_ref.get('country', ''), title, abstract,
        '', '', '', claims, description)
    # Each relation is recorded twice: once per reference number.
    for citation in citations:
        for num in (pub_num, app_num):
            DataImport.add_citation(
                num, citation.get('doc-number', ''), citation.get('country', ''),
                citation.get('kind', ''), date_publ, '', '', '')
    # TODO: can we rely on assignees alone, or should we test and use
    # parties/applicants if needed?
    for assignee in assignees:
        for num in (pub_num, app_num):
            DataImport.add_assignee(num, assignee.get('orgname', ''),
                                    '', '', '', date_publ, '', '', '')
    for classification in classifications:
        for num in (pub_num, app_num):
            DataImport.add_classification(
                num,
                classification.get('section', ''),
                classification.get('class', ''),
                classification.get('subclass', ''),
                classification.get('main-group', ''),
                classification.get('subgroup', ''),
                date_publ, '', '', '')
    print('------------------------------------------------')
    print('\n')
def import_halo(self, filez):
    """Build a di.Halo from the given file list and store it as self.halo."""
    self.halo = di.Halo(filez=filez)
def build_train_set() -> list:
    """Assemble the Titanic training features and labels.

    Returns:
        A two-element list [all_X, all_y]: the feature frame (COLUMN_NAMES
        columns) and the 'Survived' label series.
    """
    cleaned = di.get_clean_train_data()
    return [cleaned[COLUMN_NAMES], cleaned['Survived']]
def create_assignee(assigned_document):
    """Attach a test assignee record to the given document and return it.

    The assignee name comes from get_assignee(); the EPO number and the two
    reference fields are placeholders, and every date uses get_date().
    """
    doc_number = assigned_document['doc_number']
    DataImport.add_assignee(doc_number, get_assignee(), '12345', '?', '?',
                            get_date(), get_date(), get_date(), get_date())
    return assigned_document
def create_unit_from_legend(self, legend, mode):
    """Instantiate a generic DataImport.Unit from a raw level-legend row.

    Args:
        legend: dict of raw string fields from the level file; keys read:
            'team', 'event_id', 'class', 'level', 'position', 'faction',
            'items', 'ai', 'extra_status'.
        mode: game-mode tag stored on the unit.

    Side effects: increments GC.U_ID and appends the new unit to either
    self.reinforcements (event_id != '0') or self.units.
    """
    GC.U_ID += 1
    u_i = {}
    u_i['id'] = GC.U_ID
    u_i['team'] = legend['team']
    # event_id encodes "<pack>_<index>"; a bare non-'0' value is a pack with
    # index 1, and '0' means the unit starts on the board (no pack).
    if '_' in legend['event_id']:
        u_i['pack'], u_i['event_id'] = legend['event_id'].split('_')
        u_i['event_id'] = int(u_i['event_id'])
    elif legend['event_id'] != '0':
        u_i['pack'] = legend['event_id']
        u_i['event_id'] = 1
    else:
        u_i['pack'] = None
        u_i['event_id'] = 0
    # A trailing 'F' on the class name marks a female unit.
    if legend['class'].endswith('F'):
        legend['class'] = legend['class'][:-1]  # strip off the F
        u_i['gender'] = 5  # Default female gender is 5
    else:
        u_i['gender'] = 0  # Default male gender is 0
    # Promotion chain is comma-separated; the last entry is the current class.
    classes = legend['class'].split(',')
    u_i['klass'] = classes[-1]
    # Give default previous class
    # default_previous_classes(u_i['klass'], classes, class_dict)
    if legend['level'].startswith('f'):
        legend['level'] = legend['level'][1:]  # Remove f at the beginning
    u_i['level'] = int(legend['level'])  # Doesn't need force_fixed since it is fixed by default in LevelEditor
    # Position is "x,y"; anything without a comma means "not placed".
    u_i['position'] = tuple([int(num) for num in legend['position'].split(',')]) if ',' in legend['position'] else None
    u_i['faction'] = legend['faction']
    faction = self.factions[u_i['faction']]
    u_i['name'] = faction.unit_name
    u_i['faction_icon'] = faction.faction_icon
    u_i['desc'] = faction.desc
    stats, u_i['growths'], u_i['growth_points'], u_i['items'], u_i['wexp'] = \
        self.get_unit_info(Data.class_dict, u_i['klass'], u_i['level'], legend['items'])
    u_i['stats'] = self.build_stat_dict(stats)
    u_i['tags'] = Data.class_dict[u_i['klass']]['tags']
    # The ai field may carry an optional "_<group>" suffix.
    if '_' in legend['ai']:
        u_i['ai'], u_i['ai_group'] = legend['ai'].split('_')
    else:
        u_i['ai'], u_i['ai_group'] = legend['ai'], None
    u_i['movement_group'] = Data.class_dict[u_i['klass']]['movement_group']
    u_i['skills'] = []
    u_i['generic'] = True
    u_i['mode'] = mode
    cur_unit = DataImport.Unit(u_i)
    # Reposition units
    cur_unit.position = u_i['position']
    if legend['event_id'] != '0':
        # Unit does not start on board
        self.reinforcements.append(cur_unit)
    else:
        # Unit does start on board
        self.units.append(cur_unit)
    # Status Effects and Skills
    # get_skills(class_dict, cur_unit, classes, u_i['level'], gameStateObj, feat=False)
    # Extra Skills
    cur_unit.extra_statuses = legend['extra_status']
def import_tree(self, fname, filetype='default', keyword='', maxsnap=0, filez=None):
    """Load a merger tree via di.MergerTree and store it as self.tree.

    Args:
        fname: path of the tree file to read.
        filetype: reader variant understood by di.MergerTree.
        keyword: optional selection keyword forwarded to the reader.
        maxsnap: snapshot limit forwarded to the reader (0 presumably means
            the reader's default — TODO confirm against di.MergerTree).
        filez: optional explicit file list forwarded to the reader
            (presumably overrides fname-based discovery; verify).
    """
    self.tree = di.MergerTree(fname, filetype=filetype, keyword=keyword, maxsnap=maxsnap, filez=filez)
def load_train_data():
    """Thin wrapper: delegate to di.load_train_data() and return its result."""
    return di.load_train_data()
# NOTE(review): this chunk references core_gbk_folder, mongo_init,
# reset_database, DataImport and KvasirBlast, which are defined/imported
# earlier in the script (outside this view). Python 2 print statements.
other_gbk_folder = os.path.abspath('data/input/other/')
# Each run gets a timestamped experiment name used as the database name.
exp_name = datetime.datetime.now().strftime('%Y%m%d%H%M')
mongo_init(exp_name)
reset_database(exp_name)
# All subsequent output is written under data/output/<exp_name>/.
new_folder = 'data/output/{0}/'.format(exp_name)
if not os.path.isdir(new_folder):
    os.makedirs(new_folder)
os.chdir(new_folder)
print 'Checking and importing core genomes...'
for the_file in os.listdir(core_gbk_folder):
    path_to_file = '{0}/{1}'.format(core_gbk_folder, the_file)
    DataImport.import_file(path_to_file, 'core')
print 'Checking and importing other genomes...'
for the_file in os.listdir(other_gbk_folder):
    path_to_file = '{0}/{1}'.format(other_gbk_folder, the_file)
    DataImport.import_file(path_to_file, 'other')
# Build BLAST databases for both genome sets, then search at 90% identity
# and load the hits into the database (other identity cutoffs left disabled).
KvasirBlast.make_blast_db('core')
KvasirBlast.make_blast_db('other')
KvasirBlast.core_hgt_blast(perc_identity='90')
# KvasirBlast.core_hgt_blast(perc_identity='95')
# KvasirBlast.core_hgt_blast(perc_identity='99')
KvasirBlast.blast_to_db(perc_identity='90')
# KvasirBlast.blast_to_db(perc_identity='95')