def get_versioned_tables(first, last):
    """Build the versioned schema covering Bugzilla versions first..last.

    Returns a tuple (schema, tr, colours, bugzilla_versions, errors).
    Raises `error` when either version is unknown or `last` precedes
    `first` in schema_remarks.version_order.
    """
    global errors
    errors = []
    if first not in schema_remarks.version_order:
        # Bug fix: the original message interpolated `last` even though it
        # is `first` that was not found.  Also converted the Python-2
        # `raise E, "msg"` statements to call syntax, which is required
        # under Python 3 and equivalent under Python 2.
        raise error("I don't know about version '%s'." % first)
    if last not in schema_remarks.version_order:
        raise error("I don't know about version '%s'." % last)
    if (schema_remarks.version_order.index(last) <
            schema_remarks.version_order.index(first)):
        raise error("Version '%s' comes before version '%s'." % (last, first))
    colours = {}
    tr = {}
    schema_name = schema_remarks.version_schema_map[first]
    schema, errors = get_schema.get_schema(schema_name, errors)
    # turn fields into lists connecting Bugzilla version to value
    pair_up_schema(first, schema)
    schemas = [(first, schema)]
    bugzilla_versions = schema_remarks.version_order[
        schema_remarks.version_order.index(first):
        schema_remarks.version_order.index(last) + 1]
    for bz_name in bugzilla_versions[1:]:
        new_schema_name = schema_remarks.version_schema_map[bz_name]
        if new_schema_name == schema_name:
            # This Bugzilla version shares the previous schema; skip it.
            continue
        schema_name = new_schema_name
        new_schema, errors = get_schema.get_schema(schema_name, errors)
        pair_up_schema(bz_name, new_schema)
        schemas.append((bz_name, new_schema))
    schema = make_versioned_schema(schemas, colours, tr)
    stringify_schema(schema)
    return (schema, tr, colours, tuple(bugzilla_versions), errors)
def main(load_directory, loader_id, id_class):
    """Extract the TCode reference CSV into load files and SQL tables."""
    print(datetime.datetime.now().strftime("%H:%M:%S"))
    source_path = '../data/tblOS_GLOBAL_GLOBAL_Ref_TCodes.csv'
    # Load the raw reference table into a dataframe.
    raw_df = extract_file(source_path)

    # Main disease records.
    diseases_df = parse_tcode_main(raw_df, load_directory, loader_id, id_class)
    write_load_files.main(diseases_df, load_directory + 'tcode_diseases.csv')

    # Parent relationships.
    parents_df = combine_parents_and_children(parse_tcode_parents(raw_df),
                                              'parent')
    write_load_files.main(parents_df, load_directory + 'tcode_parents.csv')

    # Child relationships.
    children_df = combine_parents_and_children(parse_tcode_children(raw_df),
                                               'child')
    write_load_files.main(children_df, load_directory + 'tcode_children.csv')

    # Emit the SQL table definition for each generated CSV.
    for table in ('tcode_diseases', 'tcode_parents', 'tcode_children'):
        write_sql.write_sql(get_schema.get_schema(table), table)
def get_versioned_tables(first, last):
    """Build the versioned schema covering Bugzilla versions first..last.

    Returns a tuple (schema, tr, colours, bugzilla_versions, errors).
    Raises `error` when either version is unknown or `last` precedes
    `first` in schema_remarks.version_order.
    """
    global errors
    errors = []
    if first not in schema_remarks.version_order:
        # Bug fix: the original message interpolated `last` even though it
        # is `first` that was not found.  Also converted the Python-2
        # `raise E, "msg"` statements to call syntax, which is required
        # under Python 3 and equivalent under Python 2.
        raise error("I don't know about version '%s'." % first)
    if last not in schema_remarks.version_order:
        raise error("I don't know about version '%s'." % last)
    if (schema_remarks.version_order.index(last) <
            schema_remarks.version_order.index(first)):
        raise error("Version '%s' comes before version '%s'." % (last, first))
    colours = {}
    tr = {}
    schema_name = schema_remarks.version_schema_map[first]
    schema, errors = get_schema.get_schema(schema_name, errors)
    # turn fields into lists connecting Bugzilla version to value
    pair_up_schema(first, schema)
    schemas = [(first, schema)]
    bugzilla_versions = schema_remarks.version_order[
        schema_remarks.version_order.index(first):
        schema_remarks.version_order.index(last) + 1]
    for bz_name in bugzilla_versions[1:]:
        new_schema_name = schema_remarks.version_schema_map[bz_name]
        if new_schema_name == schema_name:
            # This Bugzilla version shares the previous schema; skip it.
            continue
        schema_name = new_schema_name
        new_schema, errors = get_schema.get_schema(schema_name, errors)
        pair_up_schema(bz_name, new_schema)
        schemas.append((bz_name, new_schema))
    schema = make_versioned_schema(schemas, colours, tr)
    stringify_schema(schema)
    return (schema, tr, colours, tuple(bugzilla_versions), errors)
def assign_editable_lists(df, loader_id, load_dir, id_class, element):
    """Write EditableStringList load files for the values of `df[element]`.

    Groups the `element` column values by graph_id, writes one
    EditableStringList row plus one pipe-joined elements row per graph id,
    and returns a dict mapping graph_id -> assigned editable-list id.
    """
    # Gets the info on what columns should be created
    db_dict_ed_str_list = get_schema.get_schema('EditableStringList')
    db_dict_ed_str_el = get_schema.get_schema('EditableStringListElements')

    # Creates writer objects for EditableStringList and its elements table
    load_files_ed_str_list_dict = create_load_files_dict(
        db_dict_ed_str_list, load_dir)
    editable_string_writer = (
        load_files_ed_str_list_dict['EditableStringList']['writer'])
    load_ed_el_syn_dict = create_load_files_dict(db_dict_ed_str_el, load_dir)
    element_writer = (
        load_ed_el_syn_dict['EditableStringListElements']['writer'])

    # Collect the element values for each graph id.  The original code
    # duplicated the accumulation logic across identical "seen"/"unseen"
    # branches; setdefault() collapses them into one path.
    strings_dict = {}
    for _, row in df.iterrows():
        strings = strings_dict.setdefault(row['graph_id'], [])
        value = row[element]
        if isinstance(value, list):
            strings.extend(value)
        else:
            strings.append(value)

    esl_dict = {}
    for entry, entries in strings_dict.items():
        # EditableStringList graph id, derived from a freshly assigned id.
        esl = id_class.assign_id().replace('es_', 'esl_')
        esl_dict[entry] = esl
        # NOTE(review): '_ ' (underscore + space) is preserved from the
        # original; it looks like a typo for '_' — confirm before changing.
        graph_id = element + '_ ' + entry
        # Write editable string list csv file entry
        write_editable_string_list(graph_id, editable_string_writer,
                                   loader_id, esl)
        pipe_strings = '|'.join(entries)
        # Write elements csv file entry
        write_editable_list_elements(element_writer, pipe_strings, esl)
    return esl_dict
def assign_editable_boolean(df, editable_statement_list, loader_id, load_dir, table_name, id_class):
    """Replace boolean columns of `df` with editable-statement ids.

    For each column named in `editable_statement_list`, writes an
    EditableBoolean row (statement text "True"/"False"/str(value)) and
    substitutes the generated id back into the data.  Returns the
    rewritten dataframe.
    """
    # Gets the info on what columns should be created
    db_dict = get_schema.get_schema('EditableBoolean')
    # Creates writer object
    load_files_dict = create_load_files_dict(db_dict, load_dir)
    editable_statement_writer = load_files_dict['EditableBoolean']['writer']

    column_list = list(df)
    data = df.T.to_dict().values()
    # Go over data (renamed `input` -> `record`; `input` shadowed a builtin)
    for record in data:
        for entry in column_list:
            # Check if the column value contains editable statement
            if entry not in editable_statement_list:
                continue
            graph_id = record['graph_id']
            # Create field in editable statement table
            field = table_name + '_' + entry.capitalize() + '_' + graph_id
            value = record[entry]
            # Bug fix: the original called pandas.isnull() on str(value),
            # which is never null, so NaN cells became the string "nan".
            # Test the raw cell value instead.
            if pandas.isnull(value) or not value:
                statement = "False"
            else:
                statement = str(value)
            # Assign unique value to editable statement and write it
            es_des_id = write_editable_statement(
                field, editable_statement_writer, loader_id, statement,
                id_class)
            # Put this value in the original table
            record[entry] = es_des_id
    # Create new dataframe with editable statement ids
    return pandas.DataFrame(data)
def main(load_directory, loader_id, id_class):
    """Parse the paged GO disease JSON files into load CSVs and SQL tables."""
    print(datetime.datetime.now().strftime("%H:%M:%S"))
    files = [
        '../data/GO_diseases/diseases_pg1.json',
        '../data/GO_diseases/diseases_pg2.json',
        '../data/GO_diseases/diseases_pg3.json',
        '../data/GO_diseases/diseases_pg4.json',
        '../data/GO_diseases/diseases_pg5.json',
        '../data/GO_diseases/diseases_pg6.json',
    ]
    # Combine every page into one dataframe, blanking missing values.
    combined = combine_files(files)
    df = combined.fillna("")

    # Main disease table plus the parent/child relationship tables.
    go_disease_df = parse_go_main(df, loader_id, load_directory, id_class)
    go_parents_df = combine_parents_and_children(parse_go_parents(df),
                                                 'parent')
    go_children_df = combine_parents_and_children(parse_go_children(df),
                                                  'child')

    # Replace xrefs, then synonyms, with editable-list ids.
    go_xrefs = parse_go_refs(go_disease_df)
    xrefs_editable_dict = create_EditableXrefsList.assign_editable_xrefs_lists(
        go_xrefs, loader_id, load_directory, id_class)
    df_xrefs_editable = add_dict(go_xrefs, xrefs_editable_dict, 'xrefs')
    syn_editable_dict = create_EditableStringList.assign_editable_lists(
        df_xrefs_editable, loader_id, load_directory, id_class, 'synonyms')
    df_syn_editable = add_dict(df_xrefs_editable, syn_editable_dict,
                               'synonyms')

    # Write the load CSVs.
    df_syn_editable.to_csv(load_directory + 'go_diseases.csv', index=False)
    go_parents_df.to_csv(load_directory + 'go_parents.csv', index=False)
    go_children_df.to_csv(load_directory + 'go_children.csv', index=False)

    # Write sql tables
    for table in ('go_diseases', 'go_parents', 'go_children'):
        write_sql.write_sql(get_schema.get_schema(table), table)
    print('go diseases are extracted')
def main(load_directory, loader_id, id_class):
    """Parse the OncoTree JSON into load CSVs and emit SQL tables."""
    print(datetime.datetime.now().strftime("%H:%M:%S"))
    # NOTE(review): 'onctoree' looks like a typo for 'oncotree', but the
    # path is preserved as-is in case the data file really is named this way.
    source_file = '../data/onctoree.json'
    # Convert json to dataframe
    df = pandas.read_json(source_file)

    # Main disease table, with xrefs replaced by editable-list ids.
    oncotree_df = parse_oncotree_main(df, load_directory, loader_id, id_class)
    xrefs_editable = create_EditableXrefsList.assign_editable_xrefs_lists(
        oncotree_df, loader_id, load_directory, id_class)
    with_xrefs = add_column_to_dataframe(oncotree_df, xrefs_editable, 'xrefs')
    oncotree_df = with_xrefs[
        ['code', 'name', 'mainType', 'tissue', 'xrefs', 'graph_id']]
    write_load_files.main(oncotree_df,
                          load_directory + 'oncotree_diseases.csv')

    # Parent relationships, then children derived by reversing them.
    parents_df = combine_parents_and_children(parse_oncotree_parents(df),
                                              'parent')
    write_load_files.main(parents_df, load_directory + 'oncotree_parents.csv')
    children_df = reverse_parent_child(parents_df)
    write_load_files.main(children_df,
                          load_directory + 'oncotree_children.csv')

    # Write sql tables
    for table in ('oncotree_diseases', 'oncotree_parents',
                  'oncotree_children'):
        write_sql.write_sql(get_schema.get_schema(table), table)
    print(datetime.datetime.now().strftime("%H:%M:%S"))
def main(load_directory, loader_id, id_class):
    """Parse the OmniDiseases reference CSV into a load file and SQL table."""
    print(datetime.datetime.now().strftime("%H:%M:%S"))
    source_path = '../data/tblOS_GLOBAL_GLOBAL_Ref_OmniDiseases.csv'
    # Create dataframe
    df = extract_file(source_path)
    # Append the placeholder "blank" disease row.
    placeholder = pandas.DataFrame(
        [['OmniDx_blank', 'None', 'None', 'Other']],
        columns=['OmniDiseaseID', 'OmniDisease', 'OmniDiseaseName',
                 'OmniDiseaseType'])
    df = pandas.concat([df, placeholder], ignore_index=True)

    omni_disease_df = parse_omnidisease(df, load_directory, loader_id,
                                        id_class)
    write_load_files.main(omni_disease_df,
                          load_directory + 'omni_diseases.csv')
    # Write the SQL table definition.
    write_sql.write_sql(get_schema.get_schema('omni_diseases'),
                        'omni_diseases')
def main(load_directory, loader_id, id_class):
    """Build the Disease Ontology load CSVs and SQL table definitions.

    Parses the (pre-downloaded) DO data, replaces statement/synonym/
    subset/xref columns with editable ids, writes the disease, parent and
    children CSVs, and emits the matching SQL tables.
    """
    print(datetime.datetime.now().strftime("%H:%M:%S"))
    # Removed unused locals my_db/my_cursor and a stale commented-out
    # get_schema_original() line from the original.
    print('WARNING: skipping DiseaseOntology download during development')
    #download_do()
    doid_dict, doid_child_dict = parse_do()
    # creates a dataframe from doid_dict
    do_df = create_dataframe(doid_dict)
    # replaces values with editable statements
    df_editable = create_editable_statement.assign_editable_statement(
        do_df, editable_statement_list, loader_id, load_directory,
        do_table_name, id_class)
    # creates a dictionary with ids for exact synonyms, then substitutes
    exact_dict = create_EditableStringList.assign_editable_lists(
        df_editable, loader_id, load_directory, id_class, 'exact_synonyms')
    df_exact = add_dict(df_editable, exact_dict, 'exact_synonyms')
    # related synonyms
    # NOTE(review): this passes df_editable rather than df_exact; the
    # original did the same and it is preserved — confirm whether the
    # id-assignment step actually depends on the frame passed in.
    related_dict = create_EditableStringList.assign_editable_lists(
        df_editable, loader_id, load_directory, id_class, 'related_synonyms')
    df_related = add_dict(df_exact, related_dict, 'related_synonyms')
    # narrow synonyms
    narrow_dict = create_EditableStringList.assign_editable_lists(
        df_related, loader_id, load_directory, id_class, 'narrow_synonyms')
    df_narrow = add_dict(df_related, narrow_dict, 'narrow_synonyms')
    # subsets
    subset_dict = create_EditableStringList.assign_editable_lists(
        df_narrow, loader_id, load_directory, id_class, 'subset')
    df_subset = add_dict(df_narrow, subset_dict, 'subset')
    # xrefs
    xref_dict = create_EditableXrefsList.assign_editable_xrefs_lists(
        df_subset, loader_id, load_directory, id_class)
    df_xref = add_dict(df_subset, xref_dict, 'xrefs')
    del df_xref['reference']
    df_xref.to_csv(load_directory + 'do_diseases.csv', index=False)
    # creates dataframe of children
    children_df = add_children(df_xref, doid_child_dict)
    children_df.to_csv(load_directory + 'do_children.csv', index=False)
    # creates dataframe of parents
    parents_df = add_parents(df_xref, doid_child_dict)
    parents_df.to_csv(load_directory + 'do_parents.csv', index=False)
    create_references.main(do_df)
    # Emit SQL table definitions.
    for table in ('do_diseases', 'do_parents', 'do_children'):
        write_sql.write_sql(get_schema.get_schema(table), table)
    print('do diseases are extracted')