def get_versioned_tables(first, last):
    """Build a versioned schema covering Bugzilla versions first..last.

    Validates both version names against schema_remarks.version_order,
    gathers each distinct schema in the range, pairs every field with the
    versions it applies to, and returns the tuple
    (schema, tr, colours, bugzilla_versions, errors).
    """
    global errors
    errors = []
    # NOTE(review): `error` is assumed to be an exception class defined
    # elsewhere in this module; `raise error(msg)` is the py2/py3-portable
    # spelling of the legacy `raise error, msg` form.
    if first not in schema_remarks.version_order:
        # Bug fix: this message previously interpolated `last` even though
        # it is `first` that failed the lookup.
        raise error("I don't know about version '%s'." % first)
    if last not in schema_remarks.version_order:
        raise error("I don't know about version '%s'." % last)
    if (schema_remarks.version_order.index(last) <
            schema_remarks.version_order.index(first)):
        raise error("Version '%s' comes before version '%s'." % (last, first))
    colours = {}
    tr = {}
    schema_name = schema_remarks.version_schema_map[first]
    schema, errors = get_schema.get_schema(schema_name, errors)
    # turn fields into lists connecting Bugzilla version to value
    pair_up_schema(first, schema)
    schemas = [(first, schema)]
    bugzilla_versions = schema_remarks.version_order[
        schema_remarks.version_order.index(first):
        schema_remarks.version_order.index(last) + 1]
    for bz_name in bugzilla_versions[1:]:
        new_schema_name = schema_remarks.version_schema_map[bz_name]
        if new_schema_name == schema_name:
            # Same underlying schema as the previous version; nothing new.
            continue
        schema_name = new_schema_name
        new_schema, errors = get_schema.get_schema(schema_name, errors)
        pair_up_schema(bz_name, new_schema)
        schemas.append((bz_name, new_schema))
    schema = make_versioned_schema(schemas, colours, tr)
    stringify_schema(schema)
    return (schema, tr, colours, tuple(bugzilla_versions), errors)
def main(load_directory, loader_id, id_class):
    """Extract TCode disease data and write load files plus SQL tables."""
    print(datetime.datetime.now().strftime("%H:%M:%S"))

    # Read the raw TCode reference table into a dataframe.
    source_path = '../data/tblOS_GLOBAL_GLOBAL_Ref_TCodes.csv'
    raw_df = extract_file(source_path)

    # Main disease load file.
    diseases = parse_tcode_main(raw_df, load_directory, loader_id, id_class)
    write_load_files.main(diseases, load_directory + 'tcode_diseases.csv')

    # Parent-relation load file.
    parents = combine_parents_and_children(parse_tcode_parents(raw_df),
                                           'parent')
    write_load_files.main(parents, load_directory + 'tcode_parents.csv')

    # Child-relation load file.
    children = combine_parents_and_children(parse_tcode_children(raw_df),
                                            'child')
    write_load_files.main(children, load_directory + 'tcode_children.csv')

    # Fetch all three schemas first, then emit the SQL definitions,
    # preserving the original get-then-write ordering.
    tables = ('tcode_diseases', 'tcode_parents', 'tcode_children')
    schemas = [(name, get_schema.get_schema(name)) for name in tables]
    for name, schema in schemas:
        write_sql.write_sql(schema, name)
def get_versioned_tables(first, last):
    """Build a versioned schema covering Bugzilla versions first..last.

    Validates both version names against schema_remarks.version_order,
    gathers each distinct schema in the range, pairs every field with the
    versions it applies to, and returns the tuple
    (schema, tr, colours, bugzilla_versions, errors).
    """
    global errors
    errors = []
    # NOTE(review): `error` is assumed to be an exception class defined
    # elsewhere in this module; `raise error(msg)` is the py2/py3-portable
    # spelling of the legacy `raise error, msg` form.
    if first not in schema_remarks.version_order:
        # Bug fix: this message previously interpolated `last` even though
        # it is `first` that failed the lookup.
        raise error("I don't know about version '%s'." % first)
    if last not in schema_remarks.version_order:
        raise error("I don't know about version '%s'." % last)
    if (schema_remarks.version_order.index(last) <
            schema_remarks.version_order.index(first)):
        raise error("Version '%s' comes before version '%s'." % (last, first))
    colours = {}
    tr = {}
    schema_name = schema_remarks.version_schema_map[first]
    schema, errors = get_schema.get_schema(schema_name, errors)
    # turn fields into lists connecting Bugzilla version to value
    pair_up_schema(first, schema)
    schemas = [(first, schema)]
    bugzilla_versions = schema_remarks.version_order[
        schema_remarks.version_order.index(first):
        schema_remarks.version_order.index(last) + 1]
    for bz_name in bugzilla_versions[1:]:
        new_schema_name = schema_remarks.version_schema_map[bz_name]
        if new_schema_name == schema_name:
            # Same underlying schema as the previous version; nothing new.
            continue
        schema_name = new_schema_name
        new_schema, errors = get_schema.get_schema(schema_name, errors)
        pair_up_schema(bz_name, new_schema)
        schemas.append((bz_name, new_schema))
    schema = make_versioned_schema(schemas, colours, tr)
    stringify_schema(schema)
    return (schema, tr, colours, tuple(bugzilla_versions), errors)
def assign_editable_lists(df, loader_id, load_dir, id_class, element):
    """Write EditableStringList rows for one list-valued column of df.

    Groups the values of `element` (e.g. 'synonyms') by graph_id, writes
    one EditableStringList entry plus its pipe-joined elements per group,
    and returns a dict mapping each graph_id to its assigned esl id.
    """
    # Column layouts for the two output tables.
    db_dict_ed_str_list = get_schema.get_schema('EditableStringList')
    db_dict_ed_str_el = get_schema.get_schema('EditableStringListElements')

    # Writer objects for EditableStringList and its elements table.
    load_files_ed_str_list_dict = create_load_files_dict(
        db_dict_ed_str_list, load_dir)
    editable_string_writer = (
        load_files_ed_str_list_dict['EditableStringList']['writer'])
    load_ed_el_syn_dict = create_load_files_dict(db_dict_ed_str_el, load_dir)
    element_writer = (
        load_ed_el_syn_dict['EditableStringListElements']['writer'])

    # Collect the strings for each graph id.  The original code duplicated
    # the whole append logic in the "seen" and "unseen" branches; using
    # setdefault collapses that into one path with identical behavior.
    strings_dict = {}
    for index, row in df.iterrows():
        strings = strings_dict.setdefault(row['graph_id'], [])
        value = row[element]
        if type(value) is list:
            strings.extend(value)
        else:
            strings.append(value)

    esl_dict = {}
    for graph_key, entries in strings_dict.items():
        # EditableStringList id, derived from the generic 'es_' id.
        esl = id_class.assign_id().replace('es_', 'esl_')
        esl_dict[graph_key] = esl
        # NOTE(review): '_ ' (underscore + space) is preserved from the
        # original; it looks like a typo for '_' but changing it would
        # alter every written id -- confirm before fixing.
        graph_id = element + '_ ' + graph_key

        # Write the editable string list csv entry...
        write_editable_string_list(graph_id, editable_string_writer,
                                   loader_id, esl)
        # ...and its pipe-joined element values.
        write_editable_list_elements(element_writer, '|'.join(entries), esl)
    return esl_dict
def assign_editable_boolean(df, editable_statement_list, loader_id, load_dir,
                            table_name, id_class):
    """Replace boolean columns of df with EditableBoolean statement ids.

    For every column listed in editable_statement_list, writes an
    EditableBoolean row per cell (falsy or missing values become "False")
    and substitutes the assigned id back into the data.  Returns a new
    dataframe with the substitutions applied.
    """
    # Column layout for the output table.
    db_dict = get_schema.get_schema('EditableBoolean')

    # Writer object for the EditableBoolean load file.
    load_files_dict = create_load_files_dict(db_dict, load_dir)
    editable_statement_writer = load_files_dict['EditableBoolean']['writer']

    column_list = list(df)
    data = df.T.to_dict().values()
    # `row` renamed from `input`, which shadowed the builtin.
    for row in data:
        for entry in column_list:
            # Only columns flagged as editable statements are rewritten.
            if entry not in editable_statement_list:
                continue
            graph_id = row['graph_id']
            # Field name in the editable statement table.
            field = table_name + '_' + entry.capitalize() + '_' + graph_id
            value = row[entry]
            # Bug fix: the old code tested pandas.isnull(str(value)),
            # which is always False because str() never yields NaN, so
            # NaN cells leaked through as the string 'nan'.  Test the
            # raw value instead; falsy and missing both mean "False".
            if not value or pandas.isnull(value):
                statement = "False"
            else:
                statement = str(value)
            # Assign a unique id to the statement, write it, and put the
            # id back into the original row.
            es_des_id = write_editable_statement(
                field, editable_statement_writer, loader_id, statement,
                id_class)
            row[entry] = es_des_id
    # New dataframe with editable statement ids substituted in.
    return pandas.DataFrame(data)
# Beispiel #6
def main(load_directory, loader_id, id_class):
    """Extract GO disease data into load files and SQL tables."""
    print(datetime.datetime.now().strftime("%H:%M:%S"))

    # Paginated GO disease dumps (pages 1..6).
    files = ['../data/GO_diseases/diseases_pg%d.json' % page
             for page in range(1, 7)]

    # Combine all pages into one dataframe, blanking out missing values.
    raw = combine_files(files).fillna("")

    # Main disease table plus parent/child relations.
    diseases = parse_go_main(raw, loader_id, load_directory, id_class)
    parents = combine_parents_and_children(parse_go_parents(raw), 'parent')
    children = combine_parents_and_children(parse_go_children(raw), 'child')

    # Replace xrefs, then synonyms, with editable-list ids.
    xrefs = parse_go_refs(diseases)
    xref_ids = create_EditableXrefsList.assign_editable_xrefs_lists(
        xrefs, loader_id, load_directory, id_class)
    with_xrefs = add_dict(xrefs, xref_ids, 'xrefs')
    syn_ids = create_EditableStringList.assign_editable_lists(
        with_xrefs, loader_id, load_directory, id_class, 'synonyms')
    with_syns = add_dict(with_xrefs, syn_ids, 'synonyms')

    # Load files.
    with_syns.to_csv(load_directory + 'go_diseases.csv', index=False)
    parents.to_csv(load_directory + 'go_parents.csv', index=False)
    children.to_csv(load_directory + 'go_children.csv', index=False)

    # Fetch all three schemas first, then emit the SQL definitions,
    # preserving the original get-then-write ordering.
    tables = ('go_diseases', 'go_parents', 'go_children')
    schemas = [(name, get_schema.get_schema(name)) for name in tables]
    for name, schema in schemas:
        write_sql.write_sql(schema, name)

    print('go diseases are extracted')
# Beispiel #7
def main(load_directory, loader_id, id_class):
    """Extract OncoTree disease data into load files and SQL tables."""
    print(datetime.datetime.now().strftime("%H:%M:%S"))

    # NOTE(review): 'onctoree' looks like a typo for 'oncotree', but the
    # path must match the file on disk -- confirm before renaming.
    source = '../data/onctoree.json'

    # Convert the json dump to a dataframe.
    raw = pandas.read_json(source)

    # Main disease table, with xrefs replaced by editable-list ids and the
    # output trimmed to the load-file columns.
    diseases = parse_oncotree_main(raw, load_directory, loader_id, id_class)
    xref_ids = create_EditableXrefsList.assign_editable_xrefs_lists(
        diseases, loader_id, load_directory, id_class)
    with_xrefs = add_column_to_dataframe(diseases, xref_ids, 'xrefs')
    trimmed = with_xrefs[[
        'code', 'name', 'mainType', 'tissue', 'xrefs', 'graph_id'
    ]]
    write_load_files.main(trimmed, load_directory + 'oncotree_diseases.csv')

    # Parent relations, then children derived by reversing them.
    parents = combine_parents_and_children(parse_oncotree_parents(raw),
                                           'parent')
    write_load_files.main(parents, load_directory + 'oncotree_parents.csv')

    children = reverse_parent_child(parents)
    write_load_files.main(children, load_directory + 'oncotree_children.csv')

    # Fetch all three schemas first, then emit the SQL definitions,
    # preserving the original get-then-write ordering.
    tables = ('oncotree_diseases', 'oncotree_parents', 'oncotree_children')
    schemas = [(name, get_schema.get_schema(name)) for name in tables]
    for name, schema in schemas:
        write_sql.write_sql(schema, name)

    print(datetime.datetime.now().strftime("%H:%M:%S"))
def main(load_directory, loader_id, id_class):
    """Extract Omni disease data into a load file and SQL table."""
    print(datetime.datetime.now().strftime("%H:%M:%S"))

    source = '../data/tblOS_GLOBAL_GLOBAL_Ref_OmniDiseases.csv'

    # Read the raw Omni reference table.
    raw = extract_file(source)

    # Append a hard-coded placeholder row (presumably a catch-all "blank"
    # disease -- confirm with downstream consumers).
    blank_row = pandas.DataFrame(
        [['OmniDx_blank', 'None', 'None', 'Other']],
        columns=['OmniDiseaseID', 'OmniDisease',
                 'OmniDiseaseName', 'OmniDiseaseType'])
    combined = pandas.concat([raw, blank_row], ignore_index=True)

    # Parse and write the disease load file.
    diseases = parse_omnidisease(combined, load_directory, loader_id,
                                 id_class)
    write_load_files.main(diseases, load_directory + 'omni_diseases.csv')

    # Emit the SQL table definition.
    write_sql.write_sql(get_schema.get_schema('omni_diseases'),
                        'omni_diseases')
def main(load_directory, loader_id, id_class):
    """Extract DiseaseOntology data into load files and SQL tables.

    Parses the DO dump, replaces free-text columns with editable
    statements and editable lists, writes the disease/parent/child CSV
    load files, and emits the matching SQL table definitions.

    Fixes: removed the unused my_db/my_cursor locals and a dead
    commented-out get_schema_original() call.
    """
    print(datetime.datetime.now().strftime("%H:%M:%S"))

    print('WARNING: skipping DiseaseOntology download during development')
    # download_do()  # re-enable once development is done

    doid_dict, doid_child_dict = parse_do()
    # creates a dataframe from doid_dict
    do_df = create_dataframe(doid_dict)

    # replaces values with editable statements
    df_editable = create_editable_statement.assign_editable_statement(
        do_df, editable_statement_list, loader_id, load_directory,
        do_table_name, id_class)

    # Each assign_editable_lists call builds an id mapping for one
    # synonym-style column; add_dict then substitutes those ids into the
    # dataframe.
    exact_dict = create_EditableStringList.assign_editable_lists(
        df_editable, loader_id, load_directory, id_class, 'exact_synonyms')
    df_exact = add_dict(df_editable, exact_dict, 'exact_synonyms')

    # NOTE(review): this call reads df_editable rather than df_exact;
    # harmless if the 'related_synonyms' column is untouched by the
    # exact-synonym substitution, but confirm it is intentional.
    related_dict = create_EditableStringList.assign_editable_lists(
        df_editable, loader_id, load_directory, id_class, 'related_synonyms')
    df_related = add_dict(df_exact, related_dict, 'related_synonyms')

    narrow_dict = create_EditableStringList.assign_editable_lists(
        df_related, loader_id, load_directory, id_class, 'narrow_synonyms')
    df_narrow = add_dict(df_related, narrow_dict, 'narrow_synonyms')

    subset_dict = create_EditableStringList.assign_editable_lists(
        df_narrow, loader_id, load_directory, id_class, 'subset')
    df_subset = add_dict(df_narrow, subset_dict, 'subset')

    xref_dict = create_EditableXrefsList.assign_editable_xrefs_lists(
        df_subset, loader_id, load_directory, id_class)
    df_xref = add_dict(df_subset, xref_dict, 'xrefs')

    # The raw reference column is not part of the disease load file.
    del df_xref['reference']
    df_xref.to_csv(load_directory + 'do_diseases.csv', index=False)

    # creates dataframe of children
    children_df = add_children(df_xref, doid_child_dict)
    children_df.to_csv(load_directory + 'do_children.csv', index=False)

    # creates dataframe of parents
    parents_df = add_parents(df_xref, doid_child_dict)
    parents_df.to_csv(load_directory + 'do_parents.csv', index=False)

    create_references.main(do_df)

    # Emit the SQL table definitions (same get-then-write order per table
    # as the original).
    for table in ('do_diseases', 'do_parents', 'do_children'):
        write_sql.write_sql(get_schema.get_schema(table), table)

    print('do diseases are extracted')