Exemple #1
0
def build_compendia(concordances, identifiers):
    """Glom identifiers and concordances together and write one compendium per type.

    :concordances: a list of files from which to read relationships; each line is
                   tab-separated and its first and third columns are taken as a pair
    :identifiers: a list of files from which to read identifiers and optional categories

    Output files are named after the part of each type following its last ':'.
    """
    unique = [UBERON, GO]
    merged = {}
    categories = {}
    # Start from the identifier files, keeping the type information they carry.
    for id_path in identifiers:
        print(id_path)
        found_ids, found_types = read_identifier_file(id_path)
        glom(merged, found_ids, unique_prefixes=unique)
        categories.update(found_types)
    # Then add the pairs from each concordance file, minus overused xrefs.
    for concord_path in concordances:
        print(concord_path)
        print('loading', concord_path)
        with open(concord_path, 'r') as handle:
            rows = (raw.strip().split('\t') for raw in handle)
            links = [{row[0], row[2]} for row in rows]
        glom(merged, remove_overused_xrefs(links), unique_prefixes=unique)
    # Several keys can share one value, so collapse the values to distinct frozensets.
    distinct = {frozenset(members) for members in merged.values()}
    for biotype, sets in create_typed_sets(distinct, categories).items():
        write_compendium(sets, f"{biotype.split(':')[-1]}.txt", biotype, {})
Exemple #2
0
def build_protein_compendia(concordances, identifiers):
    """Glom protein identifiers and concordances and write a single PROTEIN compendium.

    :concordances: a list of files from which to read relationships; each line is
                   tab-separated and its first and third columns are taken as a pair
    :identifiers: a list of files from which to read identifiers and optional categories
    """
    unique = [UNIPROTKB, PR]
    merged = {}
    categories = {}  # collected from the identifier files but not consulted below
    for id_path in identifiers:
        print(id_path)
        found_ids, found_types = read_identifier_file(id_path)
        glom(merged, found_ids, unique_prefixes=unique)
        categories.update(found_types)
    for concord_path in concordances:
        print(concord_path)
        print('loading', concord_path)
        with open(concord_path, 'r') as handle:
            rows = (raw.strip().split('\t') for raw in handle)
            links = [{row[0], row[2]} for row in rows]
        glom(merged, links, unique_prefixes=unique)
    protein_sets = {frozenset(members) for members in merged.values()}
    # The lookup table is no longer needed; empty it to keep memory down
    # while the compendium is written.
    merged.clear()
    out_name = PROTEIN.split(':')[-1] + '.txt'
    write_compendium(protein_sets, out_name, PROTEIN, {})
Exemple #3
0
def build_compendia(identifiers):
    """Glom the identifier files together and write a single GENE_FAMILY compendium.

    :identifiers: a list of files from which to read identifiers and optional categories

    No concordance files are read here, and no prefixes are passed as unique.
    """
    no_unique_prefixes = []
    merged = {}
    categories = {}  # collected from the identifier files but not consulted below
    for id_path in identifiers:
        print('loading', id_path)
        found_ids, found_types = read_identifier_file(id_path)
        glom(merged, found_ids, unique_prefixes=no_unique_prefixes)
        categories.update(found_types)
    family_sets = {frozenset(members) for members in merged.values()}
    out_name = GENE_FAMILY.split(':')[-1] + '.txt'
    write_compendium(family_sets, out_name, GENE_FAMILY, {})
def build_compendium(concordances, identifiers, mondoclose, badxrefs):
    """Glom identifiers and concordances (MONDO/HP unique) and write one compendium per type.

    :concordances: a list of files from which to read relationships; each line is
                   tab-separated and its first and third columns are taken as a pair
    :identifiers: a list of files from which to read identifiers and optional categories
    :mondoclose: a tab-separated file; each line maps its first column to its second
                 column in the "close" table handed to glom under the MONDO key
    :badxrefs: a mapping keyed by concordance file basename; the value is passed to
               read_badxrefs to obtain the pairs to drop from that file
    """
    unique_prefixes = [MONDO, HP]
    dicts = {}
    types = {}
    for ifile in identifiers:
        print(ifile)
        new_identifiers, new_types = read_identifier_file(ifile)
        glom(dicts, new_identifiers, unique_prefixes=unique_prefixes)
        types.update(new_types)
    # Load close Mondos
    close_mondos = defaultdict(set)
    with open(mondoclose, 'r') as inf:
        for line in inf:
            fields = line.strip().split('\t')
            close_mondos[fields[0]].add(fields[1])
    # Load and glom concords
    for infile in concordances:
        print(infile)
        pref = path.basename(infile)
        if pref in badxrefs:
            print('reading bad xrefs', pref)
            bad_pairs = read_badxrefs(badxrefs[pref])
        else:
            print('no bad pairs', pref)
            bad_pairs = set()
        pairs = []
        with open(infile, 'r') as inf:
            for line in inf:
                stuff = line.strip().split('\t')
                # Always a 2-tuple, so no length check is needed; a line with
                # fewer than three columns raises IndexError here.
                pair = (stuff[0], stuff[2])
                if pair not in bad_pairs:
                    pairs.append(pair)
        # Only these concordance files get the overused-xref filter.
        if pref in ('MONDO', 'HP', 'EFO'):
            newpairs = remove_overused_xrefs(pairs)
        else:
            newpairs = pairs
        glom(dicts, newpairs, unique_prefixes=unique_prefixes, close={MONDO: close_mondos})
        # Debug trace for one specific identifier. Only a missing key is expected,
        # so catch KeyError rather than everything (a bare except would also
        # swallow KeyboardInterrupt and SystemExit).
        try:
            print(dicts['OMIM:607644'])
        except KeyError:
            print('notyet')
    typed_sets = create_typed_sets({frozenset(x) for x in dicts.values()}, types)
    for biotype, sets in typed_sets.items():
        baretype = biotype.split(':')[-1]
        write_compendium(sets, f'{baretype}.txt', biotype, {})
Exemple #5
0
def build_gene_compendia(concordances, identifiers):
    """Glom gene identifiers and concordances and write a single GENE compendium.

    :concordances: a list of files from which to read relationships; each line is
                   tab-separated and its first and third columns are taken as a pair
    :identifiers: a list of files from which to read identifiers and optional categories
    """
    unique = [NCBIGENE, HGNC, ENSEMBL, OMIM]
    merged = {}
    categories = {}  # collected from the identifier files but not consulted below
    for id_path in identifiers:
        print('loading', id_path)
        found_ids, found_types = read_identifier_file(id_path)
        glom(merged, found_ids, unique_prefixes=unique)
        categories.update(found_types)
    for concord_path in concordances:
        print(concord_path)
        print('loading', concord_path)
        with open(concord_path, 'r') as handle:
            rows = (raw.strip().split('\t') for raw in handle)
            links = [{row[0], row[2]} for row in rows]
        glom(merged, links, unique_prefixes=unique)
    distinct = {frozenset(members) for members in merged.values()}
    out_name = GENE.split(':')[-1] + '.txt'
    write_compendium(distinct, out_name, GENE, {})
Exemple #6
0
def build_compendia(concordances, identifiers, unichem_partial):
    """Glom chemical identifiers and concordances and write a CHEMICAL_SUBSTANCE compendium.

    :concordances: a list of files from which to read relationships; each line is
                   tab-separated and its first and third columns are taken as a pair
    :identifiers: a list of files from which to read identifiers and optional categories
    :unichem_partial: passed to read_partial_unichem, whose result is the starting
                      table that everything else is glommed into
    """
    unique = [INCHIKEY]
    merged = read_partial_unichem(unichem_partial)
    categories = {}  # collected from the identifier files but not consulted below
    for id_path in identifiers:
        print(id_path)
        found_ids, found_types = read_identifier_file(id_path)
        glom(merged, found_ids, unique_prefixes=unique)
        categories.update(found_types)
    for concord_path in concordances:
        print(concord_path)
        print('loading', concord_path)
        with open(concord_path, 'r') as handle:
            rows = (raw.strip().split('\t') for raw in handle)
            links = [{row[0], row[2]} for row in rows]
        glom(merged, remove_overused_xrefs(links), unique_prefixes=unique)
    distinct = {frozenset(members) for members in merged.values()}
    out_name = CHEMICAL_SUBSTANCE.split(':')[-1] + '.txt'
    write_compendium(distinct, out_name, CHEMICAL_SUBSTANCE, {})
Exemple #7
0
def build_compendia(concordances, identifiers):
    """Glom identifiers and concordances (GO unique) and write one compendium per type.

    :concordances: a list of files from which to read relationships; each line is
                   tab-separated and its first and third columns are taken as a pair
    :identifiers: a list of files from which to read identifiers and optional categories

    Pairs listed in bad_concords are dropped before glomming.
    """
    # These are concords that cause problems and are being special cased out.  In disease/process we put these in some
    # files, and maybe we should here too?
    # GO:0034227/EC:2.8.1.4 is because that go term is a biological process, but EC is not a valid prefix for that,
    #  leading to a loss of the EC term (and a unified RHEA) on output.
    # This must be a set *containing* a frozenset. set(frozenset([...])) would iterate
    # the frozenset and produce a set of two strings, which no pair could ever match.
    bad_concords = {frozenset(['GO:0034227', 'EC:2.8.1.4'])}
    unique_prefixes = [GO]
    dicts = {}
    types = {}
    for ifile in identifiers:
        print(ifile)
        new_identifiers, new_types = read_identifier_file(ifile)
        glom(dicts, new_identifiers, unique_prefixes=unique_prefixes)
        types.update(new_types)
    for infile in concordances:
        print(infile)
        print('loading', infile)
        pairs = []
        with open(infile, 'r') as inf:
            for line in inf:
                x = line.strip().split('\t')
                pair = frozenset([x[0], x[2]])
                if pair not in bad_concords:
                    pairs.append(pair)
        # One kind of error is that GO->Reactome xrefs are frequently more like subclass relations. So
        # GO:0004674 (protein serine/threonine kinase) has over 400 Reactome xrefs.
        # remove_overused_xrefs assumes that we want to remove pairs where the second member is overused,
        # but in this case it's the first, so we use the bothways option.
        newpairs = remove_overused_xrefs(pairs, bothways=True)
        glom(dicts, newpairs, unique_prefixes=unique_prefixes)
    typed_sets = create_typed_sets({frozenset(x) for x in dicts.values()}, types)
    for biotype, sets in typed_sets.items():
        baretype = biotype.split(':')[-1]
        write_compendium(sets, f'{baretype}.txt', biotype, {})
def load_diseases_and_phenotypes(concords,idlists,badhpos,badhpoxrefs):
    """Build MONDO, UMLS/MedDRA and EFO id sets, glom them, and write disease/phenotype compendia.

    Intermediate results are dumped to text files along the way via dump_sets /
    dump_dicts, and the final output goes to 'disease.txt' and 'phenotypes.txt'.

    NOTE(review): none of the four parameters is used by the live code below;
    badhpos and badhpoxrefs appear only in the commented-out HPO section.
    NOTE(review): as written this function cannot run to completion -- see the
    notes on hpo_sets and labels below.
    """
    # --- HPO section: disabled. The lines below used to define hpo_sets and labels. ---
    #print('disease/phenotype')
    #print('get and write hp sets')
    #bad_mappings = read_bad_hp_mappings(badhpos)
    #more_bad_mappings = read_badxrefs(badhpoxrefs)
    #for h,m in more_bad_mappings.items():
    #    bad_mappings[h].update(m)
    #hpo_sets,labels = build_sets('HP:0000118', ignore_list = ['ICD','NCIT'], bad_mappings = bad_mappings)
    #print('filter')
    # NOTE(review): hpo_sets is read here before any assignment (its definition is
    # commented out above), so this line raises UnboundLocalError.
    hpo_sets = filter_out_non_unique_ids(hpo_sets)
    #print('ok')
    #dump_sets(hpo_sets,'hpo_sets.txt')
    print('get and write mondo sets')
    #MONDO has disease, and its sister disease susceptibility.  I'm putting both in disease.  Biolink q
    #But! this is a problem right now because there are some things that go in both, and they are getting filtered out
    bad_mondo_mappings = read_badxrefs('mondo')
    mondo_sets_1,labels_1 = build_exact_sets('MONDO:0000001',bad_mondo_mappings)
    mondo_sets_2,labels_2 = build_exact_sets('MONDO:0042489',bad_mondo_mappings)
    mondo_close = get_close_matches('MONDO:0000001')
    mondo_close2 = get_close_matches('MONDO:0042489')
    # Merge the second close-match table into the first; on a shared key the
    # second table's value replaces the first's.
    for k,v in mondo_close2.items():
        mondo_close[k] = v
    dump_sets(mondo_sets_1,'mondo1.txt')
    dump_sets(mondo_sets_2,'mondo2.txt')
    # NOTE(review): labels is never assigned in this function (its only definition
    # is in the commented-out HPO section), so this resolves to a module-level
    # name if one exists and raises NameError otherwise -- confirm.
    labels.update(labels_1)
    labels.update(labels_2)
    #if we just add these together, then any mondo in both lists will get filtered out in the next step.
    #so we need to put them into a set.  You can't put sets directly into a set, you have to freeze them first
    mondo_sets = combine_id_sets(mondo_sets_1,mondo_sets_2)
    mondo_sets = filter_out_non_unique_ids(mondo_sets)
    dump_sets(mondo_sets,'mondo_sets.txt')
    print('get and write umls sets')
    bad_umls = read_badxrefs('umls')
    meddra_umls,secondary_meddra_umls = read_meddra(bad_umls)
    # Both the primary and the secondary MedDRA/UMLS sets are filtered against the
    # combined MONDO + HPO sets; the third argument is a file name handed to filter_umls.
    meddra_umls = filter_umls(meddra_umls,mondo_sets+hpo_sets,'filtered.txt')
    secondary_meddra_umls = filter_umls(secondary_meddra_umls,mondo_sets+hpo_sets,'filtered_secondary.txt')
    #Now, if we just use all the secondary links, things get too agglommed.
    # So instead, lets filter these again.
    meddra_umls += filter_secondaries(secondary_meddra_umls,'double_filter.txt')
    dump_sets(meddra_umls,'meddra_umls_sets.txt')
    dicts = {}
    #EFO has 3 parts that we want here:
    # Disease
    # NOTE(review): build_exact_sets is called with one argument here but with two
    # in the MONDO section above -- presumably the second is optional; confirm.
    efo_sets_1,l = build_exact_sets('EFO:0000408')
    labels.update(l)
    #phenotype
    efo_sets_2,l = build_exact_sets('EFO:0000651')
    labels.update(l)
    #measurement
    efo_sets_3,l = build_exact_sets('EFO:0001444')
    labels.update(l)
    efo_sets_a = combine_id_sets(efo_sets_1,efo_sets_2)
    efo_sets = combine_id_sets(efo_sets_a, efo_sets_3)
    efo_sets = filter_out_non_unique_ids(efo_sets)
    dump_sets(efo_sets,'efo_sets.txt')
    print('put it all together')
    # Glom the sources into dicts one at a time (MONDO, HPO, UMLS, EFO), dumping
    # the accumulated table after each step.
    print('mondo')
    glom(dicts,mondo_sets,unique_prefixes=['MONDO'])
    dump_dicts(dicts,'mondo_dicts.txt')
    print('hpo')
    glom(dicts,hpo_sets,unique_prefixes=['MONDO'],pref='HP')
    dump_dicts(dicts,'mondo_hpo_dicts.txt')
    print('umls')
    glom(dicts,meddra_umls,unique_prefixes=['MONDO','HP'],pref='UMLS',close={'MONDO':mondo_close})
    dump_dicts(dicts,'mondo_hpo_meddra_dicts.txt')
    print('efo')
    glom(dicts,efo_sets,unique_prefixes=['MONDO','HP'],pref='EFO')
    dump_dicts(dicts,'mondo_hpo_meddra_efo_dicts.txt')
    print('dump it')
    # Several keys can share one value, so collapse the values to distinct frozensets.
    fs = set([frozenset(x) for x in dicts.values()])
    # NOTE(review): create_typed_sets is called with a single argument here and its
    # result is unpacked into exactly two values -- confirm this matches the
    # signature and return shape of the create_typed_sets in scope.
    diseases,phenotypes = create_typed_sets(fs)
    write_compendium(diseases,'disease.txt','biolink:Disease',labels)
    write_compendium(phenotypes,'phenotypes.txt','biolink:PhenotypicFeature',labels)