# Example 1
def build_compendia(concordances, identifiers):
    """Glom identifiers and concordances into equivalence sets, split them
    by biotype, and write one compendium file per biotype.

    :concordances: a list of files from which to read relationships
    :identifiers: a list of files from which to read identifiers and optional categories"""
    equivalence_map = {}
    category_map = {}
    unique = [UBERON, GO]
    # Seed the equivalence map from the identifier files.
    for identifier_file in identifiers:
        print(identifier_file)
        ids, categories = read_identifier_file(identifier_file)
        glom(equivalence_map, ids, unique_prefixes=unique)
        category_map.update(categories)
    # Fold each concordance in, pruning overused xrefs first.
    for concord_file in concordances:
        print(concord_file)
        print('loading', concord_file)
        with open(concord_file, 'r') as concord:
            # Columns 0 and 2 hold the two identifiers of each relationship.
            xref_pairs = [{fields[0], fields[2]} for fields in
                          (line.strip().split('\t') for line in concord)]
        glom(equivalence_map, remove_overused_xrefs(xref_pairs), unique_prefixes=unique)
    # Partition the glommed sets by biotype and emit one file per type.
    typed_sets = create_typed_sets({frozenset(members) for members in equivalence_map.values()},
                                   category_map)
    for biotype, eqsets in typed_sets.items():
        baretype = biotype.split(':')[-1]
        write_compendium(eqsets, f'{baretype}.txt', biotype, {})
# Example 2
def build_protein_compendia(concordances, identifiers):
    """Assemble protein equivalence sets and write the Protein compendium.

    :concordances: a list of files from which to read relationships
    :identifiers: a list of files from which to read identifiers and optional categories"""
    equivalence_map = {}
    category_map = {}
    unique = [UNIPROTKB, PR]
    for identifier_file in identifiers:
        print(identifier_file)
        ids, categories = read_identifier_file(identifier_file)
        glom(equivalence_map, ids, unique_prefixes=unique)
        category_map.update(categories)
    for concord_file in concordances:
        print(concord_file)
        print('loading', concord_file)
        with open(concord_file, 'r') as concord:
            # Columns 0 and 2 hold the two identifiers of each relationship.
            xref_pairs = [{fields[0], fields[2]} for fields in
                          (line.strip().split('\t') for line in concord)]
        glom(equivalence_map, xref_pairs, unique_prefixes=unique)
    protein_sets = {frozenset(members) for members in equivalence_map.values()}
    #Try to preserve some memory here.
    equivalence_map.clear()
    baretype = PROTEIN.split(':')[-1]
    write_compendium(protein_sets, f'{baretype}.txt', PROTEIN, {})
# Example 3
def build_compendia(identifiers):
    """Build gene-family equivalence sets from identifier files alone and
    write the GeneFamily compendium.  (Unlike its siblings, this builder
    takes no concordance files.)

    :identifiers: a list of files from which to read identifiers and optional categories"""
    equivalence_map = {}
    category_map = {}
    # No prefix is required to be unique for gene families.
    no_uniques = []
    for identifier_file in identifiers:
        print('loading', identifier_file)
        ids, categories = read_identifier_file(identifier_file)
        glom(equivalence_map, ids, unique_prefixes=no_uniques)
        category_map.update(categories)
    family_sets = {frozenset(members) for members in equivalence_map.values()}
    baretype = GENE_FAMILY.split(':')[-1]
    write_compendium(family_sets, f'{baretype}.txt', GENE_FAMILY, {})
# Example 4
def build_compendium(concordances, identifiers, mondoclose, badxrefs):
    """Build disease compendia by glomming identifiers and concordances.

    :concordances: a list of files from which to read relationships
    :identifiers: a list of files from which to read identifiers and optional categories
    :mondoclose: path to a tab-separated file of close MONDO pairings
    :badxrefs: mapping from a concordance basename to a file of xref pairs to exclude
    """
    dicts = {}
    types = {}
    for ifile in identifiers:
        print(ifile)
        new_identifiers, new_types = read_identifier_file(ifile)
        glom(dicts, new_identifiers, unique_prefixes=[MONDO, HP])
        types.update(new_types)
    #Load close Mondos
    close_mondos = defaultdict(set)
    with open(mondoclose, 'r') as inf:
        for line in inf:
            x = tuple(line.strip().split('\t'))
            close_mondos[x[0]].add(x[1])
    #Load and glom concords
    for infile in concordances:
        print(infile)
        pairs = []
        pref = path.basename(infile)
        if pref in badxrefs:
            print('reading bad xrefs', pref)
            bad_pairs = read_badxrefs(badxrefs[pref])
        else:
            print('no bad pairs', pref)
            bad_pairs = set()
        with open(infile, 'r') as inf:
            for line in inf:
                stuff = line.strip().split('\t')
                # Columns 0 and 2 hold the two identifiers of the relationship.
                # (The previous `len(x) != 2` / exit() guard was unreachable:
                # a tuple built from two elements always has length 2.)
                x = (stuff[0], stuff[2])
                if x not in bad_pairs:
                    pairs.append(x)
        # These ontologies' xrefs are often promiscuous, so prune overused ones.
        if pref in ['MONDO', 'HP', 'EFO']:
            newpairs = remove_overused_xrefs(pairs)
        else:
            newpairs = pairs
        glom(dicts, newpairs, unique_prefixes=[MONDO, HP], close={MONDO: close_mondos})
        # Debug trace showing how this clique evolves per concord.  The original
        # bare `except:` (which also swallowed SystemExit/KeyboardInterrupt) is
        # replaced by .get() with the same printed fallback.
        print(dicts.get('OMIM:607644', 'notyet'))
    typed_sets = create_typed_sets(set(frozenset(x) for x in dicts.values()), types)
    for biotype, sets in typed_sets.items():
        baretype = biotype.split(':')[-1]
        write_compendium(sets, f'{baretype}.txt', biotype, {})
# Example 5
def build_gene_compendia(concordances, identifiers):
    """Assemble gene equivalence sets and write the Gene compendium.

    :concordances: a list of files from which to read relationships
    :identifiers: a list of files from which to read identifiers and optional categories"""
    equivalence_map = {}
    category_map = {}
    unique = [NCBIGENE, HGNC, ENSEMBL, OMIM]
    for identifier_file in identifiers:
        print('loading', identifier_file)
        ids, categories = read_identifier_file(identifier_file)
        glom(equivalence_map, ids, unique_prefixes=unique)
        category_map.update(categories)
    for concord_file in concordances:
        print(concord_file)
        print('loading', concord_file)
        with open(concord_file, 'r') as concord:
            # Columns 0 and 2 hold the two identifiers of each relationship.
            xref_pairs = [{fields[0], fields[2]} for fields in
                          (line.strip().split('\t') for line in concord)]
        glom(equivalence_map, xref_pairs, unique_prefixes=unique)
    gene_sets = {frozenset(members) for members in equivalence_map.values()}
    baretype = GENE.split(':')[-1]
    write_compendium(gene_sets, f'{baretype}.txt', GENE, {})
# Example 6
def build_compendia(concordances, identifiers, unichem_partial):
    """Assemble chemical equivalence sets and write the ChemicalSubstance
    compendium, seeding the equivalence map from partial UniChem data.

    :concordances: a list of files from which to read relationships
    :identifiers: a list of files from which to read identifiers and optional categories
    :unichem_partial: input for read_partial_unichem, used to seed the map"""
    equivalence_map = read_partial_unichem(unichem_partial)
    category_map = {}
    unique = [INCHIKEY]
    for identifier_file in identifiers:
        print(identifier_file)
        ids, categories = read_identifier_file(identifier_file)
        glom(equivalence_map, ids, unique_prefixes=unique)
        category_map.update(categories)
    for concord_file in concordances:
        print(concord_file)
        print('loading', concord_file)
        with open(concord_file, 'r') as concord:
            # Columns 0 and 2 hold the two identifiers of each relationship.
            xref_pairs = [{fields[0], fields[2]} for fields in
                          (line.strip().split('\t') for line in concord)]
        glom(equivalence_map, remove_overused_xrefs(xref_pairs), unique_prefixes=unique)
    chem_sets = {frozenset(members) for members in equivalence_map.values()}
    baretype = CHEMICAL_SUBSTANCE.split(':')[-1]
    write_compendium(chem_sets, f'{baretype}.txt', CHEMICAL_SUBSTANCE, {})
# Example 7
def build_compendia(concordances, identifiers):
    """Build GO-anchored compendia from identifier and concordance files,
    filtering out known-bad concords and overused xrefs.

    :concordances: a list of files from which to read relationships
    :identifiers: a list of files from which to read identifiers and optional categories"""
    #These are concords that cause problems and are being special cased out.  In disease/process we put these in some
    # files, and maybe we should here too?
    #GO:0034227/EC:2.8.1.4 is because that go term is a biological process, but EC is not a valid prefix for that,
    #  leading to a loss of the EC term (and a unified RHEA) on output.
    # Bug fix: the original `set(frozenset([...]))` iterated the frozenset and
    # produced a set of the two *strings*, so the `pair not in bad_concords`
    # test below (pair is a frozenset) was always True and the bad concord was
    # never filtered.  We need a set whose single element is the frozenset.
    bad_concords = {frozenset(['GO:0034227', 'EC:2.8.1.4'])}
    dicts = {}
    types = {}
    for ifile in identifiers:
        print(ifile)
        new_identifiers, new_types = read_identifier_file(ifile)
        glom(dicts, new_identifiers, unique_prefixes=[GO])
        types.update(new_types)
    for infile in concordances:
        print(infile)
        print('loading', infile)
        pairs = []
        with open(infile, 'r') as inf:
            for line in inf:
                x = line.strip().split('\t')
                # Columns 0 and 2 hold the two identifiers of the relationship.
                pair = frozenset([x[0], x[2]])
                if pair not in bad_concords:
                    pairs.append(pair)
        #one kind of error is that GO->Reactome xrefs are frequently more like subclass relations. So
        # GO:0004674 (protein serine/threonine kinase) has over 400 Reactome xrefs
        # remove_overused_xrefs assumes that we want to remove pairs where the second pair is overused
        # but in this case it's the first, so we use the bothways option
        newpairs = remove_overused_xrefs(pairs, bothways=True)
        glom(dicts, newpairs, unique_prefixes=[GO])
    typed_sets = create_typed_sets(set(frozenset(x) for x in dicts.values()),
                                   types)
    for biotype, sets in typed_sets.items():
        baretype = biotype.split(':')[-1]
        write_compendium(sets, f'{baretype}.txt', biotype, {})