Esempio n. 1
0
def load_gaf(filename, collection, start=0):
    """Load the annotations of a GAF file into ``collection``.

    Every record produced by ``gafiterator`` is upserted under an ``_id``
    equal to its position in the iterator, so loading the same file again
    overwrites the earlier documents instead of duplicating them.

    A record is skipped (but still counted by the progress bar) when its
    position is below ``start``, when ``args.noiea`` is set and its evidence
    code is ``IEA``, or when ``args.exp`` is set and its evidence code is not
    in ``exp_codes``. ``args`` and ``exp_codes`` are module-level names.

    Args:
        filename: Path of the GAF file to read.
        collection: Object exposing ``create_index`` and ``update_one``
            (presumably a pymongo collection -- confirm against the caller).
        start: Position of the first record to load.
    """
    print("Loading: %s" % filename)

    for field in ("DB_Object_ID", "DB", "GO_ID", "Evidence", "Aspect", "Date"):
        collection.create_index(field)

    # The line count is only used as the progress bar total.
    n = count_lines(filename, sep=b'\n')

    # Both resources are context-managed so the bar and the file are released
    # even when parsing or a database call raises part-way through.
    with tqdm(total=n, desc="annotations loaded") as pbar, \
            open(filename, 'r') as handler:

        for i, data in enumerate(gafiterator(handler)):

            skip = (
                i < start
                or (args.noiea and data['Evidence'] == 'IEA')
                or (args.exp and data['Evidence'] not in exp_codes)
            )

            if not skip:
                document = {
                    "DB_Object_ID": data['DB_Object_ID'],
                    "DB_Object_Symbol": data['DB_Object_Symbol'],
                    "With": data['With'],
                    "Assigned_By": data['Assigned_By'],
                    "Annotation_Extension": data['Annotation_Extension'],
                    "Gene_Product_Form_ID": data['Gene_Product_Form_ID'],
                    "DB:Reference": data['DB:Reference'],
                    "GO_ID": data['GO_ID'],
                    "Qualifier": data['Qualifier'],
                    # strptime already yields a midnight datetime for a
                    # YYYYMMDD string, so no date/ordinal round trip is needed.
                    "Date": datetime.datetime.strptime(data['Date'], "%Y%m%d"),
                    "DB": data['DB'],
                    "created_at": datetime.datetime.utcnow(),
                    "DB_Object_Name": data['DB_Object_Name'],
                    "DB_Object_Type": data['DB_Object_Type'],
                    "Evidence": data['Evidence'],
                    "Taxon_ID": data['Taxon_ID'],
                    "Aspect": data['Aspect'],
                }

                collection.update_one({"_id": i}, {'$set': document}, upsert=True)

            pbar.update(1)

    print("\nFinished!")
Esempio n. 2
0
def go_annotations(filepath: str):
    """Open a GO annotation file and hand out a parser over it.

    This is a generator function: it yields exactly one value, the object
    returned by ``gafiterator`` for the opened file. The file stays open while
    the generator is suspended at that yield and is closed when the generator
    is resumed or closed.

    # Arguments
        filepath: str, filepath of GO annotation file

    # Yields
        the object built by gafiterator from the open file

    # Raises
        FileNotFoundError: propagated unchanged when the file cannot be found
    """
    # NOTE(review): the single-yield shape looks meant for use with
    # contextlib.contextmanager -- confirm how callers consume this.
    with open(filepath) as handle:
        yield gafiterator(handle)
Esempio n. 3
0
def parse_gaf(fn):
    """Build gene <-> GO term lookup tables from a GAF file.

    Args:
        fn: Path of the GAF file. A ``.gz`` extension (case-insensitive)
            selects transparent gzip decompression.

    Returns:
        A ``(gene_by_go_term, go_term_by_gene)`` tuple of plain dicts. The
        first maps each ``GO_ID`` to the list of ``DB_Object_Symbol`` values
        annotated with it; the second maps each symbol to its list of GO IDs.
        Both lists keep file order and may contain duplicates.
    """
    is_gzipped = os.path.splitext(fn)[1].lower() == '.gz'
    open_func = gzip.open if is_gzipped else open

    gene_by_go_term = {}
    go_term_by_gene = {}
    # Open in text mode: a binary handle yields bytes lines, which a str-based
    # parser cannot match (presumably gafiterator is Biopython's
    # Bio.UniProt.GOA.gafiterator, which compares the header line with str
    # literals -- confirm). 'rt' works for both open() and gzip.open().
    with open_func(fn, 'rt') as f:
        for rec in gafiterator(f):
            gs = rec['DB_Object_Symbol']
            go_id = rec['GO_ID']
            gene_by_go_term.setdefault(go_id, []).append(gs)
            go_term_by_gene.setdefault(gs, []).append(go_id)
    return gene_by_go_term, go_term_by_gene
Esempio n. 4
0
def getAnnotations(filename='data/goa_uniprot_all.gaf.gz'):
    """Count how many annotations in a gzipped GAF file hit each GO term.

    The counter is seeded with every node of the module-level graph ``G``.
    GO IDs found in the file that are not nodes of ``G`` are printed and
    otherwise ignored.

    Args:
        filename: Path of the gzipped GAF file to scan. Defaults to the
            path that was previously hard-coded.

    Returns:
        dict mapping each GO ID in ``G`` to its number of annotations.
    """
    # Presumably the approximate number of records in the default file; it is
    # only used for the progress heartbeat below -- TODO confirm.
    expected_total = 424606000

    annotDict = {goID: 0 for goID in G.nodes()}

    with gzip.open(filename, 'rt') as fp:
        count = 0
        for annotation in gafiterator(fp):
            # Heartbeat: print a marker each time more than 5% of the
            # expected total has been read since the last marker.
            count += 1
            if float(count) * 100 / expected_total > 5:
                print('hi')
                count = 0

            # str() instead of unicode(): unicode does not exist on Python 3,
            # and ASCII str/unicode keys hash alike on Python 2.
            goID = str(annotation['GO_ID'])
            try:
                annotDict[goID] += 1
            except KeyError:
                # GO ID present in the file but absent from the graph.
                print(goID)
    return annotDict

# Keep only the Ensembl IDs that pass the membership test against
# UP_ID['EnsemblID'].
# NOTE(review): if UP_ID is a pandas DataFrame, `in` on a Series tests the
# index labels rather than the values -- confirm this is the intended check.
newEnsemblIDs = [i for i in EnsemblIDs if i in UP_ID['EnsemblID']]

# Translate the retained Ensembl IDs through the ensembl_to_up mapping.
C_int_UP = [ensembl_to_up[x] for x in newEnsemblIDs]


#Enrichment Analysis
go = obo.GODag('/disks/strw13/DBDM/A4_2/go-basic.obo')

# funcs: DB_Object_ID -> last annotation record seen for that ID (the record
#        no longer contains its 'DB_Object_ID' key, which is popped).
# assoc: DB_Object_ID -> set of every GO ID annotated to that ID.
funcs = {}
assoc = {}
with gzip.open('goa_human.gaf.gz', 'rt') as fp:
    for entry in gafiterator(fp):
        uniprot_id = entry.pop('DB_Object_ID')
        funcs[uniprot_id] = entry
        # Accumulate here, while every record is still visible: funcs keeps
        # only one record per ID, so deriving assoc from funcs afterwards
        # would leave exactly one GO ID in each set.
        assoc.setdefault(uniprot_id, set()).add(str(entry['GO_ID']))

pop = funcs.keys()

# Annotation records restricted to the IDs selected above.
dictionary = {x: funcs[x]
              for x in funcs
              if x in C_int_UP}
    #handle = "goa_cow.gaf"
    #handle = "goa_dog.gaf"
    #handle = "goa_pig.gaf"
    #handle = "goa_fly.gaf"
    #handle = "goa_worm.gaf"
    #handle = "goa_yeast.gaf"
    #handle = open(handle)

    genes = defaultdict(Gene)

    reg_targets = defaultdict(int)

    annext_cnt = 0

    handle = open(gaffile)
    for rec in gafiterator(handle):

        if rec["Annotation_Extension"] and rec["DB"] == "UniProtKB":
            # print rec["Annotation_Extension"]
            protid = rec["DB_Object_ID"]

            genes[protid].add_annotation(rec)
            annext_cnt += 1

    print("%d annotations has extensions" % annext_cnt)

    dbs = {"UniProtKB": []}

#%% Genes has_regulation_target
for gene_id, gene in genes.items():
    #    print("gene", gene_id)
Esempio n. 7
0
import gzip

# GO graph built from the OBO file.
g = GODag(obo_file='data/go.obo')


# Fields of a GAF annotation record:
#   'DB': the protein database;
#   'DB_Object_ID': protein ID;
#   'Qualifier': annotation qualifier (such as NOT);
#   'GO_ID': GO term;
#   'Evidence': evidence code
# filename = <LOCATION OF GAF FILE>
filename = 'data/goa_human.gaf.gz'

# x maps each protein ID to a GO term; when a protein has several records the
# one read last is the one that is kept.
x = {}
with gzip.open(filename, 'rt') as fp:
    for annotation in gafiterator(fp):
        x[annotation['DB_Object_ID']] = annotation['GO_ID']


# To get the GO entry for a particular protein's accession number:
#   g[x['O43707']]
# and use .name etc, to get terms of that GO entry

f = 'output/result.pkl'
# NOTE(review): pickle.load executes arbitrary code from the file; only use it
# on result files this project produced itself.
with open(f, 'rb') as pickled:
    df = pickle.load(pickled)

# One entry per value of df['uniprot']: the GO entry when the accession is
# known to x, otherwise an empty string.
go_hits = []
for accession in df['uniprot']:
    if accession in x:
        go_hits.append(g[x[accession]])
    else:
        go_hits.append('')

# Names of the GO entries found above, with '' carried through for misses.
name_hits = []
for hit in go_hits:
    name_hits.append(hit.name if hit != '' else '')