Ejemplo n.º 1
0
def filter_in_IEA(handle):
    """Copy the records with evidence code ``IEA`` to ``<handle.name>.IEA``.

    Args:
        handle: Open GAF file object. Its ``name`` attribute is used to build
            the output path and its records are read via ``upg.gafiterator``.
    """
    # The context manager closes the output even if parsing or writing fails.
    with open(handle.name + ".IEA", "w") as outhandle:
        outhandle.write('!gaf-version: 2.0\n')
        for inrec in upg.gafiterator(handle):
            if inrec['Evidence'] == 'IEA':
                upg.writerec(inrec, outhandle)
Ejemplo n.º 2
0
def extract_gaf(rec, outfile, GAFFIELDS, record, sp_id, taxon):
    """Write ``rec`` to ``outfile`` and append its flattened form to ``record``.

    Only a record whose ``DB_Object_ID`` is a key of ``sp_id`` is handled;
    any other record leaves ``outfile`` and ``record`` untouched.

    Args:
        rec: Annotation record (mapping) keyed by GAF field names.
        outfile: Handle passed through to ``GOAParser.writerec``.
        GAFFIELDS: Field list of the GAF flavour in use; its length (15 or
            17) selects the layout of the flattened tuple.
        record: List that receives the flattened tuple (mutated in place).
        sp_id: Mapping whose keys are the accepted ``DB_Object_ID`` values.
        taxon: Unused; kept so existing callers keep working.

    Returns:
        The same ``record`` list that was passed in.
    """
    # ``in`` replaces ``dict.has_key``, which only exists on Python 2.
    if rec['DB_Object_ID'] not in sp_id:
        return record

    GOAParser.writerec(rec, outfile, GAFFIELDS)

    # Stays empty (and is still appended) when the field count is neither
    # 15 nor 17, exactly as before.
    t = ()

    if len(GAFFIELDS) == 15:
        t = (rec['DB'], rec['DB_Object_ID'], rec['DB_Object_Symbol'],
             ('|'.join(rec['Qualifier'])), rec['GO_ID'],
             ('|'.join(rec['DB:Reference'])), rec['Evidence'],
             ('|'.join(rec['With'])), rec['Aspect'], rec['DB_Object_Name'],
             ('|'.join(rec['Synonym'])), rec['DB_Object_Type'],
             ('|'.join(rec['Taxon_ID'])), rec['Date'], rec['Assigned_By'])

    elif len(GAFFIELDS) == 17:
        # NOTE(review): unlike the 15-field layout, DB_Object_Name is joined
        # with '|' here -- confirm the parser yields a list for this field.
        t = (rec['DB'], rec['DB_Object_ID'], rec['DB_Object_Symbol'],
             ('|'.join(rec['Qualifier'])), rec['GO_ID'],
             ('|'.join(rec['DB:Reference'])), rec['Evidence'],
             ('|'.join(rec['With'])), rec['Aspect'],
             ('|'.join(rec['DB_Object_Name'])), ('|'.join(rec['Synonym'])),
             rec['DB_Object_Type'], ('|'.join(rec['Taxon_ID'])),
             rec['Date'], rec['Assigned_By'], rec['Annotation_Extension'],
             rec['Gene_Product_Form_ID'])

    record.append(t)
    return record
Ejemplo n.º 3
0
def filter_in_experimental(handle):
    """Copy the experimentally supported records to ``<handle.name>.exp_evidence``.

    A record is kept when ``upg.record_has`` matches its ``Evidence`` field
    against the module-level ``GO_EXP_EC`` collection.

    Args:
        handle: Open GAF file object; needs a ``name`` attribute.
    """
    # The context manager closes the output even if parsing or writing fails.
    with open(handle.name + ".exp_evidence", "w") as outhandle:
        outhandle.write('!gaf-version: 2.0\n')
        for inrec in upg.gafiterator(handle):
            if upg.record_has(inrec, {'Evidence': GO_EXP_EC}):
                upg.writerec(inrec, outhandle)
Ejemplo n.º 4
0
def all_hasnt_experimental(handle):
    """Copy all records of proteins lacking experimental evidence to ``<handle.name>.noexp``.

    Records are read grouped per protein (``upg.gafbyproteiniterator``); a
    group is written out in full when ``has_experimental`` is false for it.

    Args:
        handle: Open GAF file object; needs a ``name`` attribute.
    """
    # The context manager closes the output even if parsing or writing fails.
    with open(handle.name + ".noexp", "w") as outhandle:
        outhandle.write('!gaf-version: 2.0\n')

        for protrec in upg.gafbyproteiniterator(handle):
            if has_experimental(protrec):
                continue
            for outrec in protrec:
                upg.writerec(outrec, outhandle)
Ejemplo n.º 5
0
def all_exclusive_IEA(handle):
    """Copy all records of proteins accepted by ``exclusive_IEA`` to ``<handle.name>.exclusive_IEA``.

    Records are read grouped per protein (``upg.gafbyproteiniterator``); a
    group is written out in full when ``exclusive_IEA`` is true for it.

    Args:
        handle: Open GAF file object; needs a ``name`` attribute.
    """
    # The context manager closes the output even if parsing or writing fails.
    with open(handle.name + ".exclusive_IEA", "w") as outhandle:
        outhandle.write('!gaf-version: 2.0\n')

        for protrec in upg.gafbyproteiniterator(handle):
            if not exclusive_IEA(protrec):
                continue
            for outrec in protrec:
                upg.writerec(outrec, outhandle)
Ejemplo n.º 6
0
def extract_taxon(handle, in_taxid):
    """Create a GAF file holding only the records of a single taxon.

    The output is written to ``<handle.name>.taxon.<taxid>``.

    Args:
        handle: Open GAF file object; needs a ``name`` attribute.
        in_taxid: Taxon identifier, either an ``int`` or a string
            (surrounding whitespace is ignored).
    """
    header = "!gaf-version: 2.0\n"
    # The string branch used to run unconditionally, so an int argument
    # crashed on ``.strip()``.
    if isinstance(in_taxid, int):
        taxid = str(in_taxid)
    else:
        taxid = in_taxid.strip()
    # The context manager closes the output even if parsing or writing fails.
    with open("%s.taxon.%s" % (handle.name, taxid), 'w') as outfile:
        outfile.write(header)
        for inrec in upg.gafiterator(handle):
            # Compare the part after the colon of the first Taxon_ID entry.
            if inrec['Taxon_ID'][0].split(':')[1] == taxid:
                upg.writerec(inrec, outfile)
def build_clusters(species):
    """
        Build GO clusters from a species gene association file and pickle them.

        Three mappings are built from the PMID references of every record:
          * GO term -> {'proteins': ..., 'papers': ...}
          * PMID    -> GO terms
          * PMID    -> proteins
        Each is handed to ``pickle_data`` under ``Pickled_Data/`` in CURR_PATH.

        @param species: suffix of the association file
                        ``GOA_Files/gene_association.goa_<species>``.
    """
    go_clusters = {}
    pmid_go = {}
    pmid_prot = {}
    gaf_path = os.path.join(CURR_PATH,"GOA_Files/gene_association.goa_"+species)
    # The input used to be left open; the context manager closes it.
    with open(gaf_path) as unigoa_file:
        for inrec in GOA.gafiterator(unigoa_file):
            for dbref in inrec['DB:Reference']:
                # Only references of the form "PMID:<id>" are considered.
                if dbref[:4] != 'PMID':
                    continue
                pmid = dbref[5:]
                go_id = inrec['GO_ID']
                prot_id = inrec['DB_Object_ID']

                if go_id not in go_clusters:
                    go_clusters[go_id] = {'proteins': Set(), 'papers': Set()}
                go_clusters[go_id]['proteins'].add(prot_id)
                go_clusters[go_id]['papers'].add(pmid)

                if pmid not in pmid_go:
                    pmid_go[pmid] = Set()
                pmid_go[pmid].add(go_id)

                if pmid not in pmid_prot:
                    pmid_prot[pmid] = Set()
                pmid_prot[pmid].add(prot_id)

    pickle_data(go_clusters, os.path.join(CURR_PATH, "Pickled_Data/go_clusters_"+species))
    pickle_data(pmid_go, os.path.join(CURR_PATH, "Pickled_Data/pmid_go_"+species))
    pickle_data(pmid_prot, os.path.join(CURR_PATH, "Pickled_Data/pmid_prot_"+species))
def pmids_from_gaf(gaf_file):
    """
        Collect the PMIDs cited in a GAF file together with the GO terms and
        proteins annotated through each of them.

        @param gaf_file: path of the association file in GAF format.
        @return: tuple ``(pmids, pmid_go, pmid_prot)`` -- the list of distinct
                 PMIDs, a dict PMID -> list of distinct GO IDs and a dict
                 PMID -> list of distinct DB_Object_IDs.
    """

    pmid_go = {}
    pmids = {}
    pmid_prot = {}
    # The input used to be left open; the context manager closes it.
    with open(gaf_file) as unigoa_file:
        for inrec in GOA.gafiterator(unigoa_file):
            for dbref in inrec['DB:Reference']:
                # Only references of the form "PMID:<id>" are considered.
                if dbref[:4] != 'PMID':
                    continue
                pmid = dbref[5:]
                pmids[pmid] = None

                go_terms = pmid_go.setdefault(pmid, [])
                if inrec['GO_ID'] not in go_terms:
                    go_terms.append(inrec['GO_ID'])

                proteins = pmid_prot.setdefault(pmid, [])
                if inrec['DB_Object_ID'] not in proteins:
                    proteins.append(inrec['DB_Object_ID'])

    return list(pmids.keys()), pmid_go, pmid_prot
Ejemplo n.º 9
0
def get_ebi(uri):
    '''
    Download (once) a gzipped GAF file from ftp.ebi.ac.uk and parse it.

    The file is cached as ``./data/<last path component of uri>``; when that
    file already exists no download happens.

    Args:
        uri: Path of the file on the FTP server, as used in ``RETR <uri>``.

    Returns:
        dict mapping ``DB_Object_ID`` to the rest of the annotation record.
        When an ID occurs in several records only the last one is kept.
    '''
    data_folder = os.getcwd() + '/data'
    fn = uri.split('/')[-1]
    gaf = os.path.join(data_folder, fn)
    # Check if the file exists already
    if not os.path.isfile(gaf):
        os.makedirs(data_folder, exist_ok=True)
        # Download under a temporary name so that an interrupted transfer is
        # never mistaken for a complete cached file on the next call.
        partial = gaf + '.part'
        # The context manager quits/closes the connection even on errors.
        with FTP('ftp.ebi.ac.uk') as ebi_ftp:
            ebi_ftp.login()  # Logs in anonymously
            with open(partial, 'wb') as fp:
                ebi_ftp.retrbinary(f'RETR {uri}', fp.write)
        os.replace(partial, gaf)

    funcs = {}
    # The file is gzip-compressed, so it is opened through gzip in text mode.
    with gzip.open(gaf, 'rt') as gaf_fp:
        for entry in GOA.gafiterator(gaf_fp):
            uniprot_id = entry.pop('DB_Object_ID')
            funcs[uniprot_id] = entry
    return funcs
Ejemplo n.º 10
0
def extract_taxa(handle, taxalist):
    """
    Create one GAF file per taxon from a multi-taxon GAF file.

    Each output is written to ``<handle.name>.taxon.<taxid>``.

    Args:
        handle: Open GAF file object; needs a ``name`` attribute.
        taxalist: List of taxid strings. Don't use a list of int: the ids
            are compared against strings taken from the records.
    """
    outfiles = {}
    header = "!gaf-version: 2.0\n"
    try:
        for taxid in taxalist:
            if taxid in outfiles:
                # A duplicate id would reopen (and truncate) the same file
                # and leak the first handle.
                continue
            outfiles[taxid] = open("%s.taxon.%s" % (handle.name, taxid), 'w')
            outfiles[taxid].write(header)
        for inrec in upg.gafiterator(handle):
            # Compare the part after the colon of the first Taxon_ID entry.
            cur_taxid = inrec['Taxon_ID'][0].split(':')[1]
            # Dict lookup instead of scanning the list for every record.
            if cur_taxid in outfiles:
                upg.writerec(inrec, outfiles[cur_taxid])
    finally:
        # Close whatever was opened, even if parsing or writing failed.
        for outfile in outfiles.values():
            outfile.close()
Ejemplo n.º 11
0
    def go_enrichment_study(self):
        """Return the ``GOEnrichmentStudy`` for this object, building it on first use.

        On the first call the human annotations are read from
        ``../DownloadedResources/goa_human.gaf.gz``, the study is created with
        ``self.gene_symbols()`` as population, and the result is cached in
        ``self._go_enrichment_study``. Later calls return the cached object.
        """
        if self._go_enrichment_study is None:

            # Load the human annotations, collecting every GO term per gene
            # symbol. The previous code stored one record per symbol and
            # overwrote it on each new annotation, so each gene ended up with
            # only its last GO term.
            associations = {}
            with gzip.open('../DownloadedResources/goa_human.gaf.gz',
                           'rt') as gaf:
                for entry in GOA.gafiterator(gaf):
                    symbol = entry['DB_Object_Symbol']
                    associations.setdefault(symbol, set()).add(
                        str(entry['GO_ID']))
            # Our population is the set of genes we are analysing
            population = self.gene_symbols()
            print("We have %d genes in our population" % len(population))
            self._go_enrichment_study = \
                GOEnrichmentStudy(population, associations, self._gene_ontology,
                                  propagate_counts=True,
                                  alpha=0.01,
                                  methods=[self.method])
        return self._go_enrichment_study
Ejemplo n.º 12
0
    def perform_gene_enrichment_analysis(self, metagene_matrix, method='fdr'):
        """Run a GO enrichment study per component and write the merged result.

        The merged table is written, tab-separated, to
        ``self.cache_dir + '<self.prefix>_gea_all.tsv'``. Nothing is returned.

        Args:
            metagene_matrix: Matrix whose second dimension is the number of
                components; passed to ``self.ranked_genes_by_component``.
            method: Multiple-testing correction passed to
                ``GOEnrichmentStudy`` via ``methods=[method]``.
        """
        n_comps = metagene_matrix.shape[1]

        # Download ontology and annotations, if necessary, then load the
        # Gene Ontology.
        self.download_and_cache_resources()
        gene_ontology = obo_parser.GODag('../DownloadedResources/go-basic.obo')

        # Load the human annotations, collecting every GO term per gene
        # symbol. The previous code stored one record per symbol and
        # overwrote it on each new annotation, so each gene ended up with
        # only its last GO term.
        associations = {}
        with gzip.open('../DownloadedResources/goa_human.gaf.gz', 'rt') as gaf:
            for entry in GOA.gafiterator(gaf):
                symbol = entry['DB_Object_Symbol']
                associations.setdefault(symbol, set()).add(str(entry['GO_ID']))

        # Our population is the set of genes we are analysing
        population = self.gene_symbols()
        print("We have %d genes in our population" % len(population))

        gea = GOEnrichmentStudy(population,
                                associations,
                                gene_ontology,
                                propagate_counts=True,
                                alpha=0.05,
                                methods=[method])
        gea_results_by_component = {}
        rankings = self.ranked_genes_by_component(metagene_matrix)
        for ci in range(n_comps):
            study_genes = rankings[ci]
            print('\nComp. %d: %s...' % (ci, str(study_genes[:10])))
            gea_results_by_component[ci] = gea.run_study(study_genes)

        # Turn the results into one dataframe per component; components for
        # which the helper returns None are left out.
        gea_results_df_by_component = []
        for ci in range(n_comps):
            ge_df = self._perform_gene_enrichment_analysis_one_component(
                ci, gea_results_by_component, gea)
            if ge_df is not None:
                gea_results_df_by_component.append(ge_df)

        # Merge the per-component dataframes into a single one.
        # DataFrame.append no longer exists in pandas 2; pd.concat is the
        # replacement, but it rejects an empty list, hence the fallback.
        if gea_results_df_by_component:
            gea_all_sig_results_df = pd.concat(gea_results_df_by_component)
        else:
            gea_all_sig_results_df = pd.DataFrame()

        gea_all_sig_results_df.to_csv(self.cache_dir +
                                      '%s_gea_all.tsv' % self.prefix,
                                      sep='\t')
Ejemplo n.º 13
0
 def test_gpi_iterator(self):
     """Check gpi_iterator on a gpi-version 1.1 fixture: size, keys, first record."""
     with open("UniProt/gp_information.goa_yeast.28.gpi") as handle:
         recs = list(GOA.gpi_iterator(handle))
     self.assertEqual(len(recs), 300)
     first = recs[0]
     self.assertEqual(sorted(first.keys()), sorted(GOA.GPI11FIELDS))
     # Expected values of the first record, in the order they are checked.
     expected = [
         ("DB_Object_ID", "A2P2R3"),
         ("DB_Object_Symbol", "YMR084W"),
         (
             "DB_Object_Name",
             [
                 "Putative glutamine--fructose"
                 "-6-phosphate aminotransferase"
                 " [isomerizing]"
             ],
         ),
         ("DB_Object_Synonym", ["YM084_YEAST", "YMR084W"]),
         ("DB_Object_Type", "protein"),
         ("Taxon", "taxon:559292"),
         ("Parent_Object_ID", ""),
         ("DB_Xref", [""]),
         ("Gene_Product_Properties", ["db_subset=Swiss-Prot"]),
     ]
     for field, value in expected:
         self.assertEqual(first[field], value)
Ejemplo n.º 14
0
def has_experimental(goa_reclist):
    """Return True when any record's ``Evidence`` matches ``GO_EXP_EC``.

    Matching is delegated to ``upg.record_has``; scanning stops at the first
    match. An empty sequence yields False.
    """
    return any(
        upg.record_has(rec, {'Evidence': GO_EXP_EC}) for rec in goa_reclist
    )
def build_clusters(species):
    """
        Build PMID/GO/protein lookup tables from a species gene association
        file and pickle them.

        Three kinds of table are filled through ``add_to_pmid`` for every
        "PMID:<id>" reference of every record:
          * pmid_go   : PMID  -> GO IDs
          * pmid_prot : PMID  -> DB_Object_IDs
          * go_prot   : GO ID -> DB_Object_IDs
        Each kind exists once per aspect (mf = 'F', bp = 'P', cc = 'C') and
        once for all aspects together. Every table is handed to
        ``pickle_data`` under ``Pickled_Data/<species>/`` in CURR_PATH.

        @param species: suffix of the association file
                        ``GOA_Files/gene_association.goa_<species>``.
    """
    kinds = ('pmid_go', 'pmid_prot', 'go_prot')
    # '' is the table that covers all aspects together.
    suffixes = ('_mf', '_cc', '_bp', '')
    aspect_suffix = {'F': '_mf', 'P': '_bp', 'C': '_cc'}

    tables = {}
    for kind in kinds:
        for suffix in suffixes:
            tables[kind + suffix] = OrderedDict()

    def add_record(suffix, pmid, go_id, prot_id):
        # Register one annotation in the three tables sharing ``suffix``.
        add_to_pmid(tables['pmid_go' + suffix], pmid, go_id)
        add_to_pmid(tables['pmid_prot' + suffix], pmid, prot_id)
        add_to_pmid(tables['go_prot' + suffix], go_id, prot_id)

    gaf_path = os.path.join(CURR_PATH,"GOA_Files/gene_association.goa_"+species)
    # The input used to be left open; the context manager closes it.
    with open(gaf_path) as unigoa_file:
        for inrec in GOA.gafiterator(unigoa_file):
            for dbref in inrec['DB:Reference']:
                if dbref[:4] != 'PMID':
                    continue
                pmid = dbref[5:]
                suffix = aspect_suffix.get(inrec['Aspect'])
                # Records with an unrecognised aspect only reach the
                # all-aspects tables.
                if suffix is not None:
                    add_record(suffix, pmid, inrec['GO_ID'], inrec['DB_Object_ID'])
                add_record('', pmid, inrec['GO_ID'], inrec['DB_Object_ID'])

    for kind in kinds:
        for suffix in suffixes:
            pickle_data(tables[kind + suffix],
                        os.path.join(CURR_PATH, "Pickled_Data/"+species+"/"+kind+suffix+"_"+species))
def pmids_from_gaf(unigoa_file):
    """Return the distinct PMIDs referenced in an open GAF file, as a list.

    Only ``DB:Reference`` entries of the form "PMID:<id>" are considered; the
    returned values are the ``<id>`` parts.
    """
    # A dict is used as the de-duplicating container, as in the original.
    unique_ids = dict.fromkeys(
        dbref[5:]
        for inrec in GOA.gafiterator(unigoa_file)
        for dbref in inrec['DB:Reference']
        if dbref[:4] == 'PMID'
    )
    # An explicit list is returned because a dict keys view is not subscriptable.
    return list(unique_ids.keys())
Ejemplo n.º 17
0
 def get_GO_genes_info(self):
     """Parse ``self.gene_ontology_file_path`` as GAF and return it as a DataFrame.

     Each record yielded by ``GOA.gafiterator`` becomes one row.
     """
     with open(self.gene_ontology_file_path) as gaf_handle:
         records = list(GOA.gafiterator(gaf_handle))
     return pd.DataFrame(records)
Ejemplo n.º 18
0
def became_experimental(handle_iea, handle_exp):
    """Identify electronically annotated proteins that became experimentally
    annotated later.

    For every protein of ``handle_iea`` that also occurs in ``handle_exp``,
    the protein's *experimental* records are written to
    ``<handle_exp.name>.became_exp``.

    Args:
        handle_iea: GAF file with proteins exclusively IEA annotated.
        handle_exp: GAF file with proteins experimentally annotated.

    Note: files should contain only one ontology (either MFO, BPO or
    CCO) to make sense.
    """
    # The context manager closes the output even if parsing or writing fails.
    with open(handle_exp.name + ".became_exp", "w") as outhandle:
        outhandle.write('!gaf-version: 2.0\n')
        # First read the experimental records into memory, keyed by protein.
        expdict = {}
        for exprec in upg.gafbyproteiniterator(handle_exp):
            expdict[exprec[0]['DB_Object_ID']] = exprec
        # Now stream the non-experimental file and emit the matches.
        for iearec in upg.gafbyproteiniterator(handle_iea):
            prot_id = iearec[0]['DB_Object_ID']
            if prot_id in expdict:
                upg.writebyproteinrec(expdict[prot_id], outhandle)
Ejemplo n.º 19
0
def read_gaf_write_tab(gaf_file, include_mfo, outfile):
    """Write ``protein<TAB>GO term`` lines for selected GAF annotations.

    A record is selected when its aspect is 'P' (or 'P'/'F' when
    ``include_mfo`` is true) and its evidence code is one of
    EXP, IDA, IPI, IMP, IGI, IEP, TAS, IC.

    Args:
        gaf_file: Path of the input GAF file.
        include_mfo: When true, aspect 'F' records are included as well.
        outfile: Path of the tab-separated output file.

    Returns:
        Number of lines written.
    """
    if include_mfo:
        Aspect = {'Aspect':set(['P','F'])}
    else:
        Aspect = {'Aspect':set(['P'])}
    # This filter used to be assigned twice with the same value.
    Evidence = {'Evidence': set(['EXP','IDA','IPI','IMP','IGI','IEP','TAS','IC'])}
    counter = 0
    # The input is opened first so that a missing GAF file does not truncate
    # an existing output; both handles are closed even on errors.
    with open(gaf_file,'r') as ingafhandle, open(outfile, 'w') as outhandle:
        for rec in GOA.gafiterator(ingafhandle):
            if GOA.record_has(rec, Aspect) and GOA.record_has(rec, Evidence):
                outhandle.write("%s\t%s\n" % (rec['DB_Object_ID'], rec['GO_ID']))
                counter += 1
    return counter
Ejemplo n.º 20
0
    def test_gaf_iterator(self):
        """Check gafiterator on a GAF 2.0 and a GAF 2.1 fixture."""
        # (fixture path, expected record count, expected leading fields of
        # the first record). GAF 2.1 has the same fields as GAF 2.0, so both
        # fixtures are compared against GAF20FIELDS.
        fixtures = [
            (
                "UniProt/goa_yeast.gaf",
                587,
                [
                    ("DB", "UniProtKB"),
                    ("DB_Object_ID", "A0A023PXA5"),
                    ("DB_Object_Symbol", "YAL019W-A"),
                    ("Qualifier", [""]),
                    ("GO_ID", "GO:0003674"),
                    ("DB:Reference", ["GO_REF:0000015"]),
                    ("Evidence", "ND"),
                    ("With", [""]),
                ],
            ),
            (
                "UniProt/gene_association.goa_yeast.1.gaf",
                300,
                [
                    ("DB", "UniProtKB"),
                    ("DB_Object_ID", "P17536"),
                    ("DB_Object_Symbol", "TPM1"),
                    ("Qualifier", [""]),
                    ("GO_ID", "GO:0000001"),
                    ("DB:Reference", ["PMID:10652251"]),
                    ("Evidence", "TAS"),
                    ("With", [""]),
                ],
            ),
        ]
        for path, count, first_fields in fixtures:
            with open(path) as handle:
                recs = list(GOA.gafiterator(handle))

            # Check number of records
            self.assertEqual(len(recs), count)
            # Check keys are same as predefined fields
            self.assertEqual(sorted(recs[0].keys()), sorted(GOA.GAF20FIELDS))
            # Check values of first record
            for field, value in first_fields:
                self.assertEqual(recs[0][field], value)
Ejemplo n.º 21
0
    def test_gaf_iterator(self):
        """Check gafiterator on a GAF 2.0 and a GAF 2.1 fixture."""

        def check(path, count, first_fields):
            # Parse one fixture, then verify size, key set and first record.
            with open(path, 'r') as handle:
                recs = list(GOA.gafiterator(handle))
            self.assertEqual(len(recs), count)
            self.assertEqual(sorted(recs[0].keys()), sorted(GOA.GAF20FIELDS))
            for field, value in first_fields:
                self.assertEqual(recs[0][field], value)

        # Test GAF 2.0
        check('UniProt/goa_yeast.gaf', 587, (
            ('DB', 'UniProtKB'),
            ('DB_Object_ID', 'A0A023PXA5'),
            ('DB_Object_Symbol', 'YAL019W-A'),
            ('Qualifier', ['']),
            ('GO_ID', 'GO:0003674'),
            ('DB:Reference', ['GO_REF:0000015']),
            ('Evidence', 'ND'),
            ('With', ['']),
        ))

        # Test GAF 2.1, it has the same fields as GAF 2.0
        check('UniProt/gene_association.goa_yeast.1.gaf', 300, (
            ('DB', 'UniProtKB'),
            ('DB_Object_ID', 'P17536'),
            ('DB_Object_Symbol', 'TPM1'),
            ('Qualifier', ['']),
            ('GO_ID', 'GO:0000001'),
            ('DB:Reference', ['PMID:10652251']),
            ('Evidence', 'TAS'),
            ('With', ['']),
        ))
Ejemplo n.º 22
0
def load_gaf(filename,
             start=None):  # load GOA in a flat structure
    """Upsert the annotations of a GAF file into ``collection``, one document each.

    The record's position in the file is used as the document ``_id``.
    Records before ``start`` are skipped, and so are records whose evidence
    code is not in ``exp_codes`` when ``args.exp`` is set.

    Args:
        filename: Path of the GAF file.
        start: Index of the first record to load. Defaults to the number of
            documents currently in ``collection``.
    """
    # The default used to be computed in the signature, i.e. once at
    # definition time: it queried the database on import and went stale
    # afterwards. It is now resolved on every call.
    # NOTE(review): if `collection` is a PyMongo collection, `count` is gone
    # in PyMongo 4 -- `count_documents({})` is the replacement; confirm the
    # driver version before switching.
    if start is None:
        start = collection.count({})

    print("Loading %s" % filename)

    for field in ("DB_Object_ID", "DB", "GO_ID", "Evidence", "Aspect",
                  "Date", "DB_Object_Symbol"):
        collection.create_index(field)

    with open(filename, 'r') as handler:

        for i, data in enumerate(GOA.gafiterator(handler)):

            if i % 100 == 0:
                sys.stdout.write("\rProcessed annotations\t%s" % i)

            if i < start or (args.exp and data['Evidence'] not in exp_codes):
                continue

            # ``doc`` rather than ``json`` so the stdlib module name is not
            # shadowed.
            doc = {
                "DB_Object_ID": data['DB_Object_ID'],
                "DB_Object_Symbol": data['DB_Object_Symbol'],
                "With": data['With'],
                "Assigned_By": data['Assigned_By'],
                "Annotation_Extension": data['Annotation_Extension'],
                "Gene_Product_Form_ID": data['Gene_Product_Form_ID'],
                "DB:Reference": data['DB:Reference'],
                "GO_ID": data['GO_ID'],
                "Qualifier": data['Qualifier'],
                # strptime with a date-only format already yields a midnight
                # datetime, which is what the ordinal round-trip produced.
                "Date": datetime.datetime.strptime(data['Date'], "%Y%m%d"),
                "DB": data['DB'],
                "created_at": datetime.datetime.utcnow(),
                "DB_Object_Name": data['DB_Object_Name'],
                "DB_Object_Type": data['DB_Object_Type'],
                "Evidence": data['Evidence'],
                "Taxon_ID": data['Taxon_ID'],
                "Aspect": data['Aspect']
            }

            collection.update_one({"_id": i}, {'$set': doc}, upsert=True)

    print("\nFinished!")
def get_pmids_from_gaf(gaf_file):
    """
        Get the distinct PMIDs of the papers cited in a GAF file.

        @param gaf_file: path of the association file in GAF format.
        @return: list of the ``<id>`` parts of all "PMID:<id>" references.
    """
    pmids = {}
    # The input used to be left open; the context manager closes it.
    with open(gaf_file) as unigoa_file:
        for inrec in GOA.gafiterator(unigoa_file):
            for dbref in inrec['DB:Reference']:
                if dbref[:4] == 'PMID':
                    pmids[dbref[5:]] = None

    return list(pmids.keys())
Ejemplo n.º 24
0
def load_entire_goa(src_gpa, start=None):    # load GOA in a flat structure
    """Upsert the annotations of a GPA file into ``db.goa_uniprot``.

    The file is read twice: once to count the records (for the progress
    bar) and once to load them. The record's position in the file is used
    as the document ``_id``; records before ``start`` are skipped.

    Args:
        src_gpa: Path of the GPA file.
        start: Index of the first record to load. Defaults to the number of
            documents currently in ``db.goa_uniprot``.
    """
    # The default used to be computed in the signature, i.e. once at
    # definition time: it queried the database on import and went stale
    # afterwards. It is now resolved on every call.
    # NOTE(review): if `db` is a PyMongo database, `count` is gone in
    # PyMongo 4 -- `count_documents({})` is the replacement; confirm the
    # driver version before switching.
    if start is None:
        start = db.goa_uniprot.count({})

    logger.info("Countig GeneOntology annotations ...")

    # NOTE(review): `_gpa11iterator` is an underscore-prefixed helper of GOA.
    with open(src_gpa, 'r') as handler:
        numannots = sum(1 for _ in GOA._gpa11iterator(handler))
    logger.info("\nLoading %s GO annotations to %s ...\n" % (numannots, dbname))
    with open(src_gpa, 'r') as handler:
        goa = GOA._gpa11iterator(handler)
        for i in tqdm(range(numannots), desc="annotations already processed"):
            data = next(goa)
            if i < start:
                continue
            # Parsed only for records that are actually stored.
            date = datetime.datetime.strptime(data['Date'], "%Y%m%d").date()
            # ``doc`` rather than ``json`` so the stdlib module name is not
            # shadowed.
            doc = {
                "DB_Object_ID":  data["DB_Object_ID"],
                "Annotation_Properties": data['Annotation_Properties'],
                "With": data['With'],
                "Interacting_taxon_ID": data['Interacting_taxon_ID'],
                "DB:Reference": data['DB:Reference'],
                "Annotation_Extension": data['Annotation Extension'],
                "Assigned_by": data['Assigned_by'],
                "GO_ID": data['GO_ID'],
                "ECO_Evidence_code": data['ECO_Evidence_code'],
                "Qualifier": data['Qualifier'],
                "Date": datetime.datetime.fromordinal(date.toordinal()),
                "DB": data['DB'],
                "created_at": datetime.datetime.utcnow()
            }
            db.goa_uniprot.update_one({
                "_id": i}, {
                '$set': doc
            }, upsert=True)
Ejemplo n.º 25
0
    def test_selection_writing(self):
        """Test record_has, and writerec.

        Records matching three filters are selected from a GAF fixture,
        written to a temporary file with writerec, read back with
        gafiterator and compared with the selection.

        Adapted from Bio.UniProt.GOA.py by Iddo Friedberg.
        """
        filtered = []

        # Fields to filter
        evidence = {"Evidence": {"ND"}}
        synonym = {"Synonym": {"YA19A_YEAST", "YAL019W-A"}}
        taxon_id = {"Taxon_ID": {"taxon:559292"}}

        # Temporary file to test writerec
        f_number, f_filtered = tempfile.mkstemp()
        os.close(f_number)

        # try/finally so the temporary file is removed even when one of the
        # assertions or the parser fails.
        try:
            # Open a file and select records as per filter
            with open("UniProt/goa_yeast.gaf", "r") as handle:
                for rec in GOA.gafiterator(handle):
                    if (
                        GOA.record_has(rec, taxon_id)
                        and GOA.record_has(rec, evidence)
                        and GOA.record_has(rec, synonym)
                    ):
                        filtered.append(rec)

            # Check number of filtered records
            self.assertEqual(len(filtered), 3)

            # Write the filtered records to a file using writerec
            with open(f_filtered, "w") as handle:
                handle.write("!gaf-version: 2.1 \n")  # Adding file header
                for rec in filtered:
                    GOA.writerec(rec, handle)

            # Open and read the file containing the filtered records
            with open(f_filtered, "r") as handle:
                recs_ff = list(GOA.gafiterator(handle))
        finally:
            # Delete test file
            os.remove(f_filtered)

        # Compare, recs saved by writerec and filtered recs
        self.assertEqual(filtered, recs_ff)
Ejemplo n.º 26
0
def main():
    '''
    Get a GAF, Convert, Calculate, and output.

    Reads the OBO and GAF paths from the configuration, sets up the ontology
    graphs, converts the GAF records with ``GAFtoDICT`` and passes the
    result to ``WyattClarkIC``.
    '''

    # Use the assessment configuration file to grab the OBO file.
    obo_path, gaf_path = read_config()
    # print() with a single argument behaves the same on Python 2 and 3;
    # the former print statements were a syntax error on Python 3.
    print("Read the config")
    ICHelper.setupGraphs(obo_path)
    print("Graphs are done")
    # The GAF file used to be opened inline and never closed. It stays open
    # until all processing is done, because the iterator reads from it.
    with open(gaf_path) as gaf_handle:
        gaf = GOA._gaf20iterator(gaf_handle)
        print("I have made the GAF generator")
        data = GAFtoDICT(gaf)
        print("I have made the dictionary")
        WyattClarkIC(data)
        print("IC values created")
Ejemplo n.º 27
0
    def load(self, filename: str, organism_name: str, annotation_level: str):
        """Import every record of a GAF file into the database, then commit.

        Args:
            filename: Path of the GAF file.
            organism_name: Name resolved through ``self._load_organism``; the
                result is passed to every record import.
            annotation_level: Passed unchanged to ``self._load_gaf_record``.
        """
        organism = self._load_organism(organism_name)
        # One set shared by all record imports of this run.
        # NOTE(review): presumably filled by _load_gaf_record -- confirm.
        seen_products = set()

        with open(filename) as gaf_handle:
            for record in GOA.gafiterator(gaf_handle):
                self._load_gaf_record(record, organism,
                                      annotation_level, seen_products)

        self.session.commit()
Ejemplo n.º 28
0
def GOTermCounter(file,ontology,term):
    """Count GO terms or PMID references among the annotations of one ontology.

    Records whose GO ID is in ``root_terms`` or whose evidence code is not
    in ``EC`` are ignored, as are records of another aspect.

    Args:
        file: Open GAF file object, read through ``goa.gafiterator``.
        ontology: Aspect value a record must have to be counted.
        term: ``'GO term'`` to count per GO ID, ``'PMID'`` to count per
            reference starting with "PMID". Any other value counts nothing.

    Returns:
        dict mapping each GO ID (or full PMID reference string) to its
        number of occurrences.
    """
    D = {}
    for entry in goa.gafiterator(file):
        # Guard clauses replace the former no-op string literal branch.
        if entry['GO_ID'] in root_terms or entry['Evidence'] not in EC:
            continue
        if entry['Aspect'] != ontology:
            continue
        if term == 'GO term':
            D[entry['GO_ID']] = D.get(entry['GO_ID'], 0) + 1
        elif term == 'PMID':
            for refs in entry['DB:Reference']:
                if refs.startswith("PMID"):
                    D[refs] = D.get(refs, 0) + 1
    return D
Ejemplo n.º 29
0
 def test_gpa_iterator(self):
     """Check gpa_iterator on a GPA fixture: size, keys, first record."""
     with open("UniProt/goa_yeast.gpa.59.gpa") as handle:
         recs = list(GOA.gpa_iterator(handle))
     self.assertEqual(len(recs), 300)
     first = recs[0]
     self.assertEqual(sorted(first.keys()), sorted(GOA.GPA11FIELDS))
     # Expected values of the first record, in the order they are checked.
     expected = [
         ("DB", "UniProtKB"),
         ("DB_Object_ID", "A0A023PXA5"),
         ("Qualifier", ["enables"]),
         ("GO_ID", "GO:0003674"),
         ("DB:Reference", ["GO_REF:0000015"]),
         ("ECO_Evidence_code", "ECO:0000307"),
         ("With", [""]),
         ("Interacting_taxon_ID", ""),
         ("Date", "20030730"),
         ("Assigned_by", "SGD"),
         ("Annotation Extension", [""]),
         ("Annotation_Properties", "go_evidence=ND"),
     ]
     for field, value in expected:
         self.assertEqual(first[field], value)
Ejemplo n.º 30
0
 def test_gpi_iterator_one_two(self):
     """Check gpi_iterator on a gpi-version 1.2 fixture: size, keys, first record."""
     with open("UniProt/goa_human_sample.gpi") as handle:
         recs = list(GOA.gpi_iterator(handle))
     self.assertEqual(len(recs), 9)
     first = recs[0]
     self.assertEqual(sorted(first.keys()), sorted(GOA.GPI12FIELDS))
     # Expected values of the first record, in the order they are checked.
     expected = [
         ("DB", "UniProtKB"),
         ("DB_Object_ID", "A0A024R1R8"),
         ("DB_Object_Symbol", "hCG_2014768"),
         ("DB_Object_Name", ["HCG2014768, isoform CRA_a"]),
         ("DB_Object_Synonym", ["hCG_2014768"]),
         ("DB_Object_Type", "protein"),
         ("Taxon", "taxon:9606"),
         ("Parent_Object_ID", ""),
         ("DB_Xref", [""]),
         ("Gene_Product_Properties", ["db_subset=TrEMBL"]),
     ]
     for field, value in expected:
         self.assertEqual(first[field], value)
Ejemplo n.º 31
0
 def test_gpa_iterator(self):
     """Check gpa_iterator on a GPA fixture: size, keys, first record."""
     with open('UniProt/goa_yeast.gpa.59.gpa', 'r') as handle:
         recs = [rec for rec in GOA.gpa_iterator(handle)]
     self.assertEqual(len(recs), 300)
     self.assertEqual(sorted(recs[0].keys()), sorted(GOA.GPA11FIELDS))
     # Field -> expected value of the first record, in checking order.
     first_record = (
         ('DB', 'UniProtKB'),
         ('DB_Object_ID', 'A0A023PXA5'),
         ('Qualifier', ['enables']),
         ('GO_ID', 'GO:0003674'),
         ('DB:Reference', ['GO_REF:0000015']),
         ('ECO_Evidence_code', 'ECO:0000307'),
         ('With', ['']),
         ('Interacting_taxon_ID', ''),
         ('Date', '20030730'),
         ('Assigned_by', 'SGD'),
         ('Annotation Extension', ['']),
         ('Annotation_Properties', 'go_evidence=ND'),
     )
     for key, wanted in first_record:
         self.assertEqual(recs[0][key], wanted)
Ejemplo n.º 32
0
    def load_dataframe(self, file_resources, npartitions=None):
        """Build one annotation table from all GAF entries of ``file_resources``.

        Every entry whose key contains ".gaf" is parsed with
        ``GOA.gafiterator`` and the resulting tables are concatenated. Three
        columns (``go_name``, ``namespace``, ``is_a``) are then added by
        mapping ``GO_ID`` onto the node attributes of ``self.network``.

        Args:
            file_resources: Mapping of file name to the object handed to
                ``GOA.gafiterator``.
            npartitions: Unused in this method.

        Returns:
            The concatenated, annotated DataFrame.
        """
        frames = [
            pd.DataFrame(list(GOA.gafiterator(file_resources[name])))
            for name in file_resources
            if ".gaf" in name
        ]
        annotations = pd.concat(frames)

        terms = pd.DataFrame.from_dict(self.network.nodes,
                                       orient="index",
                                       dtype="object")

        # (new column, node attribute it is looked up from)
        for column, attribute in (("go_name", "name"),
                                  ("namespace", "namespace"),
                                  ("is_a", "is_a")):
            annotations[column] = annotations["GO_ID"].map(terms[attribute])

        return annotations
Ejemplo n.º 33
0
def parse_gpi(infile, taxon=''):
    """Collect the Swiss-Prot object IDs of one taxon from a GPI file.

    The file is read with ``GOAParser.gpi_iterator``. The first record is
    used to verify that the file carries a ``Gene_Product_Properties``
    field; if it does not, a message is printed and the process exits
    with status 1.

    :arg infile: path of the gene product information (GPI) file
    :arg taxon: taxonomy identifier to keep, compared as a string against
        the part of the ``Taxon`` field after the colon
        (e.g. ``'559292'`` for ``'taxon:559292'``)
    :returns: mapping whose keys are the ``DB_Object_ID`` values of the
        records whose first ``Gene_Product_Properties`` value starts with
        ``Swiss-Prot`` and whose taxon matches *taxon*; every value is 1
    """
    sp_id = defaultdict()

    # The context manager closes the file on every exit path, including
    # the sys.exit() below.
    with open(infile, 'r') as infile_handle:
        format_checked = False
        for rec in GOAParser.gpi_iterator(infile_handle):
            if not format_checked:
                # Validate the layout once, on the first record, and then
                # fall through so this record is processed like the rest
                # (previously it was consumed by the check and skipped).
                print(rec.keys())
                if 'Gene_Product_Properties' not in rec:
                    print("This version of the gp information file does not contain all required information")
                    sys.exit(1)
                format_checked = True

            taxid = rec['Taxon'].split(':')[1].strip()
            # First property is expected to look like 'db_subset=Swiss-Prot'.
            db = rec['Gene_Product_Properties'][0].split('=')[1].strip()
            if db.startswith('Swiss-Prot') and taxon == taxid:
                sp_id[rec['DB_Object_ID']] = 1

    return sp_id
Ejemplo n.º 34
0
def split_to_ontologies(handle):
    """Split a GAF file into three per-ontology GAF files.

    Three files named after ``handle.name`` with the suffixes ``.MFO``,
    ``.BPO`` and ``.CCO`` are created. Each starts with a GAF 2.0 header
    line and receives the records whose ``Aspect`` is ``F``, ``P`` or
    ``C`` respectively.

    :arg handle: open GAF file object; its ``name`` attribute is used to
        derive the output file names
    :raises ValueError: if a record has an ``Aspect`` other than
        ``F``, ``P`` or ``C``
    """
    header = "!gaf-version: 2.0\n"
    # The context managers close all three outputs even when a record
    # with an unknown aspect aborts the loop.
    with open("%s.MFO" % handle.name, 'w') as out_mfo, \
            open("%s.BPO" % handle.name, 'w') as out_bpo, \
            open("%s.CCO" % handle.name, 'w') as out_cco:
        outputs = {'F': out_mfo, 'P': out_bpo, 'C': out_cco}
        for out in (out_bpo, out_mfo, out_cco):
            out.write(header)
        for inrec in upg.gafiterator(handle):
            aspect = inrec['Aspect']
            if aspect not in outputs:
                raise ValueError('unknown ontology aspect %s' % aspect)
            upg.writerec(inrec, outputs[aspect])
Ejemplo n.º 35
0
 def test_gpi_iterator(self):
     """Check that GOA.gpi_iterator parses the yeast GPI sample file.

     Reads every record from the fixture, verifies the record count and
     the key set of the first record, then checks each field value of
     that first record.
     """
     with open('UniProt/gp_information.goa_yeast.28.gpi', 'r') as handle:
         parsed = list(GOA.gpi_iterator(handle))
     self.assertEqual(len(parsed), 300)
     first = parsed[0]
     self.assertEqual(sorted(first.keys()), sorted(GOA.GPI11FIELDS))
     # Expected values of the first record, checked in a fixed order.
     expected_fields = (
         ('DB_Object_ID', 'A2P2R3'),
         ('DB_Object_Symbol', 'YMR084W'),
         ('DB_Object_Name', [
             'Putative glutamine--fructose'
             '-6-phosphate aminotransferase'
             ' [isomerizing]'
         ]),
         ('DB_Object_Synonym', ['YM084_YEAST', 'YMR084W']),
         ('DB_Object_Type', 'protein'),
         ('Taxon', 'taxon:559292'),
         ('Parent_Object_ID', ''),
         ('DB_Xref', ['']),
         ('Gene_Product_Properties', ['db_subset=Swiss-Prot']),
     )
     for field, value in expected_fields:
         self.assertEqual(first[field], value)
Ejemplo n.º 36
0
import Bio.UniProt.GOA as goa
import sys
import Bio.Entrez as ez

"""""
Retrieving protein references from the yeast association file in GAF 2.0 format
according to different criteria
"""


""""
Retrieve all references cited to annotate proteins with Experimental Evidence Codes 
"""
# Open the yeast gene association file; the context manager closes it once
# all records have been scanned (the handle was previously left open).
with open("gene_association.goa_yeast") as handle:
    proteins = goa.gafiterator(handle)  # iterator over the GAF records
    # Experimental evidence codes a record must carry to be reported.
    Evidences = {"Evidence": {"EXP", "IDA", "IPI", "IMP", "IGI", "IEP"}}
    print ("GO-annotated proteins supported by Experimental Evidence Code")
    for protein in proteins:
        if goa.record_has(protein, Evidences):
            print(protein['DB:Reference'])

""""
Retrieve all references cited to annotate proteins with Experimental Evidence Codes
in the Molecular Function aspect of GO
"""
handle = open("gene_association.goa_yeast")
proteins = goa.gafiterator(handle) 
Evi_Aspect = {"Evidence":set(["EXP", "IDA", "IPI", "IMP", "IGI", "IEP"]), "Aspect":set(["F"])}
print ("GO-annotated proteins supported by Experimental Evidence Code in the Molecular Function Ontology")
for protein in proteins:
    if goa.record_has(protein, Evi_Aspect):
Ejemplo n.º 37
0
Archivo: goa.py Proyecto: prody/ProDy
def parseGAF(database='PDB', **kwargs):
    """Parse a GO Association File (GAF) corresponding to
    a particular database collection into a dictionary
    for ease of querying.

    If the file is not already present (and non-empty) in *data_folder*,
    it is first downloaded from the EBI FTP server, decompressed and
    stored there.

    See `GAF`_ for more information on the file format

    .. _GAF: http://geneontology.org/docs/go-annotation-file-gaf-format-20/

    :arg database: name of the database of interest
        default is PDB. Others include UNIPROT and
        common names of many organisms.
    :type database: str

    :arg filename: filename for the gaf of interest
        default is ``goa_<database>.gaf`` with the database name in
        lower case, or ``goa_uniprot_all.gaf.gz`` for UNIPROT
    :type filename: str

    :arg data_folder: folder in which the GAF file is looked up and
        stored, default is the current working directory
    :type data_folder: str

    :returns: dictionary mapping each ``DB_Object_ID`` to the list of
        entries (with that key removed) read for it
    :raises TypeError: if *database* is not a string
    :raises ValueError: if the file cannot be retrieved from the server
    """
    # Validate the argument before paying for the Biopython import.
    if not isinstance(database, str):
        raise TypeError('database should be a string')

    import Bio.UniProt.GOA as GOA

    database = database.upper()
    filename = kwargs.get('filename', None)
    if filename is None:
        if database == 'UNIPROT':
            filename = 'goa_' + database.lower() + '_all.gaf.gz'
        else:
            filename = 'goa_' + database.lower() + '.gaf'

    data_folder = kwargs.get('data_folder', os.getcwd())

    # Single location used for the existence check, the download target and
    # the read below, so a non-default data_folder is honoured throughout.
    gaf = os.path.join(data_folder, filename)

    # If the file doesn't already exist, download it
    if not(os.path.exists(gaf) and os.path.getsize(gaf) > 0):
        LOGGER.info('Downloading file {0} to {1}'.format(filename, gaf))
        data_stream = BytesIO()
        ftp_host = 'ftp.ebi.ac.uk'
        ftp = FTP(ftp_host)
        ftp.login()

        try:
            ftp.cwd('pub/databases/GO/goa')
            ftp.cwd(database)
            # NOTE(review): '.gz' is appended even when filename already
            # ends with '.gz' (the UNIPROT default) -- confirm the remote
            # file naming before changing this.
            ftp.retrbinary('RETR {}.gz'.format(filename), data_stream.write)
        except Exception:
            # Drop the connection instead of leaking it on failure.
            ftp.close()
            raise ValueError('Cannot find the requested GO association file')

        # Logout from FTP server
        ftp.quit()

        zip_data = data_stream.getvalue()
        data_stream.close()

        rawdata = gunzip(zip_data)
        if PY3K:
            rawdata = rawdata.decode()

        with open(gaf, 'w') as gaf_fp:
            gaf_fp.write(rawdata)

        LOGGER.info('Download completed for file {0}'.format(filename))

    with open(gaf, 'rt') as gaf_fp:
        funcs = defaultdict(list)  # Initialise the dictionary of functions

        # Iterate on each function using Bio.UniProt.GOA library.
        LOGGER.info('Iterating through entries in {0}'.format(gaf))
        for entry in GOA.gafiterator(gaf_fp):
            object_id = entry.pop('DB_Object_ID')
            funcs[object_id].append(entry)

    return funcs
    """
    Returns the pmids of the papers this paper cites
    """
    cites_list = []
    handle = ez.efetch("pubmed", id=pmid, retmode="xml")
    pubmed_rec = ez.parse(handle).__next__()
    for ref in pubmed_rec['MedlineCitation']['CommentsCorrectionsList']:
        if ref.attributes['RefType'] == 'Cites':
            cites_list.append(str(ref['PMID']))
    return cites_list


# For every annotation supported by IGI evidence, report each PubMed
# reference followed by the PMIDs returned by get_citations() for it.
# Both files are managed by the with-statement so they are closed even if
# a lookup fails part-way through.
with open("papers and citations.txt", "w") as f, \
        open("gene_association.goa_yeast") as handle:
    proteins = goa.gafiterator(handle)
    Evi_Aspect = {"Evidence": {"IGI"}}
    # Collect the output pieces and join once instead of growing a string.
    chunks = ["GO-annotated proteins supported by IGI evidence (Inferred from Genetic Interaction)\n"]
    for protein in proteins:
        if goa.record_has(protein, Evi_Aspect):
            for p in protein['DB:Reference']:
                if p[:4] == "PMID":
                    chunks.append("Main PubMed reference: " + p + "\n")
                    # p looks like 'PMID:<number>'; strip the prefix.
                    citations = get_citations(p[5:])
                    chunks.extend(cit + "  " for cit in citations)
                    chunks.append("\n")
    st = "".join(chunks)
    f.write(st)