예제 #1
0
    def test_selection_writing(self):
        """Test record_has, and writerec.

        Selects the records of ``UniProt/goa_yeast.gaf`` that satisfy all
        three field filters, writes them to a temporary GAF file with
        ``GOA.writerec`` and checks that parsing that file gives back the
        same records.

        Adapted from Bio.UniProt.GOA.py by Iddo Friedberg [email protected].
        """
        # Fields to filter
        evidence = {"Evidence": {"ND"}}
        synonym = {"Synonym": {"YA19A_YEAST", "YAL019W-A"}}
        taxon_id = {"Taxon_ID": {"taxon:559292"}}

        # Open a file and select records as per filter
        with open("UniProt/goa_yeast.gaf", "r") as handle:
            filtered = [
                rec
                for rec in GOA.gafiterator(handle)
                if GOA.record_has(rec, taxon_id)
                and GOA.record_has(rec, evidence)
                and GOA.record_has(rec, synonym)
            ]

        # Check number of filtered records
        self.assertEqual(len(filtered), 3)

        # Temporary file to test writerec.  It is created only once it is
        # needed and removed in ``finally`` so that a failing write or parse
        # does not leave it behind.
        f_number, f_filtered = tempfile.mkstemp()
        os.close(f_number)
        try:
            # Write the filtered records to a file using writerec
            with open(f_filtered, "w") as handle:
                handle.write("!gaf-version: 2.1 \n")  # Adding file header
                for rec in filtered:
                    GOA.writerec(rec, handle)

            # Read back the records from the filtered file
            with open(f_filtered, "r") as handle:
                recs_ff = list(GOA.gafiterator(handle))
        finally:
            # Delete test file
            os.remove(f_filtered)

        # Compare, recs saved by writerec and filtered recs
        self.assertEqual(filtered, recs_ff)
예제 #2
0
def filter_in_experimental(handle):
    """Copy the records accepted by the experimental-evidence filter.

    Reads GAF records from *handle* and writes those for which
    ``upg.record_has(record, {'Evidence': GO_EXP_EC})`` is true to a new
    file named ``<handle.name>.exp_evidence``, preceded by a GAF 2.0
    header line.
    """
    wanted = {'Evidence': GO_EXP_EC}
    # The context manager closes the output even if parsing raises midway.
    with open(handle.name + ".exp_evidence", "w") as outhandle:
        outhandle.write('!gaf-version: 2.0\n')
        for inrec in upg.gafiterator(handle):
            if upg.record_has(inrec, wanted):
                upg.writerec(inrec, outhandle)
예제 #3
0
def filter_in_IEA(handle):
    """Copy the records whose evidence code is exactly ``'IEA'``.

    Reads GAF records from *handle* and writes the matching ones to a new
    file named ``<handle.name>.IEA``, preceded by a GAF 2.0 header line.
    """
    # The context manager closes the output even if parsing raises midway.
    with open(handle.name + ".IEA", "w") as outhandle:
        outhandle.write('!gaf-version: 2.0\n')
        for inrec in upg.gafiterator(handle):
            if inrec['Evidence'] == 'IEA':
                upg.writerec(inrec, outhandle)
예제 #4
0
    def go_enrichment_study(self):
        """Return the GOEnrichmentStudy for this object, building it on first use.

        The study is created once from the human GAF annotations and the
        population returned by ``self.gene_symbols()``, stored in
        ``self._go_enrichment_study`` and returned unchanged on later calls.
        """
        if self._go_enrichment_study is None:

            # Load the human annotations, collecting every GO term annotated
            # to each gene symbol.  (Keeping one record per symbol, as a
            # plain ``dict[symbol] = entry`` does, would silently drop all
            # but the last GO term of each gene.)
            associations = {}
            with gzip.open('../DownloadedResources/goa_human.gaf.gz',
                           'rt') as gaf:
                for entry in GOA.gafiterator(gaf):
                    symbol = entry['DB_Object_Symbol']
                    associations.setdefault(symbol, set()).add(
                        str(entry['GO_ID']))
            # Our population is the set of genes we are analysing
            population = self.gene_symbols()
            print("We have %d genes in our population" % len(population))
            self._go_enrichment_study = \
                GOEnrichmentStudy(population, associations, self._gene_ontology,
                                  propagate_counts=True,
                                  alpha=0.01,
                                  methods=[self.method])
        return self._go_enrichment_study
def pmids_from_gaf(gaf_file):
    """
        Get the papers cited in the Uniprot_GOA file by their PMID, together with
        the GO terms and the proteins each paper is cited for.
        @param gaf_file: path of a Uniprot_GOA association file in gaf format.
        @return: tuple (pmids, pmid_go, pmid_prot): the list of PMIDs, a dict
                 PMID -> list of distinct GO IDs and a dict PMID -> list of
                 distinct DB_Object_IDs.
    """
    pmid_go = {}
    pmid_prot = {}
    # Context manager so the association file is closed when parsing ends
    # or fails.
    with open(gaf_file) as unigoa_file:
        for inrec in GOA.gafiterator(unigoa_file):
            for dbref in inrec['DB:Reference']:
                if dbref[:4] != 'PMID':
                    continue
                pmid = dbref[5:]  # text after the "PMID:" prefix
                go_ids = pmid_go.setdefault(pmid, [])
                if inrec['GO_ID'] not in go_ids:
                    go_ids.append(inrec['GO_ID'])
                proteins = pmid_prot.setdefault(pmid, [])
                if inrec['DB_Object_ID'] not in proteins:
                    proteins.append(inrec['DB_Object_ID'])

    # Every PMID seen is a key of pmid_go, so its keys are the PMID list.
    return list(pmid_go), pmid_go, pmid_prot
def get_ebi(uri):
    '''
    Fetch a gzipped GOA annotation file from the EBI FTP server and parse it.

    The file is stored as ``<cwd>/data/<last component of uri>`` and is only
    downloaded when no file of that name exists yet.

    Returns a dict mapping each ``DB_Object_ID`` to the remaining fields of
    the last record read for that ID.
    '''
    data_folder = os.path.join(os.getcwd(), 'data')
    fn = uri.split('/')[-1]
    # Check if the file exists already
    gaf = os.path.join(data_folder, fn)
    if not os.path.isfile(gaf):
        os.makedirs(data_folder, exist_ok=True)
        # Download under a temporary name so that an interrupted transfer is
        # never mistaken for a complete cached file on the next call.
        partial = gaf + '.part'
        try:
            # The FTP context manager closes the session on exit.
            with FTP('ftp.ebi.ac.uk') as ebi_ftp:
                ebi_ftp.login()  # Logs in anonymously
                with open(partial, 'wb') as fp:
                    ebi_ftp.retrbinary(f'RETR {uri}', fp.write)
        except BaseException:
            # Clean up the incomplete download, then propagate the error.
            if os.path.exists(partial):
                os.remove(partial)
            raise
        os.replace(partial, gaf)

    # File is a gzip file, so we need to open it in this way
    funcs = {}
    with gzip.open(gaf, 'rt') as gaf_fp:
        # Iterate on each annotation using Bio.UniProt.GOA library.
        for entry in GOA.gafiterator(gaf_fp):
            uniprot_id = entry.pop('DB_Object_ID')
            funcs[uniprot_id] = entry
    return funcs
예제 #7
0
    def perform_gene_enrichment_analysis(self, metagene_matrix, method='fdr'):
        """Run a GO enrichment study per component and save the merged results.

        For each of the ``metagene_matrix.shape[1]`` components the ranked
        genes from ``self.ranked_genes_by_component`` are used as the study
        set.  The per-component result frames that are not ``None`` are
        concatenated and written, tab-separated, to
        ``self.cache_dir + '<self.prefix>_gea_all.tsv'``.

        ``method`` is passed to ``GOEnrichmentStudy`` as its only entry in
        ``methods``.
        """
        n_comps = metagene_matrix.shape[1]

        # Load the Gene Ontology
        self.download_and_cache_resources(
        )  # Download ontology and annotations, if necessary
        gene_ontology = obo_parser.GODag('../DownloadedResources/go-basic.obo')

        # Load the human annotations, collecting every GO term annotated to
        # each gene symbol.  (Keeping one record per symbol would silently
        # drop all but the last GO term of each gene.)
        associations = {}
        with gzip.open('../DownloadedResources/goa_human.gaf.gz', 'rt') as gaf:
            for entry in GOA.gafiterator(gaf):
                symbol = entry['DB_Object_Symbol']
                associations.setdefault(symbol, set()).add(str(entry['GO_ID']))

        # Our population is the set of genes we are analysing
        population = self.gene_symbols()
        print("We have %d genes in our population" % len(population))

        gea = GOEnrichmentStudy(population,
                                associations,
                                gene_ontology,
                                propagate_counts=True,
                                alpha=0.05,
                                methods=[method])
        gea_results_by_component = {}
        rankings = self.ranked_genes_by_component(metagene_matrix)
        for ci in range(n_comps):
            study_genes = rankings[ci]
            print('\nComp. %d: %s...' % (ci, str(study_genes[:10])))
            gea_results_by_component[ci] = gea.run_study(study_genes)

        # Get results into a dataframe per component.
        gea_results_df_by_component = []
        for ci in range(n_comps):
            ge_df = self._perform_gene_enrichment_analysis_one_component(
                ci, gea_results_by_component, gea)
            if ge_df is not None:
                gea_results_df_by_component.append(ge_df)

        # Merge the per-component dataframes into a single one.
        # DataFrame.append was removed in pandas 2.0; pd.concat is the
        # replacement but rejects an empty list, hence the guard.
        if gea_results_df_by_component:
            gea_all_sig_results_df = pd.concat(gea_results_df_by_component)
        else:
            gea_all_sig_results_df = pd.DataFrame()

        gea_all_sig_results_df.to_csv(self.cache_dir +
                                      '%s_gea_all.tsv' % self.prefix,
                                      sep='\t')
def build_clusters(species):
    """
        Build GO Clusters from a species gene association file. Each cluster is keyed
        by a GO term and holds the proteins annotated to this term and the papers
        (PMIDs) cited by those annotations.
        Three structures are pickled under Pickled_Data/: the clusters, the GO terms
        per PMID and the proteins per PMID.
        @param species: suffix of the association file GOA_Files/gene_association.goa_<species>;
                        also appended to the names of the pickled files.
    """
    go_clusters = {}
    pmid_go = {}
    pmid_prot = {}
    gaf_path = os.path.join(CURR_PATH,"GOA_Files/gene_association.goa_"+species)
    # Context manager so the association file is closed when parsing ends
    # or fails.
    with open(gaf_path) as unigoa_file:
        for inrec in GOA.gafiterator(unigoa_file):
            go_id = inrec['GO_ID']
            protein = inrec['DB_Object_ID']
            for dbref in inrec['DB:Reference']:
                if dbref[:4] != 'PMID':
                    continue
                pmid = dbref[5:]  # text after the "PMID:" prefix
                if go_id not in go_clusters:
                    go_clusters[go_id] = {'proteins':Set([protein]), 'papers':Set([pmid])}
                else:
                    go_clusters[go_id]['proteins'].add(protein)
                    go_clusters[go_id]['papers'].add(pmid)
                if pmid not in pmid_go:
                    pmid_go[pmid] = Set([go_id])
                else:
                    pmid_go[pmid].add(go_id)
                if pmid not in pmid_prot:
                    pmid_prot[pmid] = Set([protein])
                else:
                    pmid_prot[pmid].add(protein)

    pickle_data(go_clusters, os.path.join(CURR_PATH, "Pickled_Data/go_clusters_"+species))
    pickle_data(pmid_go, os.path.join(CURR_PATH, "Pickled_Data/pmid_go_"+species))
    pickle_data(pmid_prot, os.path.join(CURR_PATH, "Pickled_Data/pmid_prot_"+species))
def build_clusters(species):
    """
        Build PMID/GO/protein lookup tables from a species gene association file.
        For every PMID reference of every annotation, three relations are recorded
        through add_to_pmid: PMID -> GO term, PMID -> protein and GO term -> protein.
        Each relation is kept once over all annotations and once per ontology aspect
        (mf for 'F', bp for 'P', cc for 'C'), and every table is pickled to
        Pickled_Data/<species>/<table>_<species>.
        @param species: suffix of the association file GOA_Files/gene_association.goa_<species>;
                        also used for the output directory and file names.
    """
    aspect_suffix = {'F': '_mf', 'P': '_bp', 'C': '_cc'}

    # One table per (relation, aspect); the empty suffix is the table over
    # all aspects.  The insertion order below is the order in which the
    # tables are pickled.
    tables = OrderedDict()
    for relation in ('pmid_go', 'pmid_prot', 'go_prot'):
        for suffix in ('_mf', '_cc', '_bp', ''):
            tables[relation + suffix] = OrderedDict()

    gaf_path = os.path.join(CURR_PATH,"GOA_Files/gene_association.goa_"+species)
    # Context manager so the association file is closed when parsing ends
    # or fails.
    with open(gaf_path) as unigoa_file:
        for inrec in GOA.gafiterator(unigoa_file):
            go_id = inrec['GO_ID']
            protein = inrec['DB_Object_ID']
            # Aspect-specific tables first (when the aspect is known), then
            # the all-aspect tables.
            suffixes = ['']
            if inrec['Aspect'] in aspect_suffix:
                suffixes.insert(0, aspect_suffix[inrec['Aspect']])
            for dbref in inrec['DB:Reference']:
                if dbref[:4] != 'PMID':
                    continue
                pmid = dbref[5:]  # text after the "PMID:" prefix
                for suffix in suffixes:
                    add_to_pmid(tables['pmid_go' + suffix], pmid, go_id)
                    add_to_pmid(tables['pmid_prot' + suffix], pmid, protein)
                    add_to_pmid(tables['go_prot' + suffix], go_id, protein)

    for name, table in tables.items():
        pickle_data(table, os.path.join(CURR_PATH, "Pickled_Data/"+species+"/"+name+"_"+species))
예제 #10
0
 def get_GO_genes_info(self):
     """Parse the GAF file at ``self.gene_ontology_file_path`` into a DataFrame.

     Every record yielded by ``GOA.gafiterator`` becomes one row.
     """
     with open(self.gene_ontology_file_path) as gaf_handle:
         records = list(GOA.gafiterator(gaf_handle))
     return pd.DataFrame(records)
def pmids_from_gaf(unigoa_file):
    """Return the distinct PMIDs referenced by the records of an open GAF file.

    A reference counts when its first four characters are ``PMID``; the
    identifier is whatever follows the fifth character.
    """
    seen = {}  # used as a set of PMIDs
    for record in GOA.gafiterator(unigoa_file):
        seen.update(
            (ref[5:], None)
            for ref in record['DB:Reference']
            if ref[:4] == 'PMID'
        )
    # Return a real list: a dict keys view is not subscriptable.
    return list(seen)
예제 #12
0
    def test_gaf_iterator(self):
        """Check record count, field names and first record of two GAF files."""
        cases = (
            # (path, number of records, expected fields of the first record)
            # GAF 2.0
            ('UniProt/goa_yeast.gaf', 587, (
                ('DB', 'UniProtKB'),
                ('DB_Object_ID', 'A0A023PXA5'),
                ('DB_Object_Symbol', 'YAL019W-A'),
                ('Qualifier', ['']),
                ('GO_ID', 'GO:0003674'),
                ('DB:Reference', ['GO_REF:0000015']),
                ('Evidence', 'ND'),
                ('With', ['']),
            )),
            # GAF 2.1, it has the same fields as GAF 2.0
            ('UniProt/gene_association.goa_yeast.1.gaf', 300, (
                ('DB', 'UniProtKB'),
                ('DB_Object_ID', 'P17536'),
                ('DB_Object_Symbol', 'TPM1'),
                ('Qualifier', ['']),
                ('GO_ID', 'GO:0000001'),
                ('DB:Reference', ['PMID:10652251']),
                ('Evidence', 'TAS'),
                ('With', ['']),
            )),
        )
        for path, n_records, first_values in cases:
            with open(path, 'r') as handle:
                recs = list(GOA.gafiterator(handle))

            # Number of records
            self.assertEqual(len(recs), n_records)
            # Keys are the same as the predefined fields
            self.assertEqual(sorted(recs[0].keys()), sorted(GOA.GAF20FIELDS))
            # Values of the first record
            first = recs[0]
            for field, value in first_values:
                self.assertEqual(first[field], value)
예제 #13
0
    def test_gaf_iterator(self):
        """Test the GOA GAF file iterator on a GAF 2.0 and a GAF 2.1 file."""

        def check(path, total, first_record):
            """Parse *path*; verify the count, the field names and record 0."""
            with open(path) as handle:
                recs = list(GOA.gafiterator(handle))
            # Check number of records
            self.assertEqual(len(recs), total)
            # Check keys are same as predefined fields
            self.assertEqual(sorted(recs[0].keys()), sorted(GOA.GAF20FIELDS))
            # Check values of first record
            for field, value in first_record:
                self.assertEqual(recs[0][field], value)

        # Test GAF 2.0
        check(
            "UniProt/goa_yeast.gaf",
            587,
            [
                ("DB", "UniProtKB"),
                ("DB_Object_ID", "A0A023PXA5"),
                ("DB_Object_Symbol", "YAL019W-A"),
                ("Qualifier", [""]),
                ("GO_ID", "GO:0003674"),
                ("DB:Reference", ["GO_REF:0000015"]),
                ("Evidence", "ND"),
                ("With", [""]),
            ],
        )

        # Test GAF 2.1, it has the same fields as GAF 2.0
        check(
            "UniProt/gene_association.goa_yeast.1.gaf",
            300,
            [
                ("DB", "UniProtKB"),
                ("DB_Object_ID", "P17536"),
                ("DB_Object_Symbol", "TPM1"),
                ("Qualifier", [""]),
                ("GO_ID", "GO:0000001"),
                ("DB:Reference", ["PMID:10652251"]),
                ("Evidence", "TAS"),
                ("With", [""]),
            ],
        )
예제 #14
0
def load_gaf(filename,
             start=None):  # load GOA in a flat structure
    """Upsert the records of a GAF file into ``collection``, one document each.

    The document ``_id`` is the zero-based position of the record in the
    file.  Records at positions below *start* are skipped, and so are
    records whose evidence code is not in ``exp_codes`` when ``args.exp``
    is set.

    :param filename: path of the GAF file to load.
    :param start: position of the first record to load.  When omitted, the
        current size of ``collection`` is used; it is read when the function
        is called rather than once at import time.
    """
    if start is None:
        # NOTE(review): Collection.count() is gone in PyMongo 4; switch to
        # count_documents({}) if the project targets that version.
        start = collection.count({})

    print("Loading %s" % filename)

    for indexed_field in ("DB_Object_ID", "DB", "GO_ID", "Evidence",
                          "Aspect", "Date", "DB_Object_Symbol"):
        collection.create_index(indexed_field)

    with open(filename, 'r') as handler:

        for i, data in enumerate(GOA.gafiterator(handler)):

            if i % 100 == 0:
                sys.stdout.write("\rProcessed annotations\t%s" % i)

            if i < start or (args.exp and data['Evidence'] not in exp_codes):
                continue

            # Named "document" rather than "json" so the stdlib module name
            # is not shadowed.
            document = {
                "DB_Object_ID": data['DB_Object_ID'],
                "DB_Object_Symbol": data['DB_Object_Symbol'],
                "With": data['With'],
                "Assigned_By": data['Assigned_By'],
                "Annotation_Extension": data['Annotation_Extension'],
                "Gene_Product_Form_ID": data['Gene_Product_Form_ID'],
                "DB:Reference": data['DB:Reference'],
                "GO_ID": data['GO_ID'],
                "Qualifier": data['Qualifier'],
                # "%Y%m%d" carries no time fields, so this is already the
                # midnight datetime of the annotation date.
                "Date": datetime.datetime.strptime(data['Date'], "%Y%m%d"),
                "DB": data['DB'],
                "created_at": datetime.datetime.utcnow(),
                "DB_Object_Name": data['DB_Object_Name'],
                "DB_Object_Type": data['DB_Object_Type'],
                "Evidence": data['Evidence'],
                "Taxon_ID": data['Taxon_ID'],
                "Aspect": data['Aspect']
            }

            collection.update_one({"_id": i}, {'$set': document}, upsert=True)

    print("\nFinished!")
예제 #15
0
def extract_taxon(handle, in_taxid):
    """
    Create a GAF file from a single taxon

    Records read from ``handle`` whose first ``Taxon_ID`` entry has the
    requested id after its ``:`` are written to ``<handle.name>.taxon.<taxid>``
    after a GAF 2.0 header line.

    ``in_taxid`` may be an int or a string; surrounding whitespace is ignored.
    """
    header = "!gaf-version: 2.0\n"
    # Convert first, then strip: calling .strip() on the raw argument fails
    # for an int.
    taxid = str(in_taxid).strip()
    # The context manager closes the output even if parsing raises midway.
    with open("%s.taxon.%s" % (handle.name, taxid), 'w') as outfile:
        outfile.write(header)
        for inrec in upg.gafiterator(handle):
            if inrec['Taxon_ID'][0].split(':')[1] == taxid:
                upg.writerec(inrec, outfile)
def get_pmids_from_gaf(gaf_file):
    """
        Get the papers cited in the Uniprot_GOA file by their PMID.
        @param gaf_file: path of a Uniprot_GOA association file in gaf format.
        @return: list of the distinct PMIDs referenced by the file.
    """
    pmids = {}  # used as a set of PMIDs
    # Context manager so the association file is closed when parsing ends
    # or fails.
    with open(gaf_file) as unigoa_file:
        for inrec in GOA.gafiterator(unigoa_file):
            for dbref in inrec['DB:Reference']:
                if dbref[:4] == 'PMID':
                    pmids[dbref[5:]] = None  # text after the "PMID:" prefix

    return list(pmids)
예제 #17
0
def extract_taxa(handle, taxalist):
    """
    Create a GAF file from multiple taxa
    taxalist is a list of strings of taxid. Don't use list of int

    One file ``<handle.name>.taxon.<taxid>`` is created per distinct taxid,
    starting with a GAF 2.0 header line.  Each record goes to the file of
    the id found after the ``:`` of its first ``Taxon_ID`` entry, when that
    id was requested.
    """
    outfiles = {}
    header = "!gaf-version: 2.0\n"
    try:
        for taxid in taxalist:
            if taxid in outfiles:
                # A repeated taxid must not reopen (and truncate) its file.
                continue
            outfiles[taxid] = open("%s.taxon.%s" % (handle.name, taxid), 'w')
            outfiles[taxid].write(header)
        for inrec in upg.gafiterator(handle):
            cur_taxid = inrec['Taxon_ID'][0].split(':')[1]
            # Dict lookup instead of scanning the list for every record
            if cur_taxid in outfiles:
                upg.writerec(inrec, outfiles[cur_taxid])
    finally:
        # Close whatever was opened, even if opening or parsing failed.
        for outfile in outfiles.values():
            outfile.close()
예제 #18
0
    def load(self, filename: str, organism_name: str, annotation_level: str):
        """Load every record of a GAF file into the Chado database, then commit."""
        organism = self._load_organism(organism_name)
        # One set shared by all records; handed to _load_gaf_record each time
        product_features = set()

        with open(filename) as gaf_handle:
            for record in GOA.gafiterator(gaf_handle):
                self._load_gaf_record(
                    record, organism, annotation_level, product_features)

        # Persist everything that was imported
        self.session.commit()
예제 #19
0
def GOTermCounter(file,ontology,term):
    """Count GO terms or PMID references among the annotations of one ontology.

    Records whose GO_ID is in ``root_terms`` or whose evidence code is not
    in ``EC`` are ignored, as are records of another aspect than *ontology*.
    With ``term == 'GO term'`` the result maps GO IDs to their number of
    records; with ``term == 'PMID'`` it maps each reference that starts with
    "PMID" to its number of occurrences.  Any other *term* gives an empty
    dict.
    """
    counts = {}
    for entry in goa.gafiterator(file):
        if entry['GO_ID'] in root_terms or entry['Evidence'] not in EC:
            continue
        if entry['Aspect'] != ontology:
            continue
        if term == 'GO term':
            go_id = entry['GO_ID']
            counts[go_id] = counts.get(go_id, 0) + 1
        elif term == 'PMID':
            for ref in entry['DB:Reference']:
                if re.match("PMID", ref):
                    counts[ref] = counts.get(ref, 0) + 1
    return counts
예제 #20
0
def read_gaf_write_tab(gaf_file, include_mfo, outfile):
    """Write a two-column protein/GO table for selected GAF records.

    A record is kept when ``GOA.record_has`` accepts it for both the aspect
    filter ('P', plus 'F' when *include_mfo* is true) and the evidence-code
    filter defined below.  For each kept record a line
    ``<DB_Object_ID>\\t<GO_ID>`` is written to *outfile*.

    Returns the number of lines written.
    """
    evidence = {'Evidence': {'EXP', 'IDA', 'IPI', 'IMP', 'IGI', 'IEP', 'TAS', 'IC'}}
    aspect = {'Aspect': {'P', 'F'} if include_mfo else {'P'}}
    counter = 0
    # Input is opened first so a missing GAF file does not truncate an
    # existing output; both handles are closed even if parsing fails.
    with open(gaf_file, 'r') as ingafhandle, open(outfile, 'w') as outhandle:
        for rec in GOA.gafiterator(ingafhandle):
            if GOA.record_has(rec, aspect) and GOA.record_has(rec, evidence):
                outhandle.write("%s\t%s\n" % (rec['DB_Object_ID'], rec['GO_ID']))
                counter += 1
    return counter
예제 #21
0
def split_to_ontologies(handle):
    """Splits a GAF file into three ontology files

    Records with aspect 'F', 'P' and 'C' are written to
    ``<handle.name>.MFO``, ``.BPO`` and ``.CCO`` respectively, each file
    starting with a GAF 2.0 header line.

    Raises ValueError for a record with any other aspect.
    """
    header = "!gaf-version: 2.0\n"
    # All three outputs are closed on exit, including when ValueError is
    # raised for an unknown aspect.
    with open("%s.MFO" % handle.name, 'w') as out_mfo, \
            open("%s.BPO" % handle.name, 'w') as out_bpo, \
            open("%s.CCO" % handle.name, 'w') as out_cco:
        by_aspect = {'F': out_mfo, 'P': out_bpo, 'C': out_cco}
        for out in (out_bpo, out_mfo, out_cco):
            out.write(header)
        for inrec in upg.gafiterator(handle):
            out = by_aspect.get(inrec['Aspect'])
            if out is None:
                # Call syntax is valid on both Python 2 and Python 3, unlike
                # the old "raise ValueError, message" statement.
                raise ValueError('unknown ontology aspect %s' % inrec['Aspect'])
            upg.writerec(inrec, out)
예제 #22
0
    def load_dataframe(self, file_resources, npartitions=None):
        """Build one annotation DataFrame from all ``.gaf`` entries of *file_resources*.

        Each entry whose key contains ".gaf" is parsed with
        ``GOA.gafiterator`` and the resulting frames are concatenated.  The
        columns ``go_name``, ``namespace`` and ``is_a`` are then filled by
        mapping ``GO_ID`` through the node attributes of ``self.network``.
        ``npartitions`` is not used here.
        """
        per_file_frames = [
            pd.DataFrame(list(GOA.gafiterator(file_resources[name])))
            for name in file_resources
            if ".gaf" in name
        ]
        annotations = pd.concat(per_file_frames)

        # Node attributes indexed by GO ID, one column per attribute
        term_table = pd.DataFrame.from_dict(
            self.network.nodes, orient="index", dtype="object")

        for target, attribute in (("go_name", "name"),
                                  ("namespace", "namespace"),
                                  ("is_a", "is_a")):
            annotations[target] = annotations["GO_ID"].map(
                term_table[attribute])

        return annotations
예제 #23
0
def ProteinGafRDF(files, map_ds, output_file):
    """Convert protein GAF association files into one Turtle (RDF) file.

    All records of ``files`` are read, sorted by ``DB_Object_ID`` and
    restricted to the taxa in the module-level ``taxon_ids``.  Each kept
    association is written immediately as a reified ``rdf:Statement``.  In
    addition, one description per protein (type, label, description,
    synonyms, taxon and one line per annotation) is buffered and written
    when the next protein starts, or at the end for the last one.

    :param files: iterable of paths of GAF files.
    :param map_ds: mapping evidence code -> mapping of DB reference (or the
        key ``'Default'``) -> ECO identifier.
    :param output_file: path of the Turtle file to create.
    """
    assoc_line = 0
    rdf_buffer = ''
    previous_obj_id = ''
    list_records = list()
    uniq_obj_id = {}

    # The context manager closes the output even if a step below raises.
    with open(output_file, "w") as outputWriter:
        # Printing prefixes
        outputWriter.write(base + "\t" + "<" + base_uri + "> .\n")
        for ns, uri in ((rdf_ns, rdf),
                        (rdfs_ns, rdfs),
                        (owl_ns, owl),
                        (xsd_ns, xsd),
                        (base_vocab_ns, base_vocab_uri),
                        (obo_ns, obo_uri),
                        (sio_ns, sio_uri),
                        (gr_assoc_ns, gr_assoc),
                        (goa_ns, goa_uri)):
            outputWriter.write(pr + "\t" + ns + "<" + uri + "> .\n")
        outputWriter.write(pr + "\t" + up_ns + "<" + uniprot + "> .\n\n")
        # Add the prefix for the data release
        outputWriter.write(pr + "\t" + res_ns + "<" + resource + "> .\n\n")

        # Slurping all the gaf records into list_records; each input file is
        # closed as soon as it has been read.
        for infile in files:
            print(infile)
            with open(infile, "r") as opener:
                list_records.extend(GOA.gafiterator(opener))
        list_records.sort(key=lambda x: x['DB_Object_ID'])

        # Accessing individual associations
        for inline in list_records:
            taxon = ''.join(inline['Taxon_ID'])
            # NOTE(review): lstrip() removes any leading run of the
            # characters in 'taxon:' rather than the literal prefix; the
            # result is the same when the id starts with a digit.
            tax_id = taxon.lstrip('taxon:')
            if tax_id not in taxon_ids:
                continue
            # YYYYMMDD -> YYYY-MM-DD
            date = inline['Date'][:4] + "-" + inline['Date'][4:6] + "-" + inline[
                'Date'][6:]
            assoc_line += 1
            ont_term = inline['GO_ID'].replace(":", "_")
            current_obj_id = inline['DB_Object_ID']
            aspect = inline['Aspect']
            go_pattern = re.match(r'^GO', ont_term)
            evidence_code = inline['Evidence']

            # Flush the description of the previous protein once a new one
            # starts.  Compared with != because a substring test would skip
            # the flush when the new id is contained in the previous one.
            if previous_obj_id and current_obj_id != previous_obj_id:
                rdf_buffer = re.sub(' ;$', ' .', rdf_buffer)
                outputWriter.write(rdf_buffer)
                rdf_buffer = ''

            # First association of this protein: start its description.
            if current_obj_id not in uniq_obj_id:
                rdf_buffer += up_ns + current_obj_id + "\n"
                rdf_buffer += "\t" + rdf_ns + "type" + "\t" + res_ns + "Protein" + " ;\n"
                rdf_buffer += "\t" + rdfs_ns + "label" + "\t" + '"%s"' % (
                    inline['DB_Object_Symbol']) + " ;\n"
                rdf_buffer += "\t" + base_vocab_ns + "description" + "\t" + '"%s"' % (
                    inline['DB_Object_Name']) + " ;\n"
                for synonym in inline['Synonym']:
                    if synonym:
                        rdf_buffer += "\t" + base_vocab_ns + "has_synonym" + "\t" + '"%s"' % (
                            synonym) + " ;\n"
                rdf_buffer += "\t" + base_vocab_ns + "taxon" + "\t" + obo_ns + "NCBITaxon_" + tax_id + " ;\n"
                uniq_obj_id[current_obj_id] = 1
                previous_obj_id = current_obj_id

            # Reification: the association itself, written immediately
            if go_pattern:
                outputWriter.write(goa_ns + current_obj_id + "\n")
            else:
                outputWriter.write(gr_assoc_ns + current_obj_id + "_" + ont_term +
                                   "\n")
            outputWriter.write("\t" + rdf_ns + "type" + "\t" + rdf_ns +
                               "Statement" + " ;\n")
            outputWriter.write("\t" + rdfs_ns + "subClassOf" + "\t" + sio_ns +
                               sio_term + " ;\n")
            outputWriter.write("\t" + rdf_ns + "subject" + "\t" + up_ns +
                               current_obj_id + " ;\n")
            outputWriter.write("\t" + rdf_ns + "predicate" + "\t" + base_vocab_ns +
                               ont_aspects[aspect] + " ;\n")
            outputWriter.write("\t" + rdf_ns + "object" + "\t" + obo_ns +
                               ont_term + " ;\n")
            if evidence_code in map_ds:
                eco_by_ref = map_ds[evidence_code]
                for db_ref in inline['DB:Reference']:
                    # Reference-specific ECO id when there is one, otherwise
                    # the default id of this evidence code
                    if db_ref in eco_by_ref:
                        eco_id = eco_by_ref[db_ref].replace(":", "_")
                    else:
                        eco_id = eco_by_ref['Default'].replace(":", "_")
                    outputWriter.write("\t" + base_vocab_ns + "evidence" +
                                       "\t" + obo_ns + eco_id + " ;\n")
                    outputWriter.write("\t" + base_vocab_ns + "evidence_code" +
                                       "\t" + '"%s"' % (evidence_code) +
                                       " ;\n")
            else:
                outputWriter.write("\t" + base_vocab_ns + "evidence_code" + "\t" +
                                   '"%s"' % (evidence_code) + " ;\n")

            outputWriter.write("\t" + base_vocab_ns + "assigned_by" + "\t" +
                               '"%s"' % (inline['Assigned_By']) + " ;\n")
            outputWriter.write("\t" + base_vocab_ns + "date" + "\t" + '"%s"' %
                               (date) + "^^" + xsd_ns + "date" + " .\n")

            # Add this annotation to the buffered protein description
            if current_obj_id == previous_obj_id:
                rdf_buffer += "\t" + base_vocab_ns + ont_aspects[
                    aspect] + "\t" + obo_ns + ont_term + " ;\n"
                if go_pattern:
                    rdf_buffer += "\t" + base_vocab_ns + "has_annotation" + "\t" + goa_ns + current_obj_id + " ;\n"
                else:
                    rdf_buffer += "\t" + base_vocab_ns + "has_annotation" + "\t" + gr_assoc_ns + current_obj_id + "_" + ont_term + " ;\n"

            previous_obj_id = current_obj_id

        # Last Flush
        if previous_obj_id:
            rdf_buffer = re.sub(' ;$', ' .', rdf_buffer)
            outputWriter.write(rdf_buffer)

    # Call syntax prints the same text on Python 2 and Python 3.
    print("Total number of associations: %s\n" % (str(assoc_line)))
예제 #24
0
import Bio.UniProt.GOA as goa
import sys
import Bio.Entrez as ez

"""""
Retrieving protein references from the yeast association file in GAF 2.0 format
according to different criteria
"""


""""
Retrieve all references cited to annotate proteins with Experimental Evidence Codes 
"""
# Evidence codes used as the record filter below.
Evidences = {"Evidence": {"EXP", "IDA", "IPI", "IMP", "IGI", "IEP"}}
# Open the yeast gene association file; the context manager closes it as soon
# as the iteration is finished, so the handle is not leaked.
with open("gene_association.goa_yeast") as handle:
    proteins = goa.gafiterator(handle)  # iterator over the records of the file
    print("GO-annotated proteins supported by Experimental Evidence Code")
    for protein in proteins:
        # Print the references of every record accepted by the filter.
        if goa.record_has(protein, Evidences):
            print(protein['DB:Reference'])

""""
Retrieve all references cited to annotate proteins with Experimental Evidence Codes
in the Molecular Function aspect of GO
"""
handle = open("gene_association.goa_yeast")
proteins = goa.gafiterator(handle) 
Evi_Aspect = {"Evidence":set(["EXP", "IDA", "IPI", "IMP", "IGI", "IEP"]), "Aspect":set(["F"])}
print ("GO-annotated proteins supported by Experimental Evidence Code in the Molecular Function Ontology")
for protein in proteins:
    if goa.record_has(protein, Evi_Aspect):
예제 #25
0
#!/usr/bin/env python
import sys
import argparse
import target_prep as tp
from Bio.UniProt import GOA as upg
if __name__ == '__main__':
    # Usage: <script> FIELD VALUE[,VALUE...] INPUT_GAF [OUTPUT_GAF]
    # Copies to the output every record of INPUT_GAF that upg.record_has()
    # accepts for {FIELD: {VALUE, ...}}, preceded by a GAF 2.0 header line.
    if len(sys.argv) < 4:
        # Fail with a readable message instead of an IndexError further down.
        sys.exit('usage: %s FIELD VALUE[,VALUE...] INPUT_GAF [OUTPUT_GAF]'
                 % sys.argv[0])

    goodvals = {sys.argv[1]: set(sys.argv[2].split(','))}

    # An output file is used only when exactly four arguments are given;
    # in every other case the records go to standard output.
    to_file = len(sys.argv) == 5
    outhandle = open(sys.argv[4], "w") if to_file else sys.stdout
    try:
        outhandle.write('!gaf-version: 2.0\n')
        with open(sys.argv[3]) as inhandle:
            for inrec in upg.gafiterator(inhandle):
                if upg.record_has(inrec, goodvals):
                    upg.writerec(inrec, outhandle)
    finally:
        # Close only the file opened here, never sys.stdout.
        if to_file:
            outhandle.close()
예제 #26
0
    return record


if __name__ == '__main__':
    # Command line: <gaf_file> <gpi_file> <taxon>
    gaf_file = sys.argv[1]
    gpi_file = sys.argv[2]
    taxon = sys.argv[3]

    record = []
    # Field list matching the GAF flavour of the input; set from the first
    # record whose number of fields is recognised.
    GAFFIELDS = None

    sp_id = parse_gpi(gpi_file, taxon)

    out_name = "filtered_gaf_file_with_only_sp_ids_for_" + taxon + ".gaf"
    # Both files are closed (and the output flushed) when the block exits.
    with open(gaf_file, 'r') as gaf_handle, open(out_name, 'w') as outfile:
        for rec in GOAParser.gafiterator(gaf_handle):
            if GAFFIELDS is None:
                if len(rec) == 15:
                    GAFFIELDS = GOAParser.GAF10FIELDS
                elif len(rec) == 17:
                    GAFFIELDS = GOAParser.GAF20FIELDS
                else:
                    # Unknown layout: keep looking for a usable record.
                    continue
            # The record used to detect the layout is processed as well;
            # detecting the layout in a separate loop over the same iterator
            # would consume that record and silently drop it.
            record = extract_gaf(rec, outfile, GAFFIELDS, record, sp_id, taxon)

    if GAFFIELDS is None:
        # Nothing to insert: no record with 15 or 17 fields was found.
        sys.exit("No GAF records with a known layout found in " + gaf_file)

    new_record = tuple(record)
    insert_into_db(new_record, taxon, GAFFIELDS)
예제 #27
0
import pandas as pd
import Bio.UniProt.GOA as GOA


def take_my_Y(synonym_list):
    """Return the first entry of *synonym_list* that starts with 'Y'.

    ``None`` is returned when no entry matches.
    """
    y_prefixed = (name for name in synonym_list if name.startswith('Y'))
    return next(y_prefixed, None)


# Input files: the GO ontology (OBO) and the yeast association file (GAF).
GO_file = '/home/sergio/workspace_Eclipse/Lucky_GOGO/Results/Python_Srcipts/go-basic.obo'
sc_GAF_file = '/home/sergio/workspace_Eclipse/Lucky_GOGO/Results/Python_Srcipts/sgd.gaf'
go_dag = obo_parser.GODag(GO_file)

# One DataFrame row per record yielded by the GAF iterator.
with open(sc_GAF_file, 'rt') as fp:
    sc_gaf = pd.DataFrame(list(GOA.gafiterator(fp)))

# Keep only the rows with one of these evidence codes and aspect 'F'.
_evidence_mask = sc_gaf['Evidence'].isin(
    ['EXP', 'IDA', 'IPI', 'IMP', 'IGI', 'IEP'])
_aspect_mask = sc_gaf['Aspect'] == 'F'
sc_gaf = sc_gaf[_evidence_mask & _aspect_mask]

# First synonym starting with 'Y' for every remaining row (None if absent).
sc_gaf['Yeast_ID'] = list(map(take_my_Y, sc_gaf['Synonym']))

# Unique (Yeast_ID, GO_ID) pairs, plus an empty frame with the same columns.
_pair_columns = ['Yeast_ID', 'GO_ID']
sergio_df = sc_gaf[_pair_columns].drop_duplicates()
new_sergio_df = pd.DataFrame(columns=['Yeast_ID', 'GO_ID'])

for _, row in sergio_df.iterrows():
    new_sergio_df = new_sergio_df.append(row)
    parents = pd.DataFrame([
        (row.Yeast_ID, parent_id)
        for parent_id in go_dag[row.GO_ID].get_all_parents()
        if go_dag[parent_id].namespace == 'molecular_function'
    # Check if the file exists already
    if (not os.path.isfile(data_folder + '/go-basic.obo')):
        go_obo = wget.download(go_obo_url, data_folder + '/go-basic.obo')
    else:
        go_obo = data_folder + '/go-basic.obo'

    go = obo_parser.GODag(go_obo)

    methods = ["bonferroni", "fdr"]

    assoc = {}
    with gzip.open(arab_gaf, 'rt') as arab_gaf_fp:
        arab_funcs = {}  # Initialise the dictionary of functions

        # Iterate on each function using Bio.UniProt.GOA library.
        for entry in GOA.gafiterator(arab_gaf_fp):
            uniprot_id = entry.pop('DB_Object_ID')
            arab_funcs[uniprot_id] = entry

    pop = arab_funcs.keys()

    for x in arab_funcs:
        if x not in assoc:
            assoc[x] = set()
        assoc[x].add(str(arab_funcs[x]['GO_ID']))
    target_gene = ["DNAJC19"]
    gene_names = [
        'DLX6', 'MBTD1', 'TRHDE', 'NAALAD2', 'CD82', 'AURKA', 'TEKT2',
        'PYCARD', 'TULP2', 'DLX5', 'QPCT', 'PCDH17', 'DNAJC15', 'CCRL2',
        'CTCFL', 'EML2', 'RIPK3', 'ACY3', 'BTF3L4', 'MSI1', 'LACRT', 'SLC46A3',
        'NOVA1', 'DMRTB1', 'ANKRD31', 'SDK1', 'NAPRT', 'CRB2', 'LRRC4C',
예제 #29
0
파일: goa.py 프로젝트: nffaruk/ProDy
def parseGAF(database='PDB', **kwargs):
    """Parse a GO Association File (GAF) corresponding to
    a particular database collection into a dictionary
    for ease of querying.

    The file is looked up in *data_folder*. When it is missing or empty it
    is downloaded from ``ftp.ebi.ac.uk`` (``pub/databases/GO/goa``),
    decompressed and stored in *data_folder* before being parsed.

    See `GAF`_ for more information on the file format

    .. _GAF: http://geneontology.org/docs/go-annotation-file-gaf-format-20/

    :arg database: name of the database of interest
        default is PDB. Others include UNIPROT and
        common names of many organisms.
    :type database: str

    :arg filename: filename for the gaf of interest
        default is ``goa_`` followed by the database name in lower case
        and ``.gaf`` (``goa_uniprot_all.gaf.gz`` for UNIPROT); ``.gz`` is
        appended to it to build the name requested from the server
    :type filename: str

    :arg data_folder: folder in which the file is looked up and stored
        default is the current working directory
    :type data_folder: str

    :returns: a ``defaultdict(list)`` mapping every ``DB_Object_ID`` to the
        list of its entries (each entry without its ``DB_Object_ID`` key)

    :raises TypeError: if *database* is not a string
    :raises ValueError: if the file cannot be retrieved from the server
    """
    if not isinstance(database, str):
        raise TypeError('database should be a string')

    # Imported only after the argument has been validated, so that invalid
    # input is reported before the parser dependency is loaded.
    import Bio.UniProt.GOA as GOA

    database = database.upper()
    filename = kwargs.get('filename', None)
    if filename is None:
        if database == 'UNIPROT':
            # NOTE(review): this default already ends in ".gz", so the RETR
            # command below requests "...gaf.gz.gz" -- confirm that this
            # matches the layout of the server.
            filename = 'goa_' + database.lower() + '_all.gaf.gz'
        else:
            filename = 'goa_' + database.lower() + '.gaf'

    data_folder = kwargs.get('data_folder', os.getcwd())

    # If the file doesn't already exist, download it
    gaf = os.path.join(data_folder, filename)
    if not (os.path.exists(gaf) and os.path.getsize(gaf) > 0):
        LOGGER.info('Downloading file {0} to {1}'.format(filename, gaf))
        data_stream = BytesIO()
        ftp_host = 'ftp.ebi.ac.uk'
        ftp = FTP(ftp_host)
        ftp.login()

        try:
            ftp.cwd('pub/databases/GO/goa')
            ftp.cwd(database)
            ftp.retrbinary('RETR {}.gz'.format(filename), data_stream.write)
        except Exception:
            # Drop the connection before reporting the failure so that it
            # is not leaked.
            ftp.close()
            raise ValueError('Cannot find the requested GO association file')

        # Logout from FTP server
        ftp.quit()

        zip_data = data_stream.getvalue()
        data_stream.close()

        rawdata = gunzip(zip_data)
        if PY3K:
            rawdata = rawdata.decode()

        # Store the file at the path tested above (inside data_folder), so
        # that later calls find it instead of downloading it again.
        target_folder = os.path.dirname(gaf)
        if target_folder and not os.path.isdir(target_folder):
            os.makedirs(target_folder)
        with open(gaf, 'w') as gaf_fp:
            gaf_fp.write(rawdata)

        LOGGER.info('Download completed for file {0}'.format(filename))

    funcs = defaultdict(list)  # Initialise the dictionary of functions
    with open(gaf, 'rt') as gaf_fp:
        # Iterate on each function using Bio.UniProt.GOA library.
        LOGGER.info('Iterating through entries in {0}'.format(gaf))
        for entry in GOA.gafiterator(gaf_fp):
            # Group the entries by object identifier.
            object_id = entry.pop('DB_Object_ID')
            funcs[object_id].append(entry)

    return funcs
예제 #30
0
파일: goa.py 프로젝트: prody/ProDy
def parseGAF(database='PDB', **kwargs):
    """Parse a GO Association File (GAF) corresponding to
    a particular database collection into a dictionary
    for ease of querying.

    The file is looked up in *data_folder*. When it is missing or empty it
    is downloaded from ``ftp.ebi.ac.uk`` (``pub/databases/GO/goa``),
    decompressed and stored in *data_folder* before being parsed.

    See `GAF`_ for more information on the file format

    .. _GAF: http://geneontology.org/docs/go-annotation-file-gaf-format-20/

    :arg database: name of the database of interest
        default is PDB. Others include UNIPROT and
        common names of many organisms.
    :type database: str

    :arg filename: filename for the gaf of interest
        default is ``goa_`` followed by the database name in lower case
        and ``.gaf`` (``goa_uniprot_all.gaf.gz`` for UNIPROT); ``.gz`` is
        appended to it to build the name requested from the server
    :type filename: str

    :arg data_folder: folder in which the file is looked up and stored
        default is the current working directory
    :type data_folder: str

    :returns: a ``defaultdict(list)`` mapping every ``DB_Object_ID`` to the
        list of its entries (each entry without its ``DB_Object_ID`` key)

    :raises TypeError: if *database* is not a string
    :raises ValueError: if the file cannot be retrieved from the server
    """
    if not isinstance(database, str):
        raise TypeError('database should be a string')

    # Imported only after the argument has been validated, so that invalid
    # input is reported before the parser dependency is loaded.
    import Bio.UniProt.GOA as GOA

    database = database.upper()
    filename = kwargs.get('filename', None)
    if filename is None:
        if database == 'UNIPROT':
            # NOTE(review): this default already ends in ".gz", so the RETR
            # command below requests "...gaf.gz.gz" -- confirm that this
            # matches the layout of the server.
            filename = 'goa_' + database.lower() + '_all.gaf.gz'
        else:
            filename = 'goa_' + database.lower() + '.gaf'

    data_folder = kwargs.get('data_folder', os.getcwd())

    # If the file doesn't already exist, download it
    gaf = os.path.join(data_folder, filename)
    if not (os.path.exists(gaf) and os.path.getsize(gaf) > 0):
        LOGGER.info('Downloading file {0} to {1}'.format(filename, gaf))
        data_stream = BytesIO()
        ftp_host = 'ftp.ebi.ac.uk'
        ftp = FTP(ftp_host)
        ftp.login()

        try:
            ftp.cwd('pub/databases/GO/goa')
            ftp.cwd(database)
            ftp.retrbinary('RETR {}.gz'.format(filename), data_stream.write)
        except Exception:
            # Drop the connection before reporting the failure so that it
            # is not leaked.
            ftp.close()
            raise ValueError('Cannot find the requested GO association file')

        # Logout from FTP server
        ftp.quit()

        zip_data = data_stream.getvalue()
        data_stream.close()

        rawdata = gunzip(zip_data)
        if PY3K:
            rawdata = rawdata.decode()

        # Store the file at the path tested above (inside data_folder), so
        # that later calls find it instead of downloading it again.
        target_folder = os.path.dirname(gaf)
        if target_folder and not os.path.isdir(target_folder):
            os.makedirs(target_folder)
        with open(gaf, 'w') as gaf_fp:
            gaf_fp.write(rawdata)

        LOGGER.info('Download completed for file {0}'.format(filename))

    funcs = defaultdict(list)  # Initialise the dictionary of functions
    with open(gaf, 'rt') as gaf_fp:
        # Iterate on each function using Bio.UniProt.GOA library.
        LOGGER.info('Iterating through entries in {0}'.format(gaf))
        for entry in GOA.gafiterator(gaf_fp):
            # Group the entries by object identifier.
            object_id = entry.pop('DB_Object_ID')
            funcs[object_id].append(entry)

    return funcs
    """
    Returns the pmids of the papers this paper cites
    """
    cites_list = []
    handle = ez.efetch("pubmed", id=pmid, retmode="xml")
    pubmed_rec = ez.parse(handle).__next__()
    for ref in pubmed_rec['MedlineCitation']['CommentsCorrectionsList']:
        if ref.attributes['RefType'] == 'Cites':
            cites_list.append(str(ref['PMID']))
    return cites_list


# Write, for every record accepted by the IGI evidence filter, each of its
# PMID references followed by the identifiers get_citations() returns for it.
Evi_Aspect = {"Evidence": {"IGI"}}
report = ["GO-annotated proteins supported by IGI evidence (Inferred from Genetic Interaction)\n"]
# Both files are closed when the block exits, even if a lookup fails.
with open("papers and citations.txt", "w") as f, \
        open("gene_association.goa_yeast") as handle:
    proteins = goa.gafiterator(handle)
    for protein in proteins:
        if not goa.record_has(protein, Evi_Aspect):
            continue
        for p in protein['DB:Reference']:
            if p[:4] == "PMID":
                report.append("Main PubMed reference: " + p + "\n")
                # p[5:] drops the first five characters (presumably the
                # "PMID:" prefix) before the lookup.
                citations = get_citations(p[5:])
                report.extend(cit + "  " for cit in citations)
                report.append("\n")
    # Join once instead of growing a string inside the loops.
    st = "".join(report)
    f.write(st)
        
        
        
예제 #32
0
DATA_DIR = "/data/dd-analysis"

# Load database annotations (tab-delimited files).
refseq_genes = pd.read_csv("/data/genomes/annotations/refseq_genes_export.csv",
                           delimiter="\t")
entrez = pd.read_csv(
    "/data/genomes/annotations/Homo_sapiens.GRCh38.95.entrez.tsv",
    delimiter="\t")
refseq_xref = pd.read_csv(
    "/data/genomes/annotations/Homo_sapiens.GRCh38.95.refseq.tsv",
    delimiter="\t")

# Load gene ontologies for hg38.
from Bio.UniProt import GOA
# All records are read while the file is open; the context manager then
# closes the handle instead of leaving it open for the rest of the run.
with open("/data/genomes/annotations/goa_human.gaf") as fopen:
    itr = GOA.gafiterator(fopen)
    records = list(itr)
ontologies = pd.DataFrame.from_dict(records)


def init_go_terms(tmpfolder, dataset):
    dsname = dataset["dataset"]

    print(tmpfolder, dataset)

    #READ TRANSCRIPT ALIGNMENTS
    samfile = pysam.AlignmentFile(
        "/data/dd-analysis/datasets/{}/tophat/accepted_hits.bam".format(
            dsname), "rb")
    all_alignments = [a for a in samfile]
    names = [e.reference_name for e in all_alignments]