forked from idoerg/Uniprot-Bias
-
Notifications
You must be signed in to change notification settings - Fork 0
/
print_ListPapersThatAnnotateMostProtsLO.py
61 lines (50 loc) · 2.5 KB
/
print_ListPapersThatAnnotateMostProtsLO.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import cPickle
import sp_tools
from time import clock
import datetime
import sys
# Usage:
# python ./print_ListPapersThatAnnotateMostProtsLO.py > ListPapersThatAnnotateMostProtsLO.log 2>&1
d = datetime.date.today()
finalOutputFile = "allExpPaperLeavesOnlyInfoTop50." + str(d) + ".tsv"
#load up the pre-pickled data
print "load up the pre-pickled data"
papers_protsExp_handle = open('Uniprot-Bias/goa_exp_papers_prots.pik', 'rb')
papers_protsExp_dict = cPickle.load(papers_protsExp_handle)
#all_tt_countLO_handle = open("Uniprot-Bias/all_tt_count.pik", 'rb')
#all_tt_countLO_dict = cPickle.load(all_tt_count_handle)
papersExpLO_handle = open('Uniprot-Bias/goa_exp_papers_lo.pik', 'rb') #leaves only
papersExpLO_dict = cPickle.load(papersExpLO_handle)
# going for the list of papers That annotate most proteins. Top designates how far down the list we go
top = 50
print "top_papers_dict: Get all the PMID info for the top papers"
print clock()
sys.stdout.flush()
papers_annots2_dict = sp_tools.top_papers_dict(papersExpLO_dict, papers_protsExp_dict, top=top)
print "term_types_all_papers: Count up all the terms types for each paper"
print clock()
sys.stdout.flush()
all_tt_count = sp_tools.term_types_all_papers(papersExpLO_dict) #takes a really long time
print "cPikleDump: save that all_tt_count for later"
print clock()
sys.stdout.flush()
sp_tools.cPickleDump(all_tt_count, "Uniprot-Bias/all_tt_countLO.pik")
print "go_terms_with_ec_per_paper: Create a dict that counts up how many times a specific (GO ID, GO Term Text, EvCode) tuple occurs for each paper"
print clock()
sys.stdout.flush()
go_ec_count = sp_tools.go_terms_with_ec_per_paper(papersExpLO_dict, top=top) # this takes a bit of time too
print "ev_codes_all_papers: Calculate the number of times a paper gives a certain experimental evidence code."
print clock()
sys.stdout.flush()
allEvCodes_dict = sp_tools.ev_codes_all_papers(papersExpLO_dict)
print "sort_papers_prots: Sort the dictionary papers_prots according to the number of proteins annotated by a particular paper (PMID)."
print clock()
sys.stdout.flush()
sortedProtsPerPaper_tuple = sp_tools.sort_papers_prots(papers_protsExp_dict)
print "print_paper_per_prots: print out the results of the the top papers per proteins. Final Ouptfile: allExpPaperLeavesOnlyInfoTop50.<date>.tsv"
print clock()
sys.stdout.flush()
sp_tools.print_paper_per_prots_go(papers_annots2_dict, all_tt_count, go_ec_count, allEvCodes_dict,
sortedProtsPerPaper_tuple, finalOutputFile, top=top)
print "all done"
print clock()