def makeLinksDic(DIR, dataDir='STITCH Data/', fname='9606.actions.v5.0.tsv', verbose=0, quickMode=False, quickModeLimit=1000):
    '''
    Build look-up dictionaries of interactions ('links') and pickle them.

    The input file is read with loadFile(). Every row must hold at least six
    columns, read here as (A, B, link, action, flag, score).

    INPUT:
        DIR, string. Prefix of the input path and of the three pickle files
            written by this function (plain string concatenation is used, so
            it must already end with a path separator).
        dataDir, string. Appended to DIR to locate the input file.
        fname, string. Name of the tab-delimited input file.
        verbose, integer. Values greater than 0 print progress messages,
            progress bars and timings.
        quickMode, boolean. Forwarded to loadFile().
        quickModeLimit, integer. Forwarded to loadFile().

    OUTPUT:
        The tuple (ctopDic, ptocDic, pairsToLinksDic):
            ctopDic, maps every A for which isCid(A) is true to the list of
                B values found on its rows.
            ptocDic, maps every other A to the list of B values found on its
                rows.
            pairsToLinksDic, maps each (A, B) pair to a list of
                (link, action, flag, score) tuples, one per row.
        The same dictionaries are saved as 'ptocDic.pickle',
        'ctopDic.pickle' and 'pairsToLinksDic.pickle' under DIR.
    '''
    beVerbose = verbose > 0
    TicSum = datetime.timedelta(0, 0, 0)
    timeStamp = ts()
    print('\n[%s] Running prototype for \'makeLinksDic\' function.' % str(timeStamp))

    # Load file
    listOfLinks = loadFile(DIR, dataDir, fname, withHeaders=False, verbose=verbose,
                           quickMode=quickMode, quickModeLimit=quickModeLimit)

    # Create sets of proteins, CIDs, and pairs
    if beVerbose:
        timeStamp = ts()
        text = '\n[%s] Creating sets of protein names, CIDs, CID-protein pairs, and interaction types (links).' % str(timeStamp)
        print(text)
        bar = ChargingBar('', max=len(listOfLinks))
        Tic = tic()
    setProts = set()
    setCids = set()
    setPairs = set()
    for line in listOfLinks:
        A, B = line[0], line[1]
        if isCid(A):
            setCids.add(A)
        else:
            setProts.add(A)
        setPairs.add((A, B))
        if beVerbose:
            bar.next()
    if beVerbose:
        bar.finish()
        Toc = toc(Tic)
        TicSum += Toc

    # Prime protein-to-CID dictionary
    if beVerbose:
        timeStamp = ts()
        print('\n[%s] Priming protein-to-CID dictionary.' % str(timeStamp))
        bar = ChargingBar('', max=len(setProts))
        Tic = tic()
    ptocDic = {}
    for prot in setProts:
        ptocDic[prot] = []
        if beVerbose:
            bar.next()
    if beVerbose:
        print('')
        Toc = toc(Tic)
        TicSum += Toc
        bar.finish()

    # Prime CID-to-proteins dictionary
    if beVerbose:
        timeStamp = ts()
        print('\n[%s] Priming CID-to-proteins dictionary.' % str(timeStamp))
        bar = ChargingBar('', max=len(setCids))
        Tic = tic()
    ctopDic = {}
    for cid in setCids:
        ctopDic[cid] = []
        # The bar only exists in verbose mode; advancing it unconditionally
        # raised NameError for silent runs.
        if beVerbose:
            bar.next()
    if beVerbose:
        print('')
        Toc = toc(Tic)
        TicSum += Toc
        bar.finish()

    # Prime (CID-protein pair)-to-(link type) dictionary
    if beVerbose:
        timeStamp = ts()
        print('\n[%s] Priming (CID-protein pair)-to-(link type) dictionary.' % str(timeStamp))
        bar = ChargingBar('', max=len(setPairs))
        Tic = tic()
    pairsToLinksDic = {}
    for pair in setPairs:
        pairsToLinksDic[pair] = []
        if beVerbose:
            bar.next()
    if beVerbose:
        print('')
        Toc = toc(Tic)
        TicSum += Toc
        bar.finish()

    # Populate dictionaries
    if beVerbose:
        timeStamp = ts()
        print('\n[%s] Populating protein-to-CIDs and CID-to-proteins dictionaries.' % str(timeStamp))
        bar = ChargingBar('', max=len(listOfLinks))
        Tic = tic()
    for line in listOfLinks:
        A, B, link, action, flag, score = line[0], line[1], line[2], line[3], line[4], line[5]
        if isCid(A):
            ctopDic[A].append(B)
        else:
            ptocDic[A].append(B)
        pairsToLinksDic[(A, B)].append((link, action, flag, score))
        if beVerbose:
            bar.next()
    if beVerbose:
        bar.finish()
        Toc = toc(Tic)
        TicSum += Toc

    # Pickle dictionaries
    if beVerbose:
        timeStamp = ts()
        print('\n[%s] Pickling dictionaries.' % str(timeStamp))
        Tic = tic()
    for pickleName, dic in (('ptocDic.pickle', ptocDic),
                            ('ctopDic.pickle', ctopDic),
                            ('pairsToLinksDic.pickle', pairsToLinksDic)):
        with open(DIR + pickleName, 'wb') as handle:
            pickle.dump(dic, handle, protocol=pickle.HIGHEST_PROTOCOL)
    if beVerbose:
        Toc = toc(Tic)
        TicSum += Toc

    # Verbose exit
    if beVerbose:
        timeStamp = ts()
        print('\n[%s] Done running \'makeLinksDic\' function.\nTotal elapsed time was %s (h:mm:ss)' % (str(timeStamp), str(TicSum)))
    if not quickMode and verbose:
        beep()

    # Return statement (order kept for existing callers)
    return ctopDic, ptocDic, pairsToLinksDic
def makeProtSynsDic(DIR, dataDir='STITCH Data', fname='/9606.protein.aliases.v10.5.txt', verbose=0, quickMode=False, quickModeLimit=250000):
    '''
    Build forward and reverse protein-synonym dictionaries and pickle them.

    The input file is read with loadFile(). Every row must hold at least
    three columns, read here as (name, alias, source).

    INPUT:
        DIR, string. Prefix of the input path and of the two pickle files
            written by this function (plain string concatenation is used).
        dataDir, string. Appended to DIR to locate the input file.
        fname, string. Appended to DIR + dataDir to locate the input file.
        verbose, integer. Values greater than 0 print progress messages,
            progress bars and timings.
        quickMode, boolean. Forwarded to loadFile().
        quickModeLimit, integer. Forwarded to loadFile().

    OUTPUT:
        The tuple (protSynsDic, protSynsRDic):
            protSynsDic, maps each name to a list of (alias, source) tuples.
            protSynsRDic, maps each alias to a list of (name, source) tuples.
        The same dictionaries are saved as 'protSynsDic.pickle' and
        'protSynsRDic.pickle' under DIR.
    '''
    beVerbose = verbose > 0

    # Verbose start
    if beVerbose:
        TicSum = datetime.timedelta(0, 0, 0)
        timeStamp = ts()
        print('\n[%s] Running \'makeProtSynsDic\' function.' % str(timeStamp))

    protsyns = loadFile(DIR=DIR, dataDir=dataDir, fname=fname, withHeaders=False, verbose=verbose,
                        quickMode=quickMode, quickModeLimit=quickModeLimit)
    # v10.5.txt has 48,366,210 lines that are read in 1h 15m.

    # Create sets of protein names and aliases
    if beVerbose:
        timeStamp = ts()
        text = '\n[%s] Creating sets of protein names and aliases.' % str(timeStamp)
        print(text)
        bar = ChargingBar('', max=len(protsyns))
        Tic = tic()
    prots = set()
    aliases = set()
    for row in protsyns:
        prots.add(row[0])
        aliases.add(row[1])
        if beVerbose:
            bar.next()
    if beVerbose:
        bar.finish()
        Toc = toc(Tic)
        TicSum += Toc

    # Prime dictionary of protein name aliases
    if beVerbose:
        timeStamp = ts()
        print('\n[%s] Priming protein synonyms dictionary.' % str(timeStamp))
        bar = ChargingBar('', max=len(prots))
        Tic = tic()
    protSynsDic = {}
    for prot in prots:
        protSynsDic[prot] = []
        if beVerbose:
            bar.next()
    if beVerbose:
        print('')
        Toc = toc(Tic)
        TicSum += Toc
        bar.finish()

    # Prime reverse look-up dictionary
    if beVerbose:
        timeStamp = ts()
        print('\n[%s] Priming protein synonym reverse look-up dictionary.' % str(timeStamp))
        bar = ChargingBar('', max=len(aliases))
        Tic = tic()
    protSynsRDic = {}
    for alias in aliases:
        protSynsRDic[alias] = []
        # The bar only exists in verbose mode; advancing it unconditionally
        # raised NameError for silent runs.
        if beVerbose:
            bar.next()
    if beVerbose:
        print('')
        Toc = toc(Tic)
        TicSum += Toc
        bar.finish()

    # Populate dictionaries
    # This takes the most time. For all proteins, it takes 1 hour per percentage point.
    if beVerbose:
        timeStamp = ts()
        print('\n[%s] Populating protein synonym and alias dictionaries.' % str(timeStamp))
        bar = ChargingBar('', max=len(protsyns))
        Tic = tic()
    for line in protsyns:
        # line format is: name alias source
        name, alias, source = line[0], line[1], line[2]
        protSynsDic[name].append((alias, source))
        protSynsRDic[alias].append((name, source))
        if beVerbose:
            bar.next()
    if beVerbose:
        bar.finish()
        Toc = toc(Tic)
        TicSum += Toc

    # Pickle dictionaries
    if beVerbose:
        timeStamp = ts()
        print('\n[%s] Pickling dictionaries.' % str(timeStamp))
        Tic = tic()
    for pickleName, dic in (('protSynsDic.pickle', protSynsDic),
                            ('protSynsRDic.pickle', protSynsRDic)):
        with open(DIR + pickleName, 'wb') as handle:
            pickle.dump(dic, handle, protocol=pickle.HIGHEST_PROTOCOL)
    # TicSum and Tic only exist in verbose mode, so the timing stays guarded.
    if beVerbose:
        Toc = toc(Tic)
        TicSum += Toc

    # Verbose exit
    if beVerbose:
        timeStamp = ts()
        print('\n[%s] Done running \'makeProtSynsDic\' function.\nTotal elapsed time was %s (h:mm:ss)' % (str(timeStamp), str(TicSum)))
    if not quickMode and verbose:
        beep()

    return protSynsDic, protSynsRDic
def downloadCidSyns(cidList, mineType='synonyms', alarm=alarm1, verbose=0):
    '''
    Download synonyms or stereo-count properties for a list of PubChem
    Compound ID (CID) numbers using PubChem's PUG REST utility.

    It takes less than 10 minutes to download all the synonyms for the STITCH
    CPI database on residential broadband.

    INPUT:
        cidList, a list of strings, the CIDs. Only the last eight characters
            of each entry are used, and they must parse as an integer.
        mineType, a string, either 'synonyms' or 'properties' (case is
            ignored). Selects the PUG REST endpoint and which XML elements
            are collected ('information' or 'properties' respectively).
        alarm, a callable (default: alarm1). Called with no arguments when
            the download has finished, but only if verbose > 1.
        verbose, an integer. Values greater than 0 print progress messages
            and a progress bar.

    OUTPUT:
        requestResults, a list with the elements returned by BeautifulSoup's
        find_all() for every request, in request order.

    RAISES:
        ValueError, if mineType is neither 'synonyms' nor 'properties'.
    '''
    beVerbose = verbose > 0

    # verbose start
    if beVerbose:
        timeStamp = ts()
        print('\n[%s] Running \'downloadCidSyns\' function.' % str(timeStamp))
    # The elapsed-time total is updated inside the loop regardless of
    # verbosity, so it has to exist for silent runs as well.
    TicSum = datetime.timedelta(0, 0, 0)
    Tic = tic()

    # Resolve the endpoint once; an unknown mineType used to surface as a
    # NameError on the first request.
    mineType = mineType.lower()
    baseUrl = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/'
    if mineType == 'synonyms':
        urlSuffix = '/synonyms/XML'
        tagName = 'information'
    elif mineType == 'properties':
        urlSuffix = '/property/AtomStereoCount,DefinedAtomStereoCount,UndefinedAtomStereoCount,BondStereoCount,DefinedBondStereoCount,UndefinedBondStereoCount/XML'
        tagName = 'properties'
    else:
        raise ValueError('mineType must be \'synonyms\' or \'properties\', got %r' % mineType)

    UserAgent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1 Safari/605.1.15'
    Headers = {'User-Agent': UserAgent}
    cidList = [int(cid[-8:]) for cid in cidList]

    # chunkSize is the number of length-9 CIDs (plus comma) that can fit into
    # a URL, minus the approximately 100 other characters for the PUG request
    # to PubChem servers.
    chunkSize = 190
    chunkStarts = range(0, len(cidList), chunkSize)
    requestResults = []

    # Loop miscellanea
    if beVerbose:
        timeStamp = ts()
        print('\n[%s] Downloading CID information.' % str(timeStamp))
        bar = ChargingBar('', max=len(chunkStarts))
    i = 0  # number of server requests in the current timing window
    Tic = tic()

    # Loop
    for start in chunkStarts:
        cidChunk = ','.join(str(cid) for cid in cidList[start:start + chunkSize])
        URL = baseUrl + cidChunk + urlSuffix
        # A timeout keeps one stalled connection from hanging the whole run.
        R = requests.get(URL, headers=Headers, timeout=60)
        soup = BeautifulSoup(R.text, 'lxml')
        requestResults.extend(soup.find_all(tagName))  # kept for processing later

        # PubChem requires that no more than 5 requests be made per second
        i += 1
        Toc = toc(Tic, mute=True)
        if Toc.total_seconds() > 1:
            # The window expired on its own: start a new one.
            i = 1
            Tic = tic()
            TicSum += Toc
        elif i >= 5:
            # Five requests inside one second: pause before continuing.
            time.sleep(1)
            i = 0
            Tic = tic()
            TicSum += Toc

        # Progress bar
        if beVerbose:
            bar.next()

    if beVerbose:
        bar.finish()
        Toc = toc(Tic)
        TicSum += Toc
    if verbose > 1:
        alarm()

    # Verbose finish
    if beVerbose:
        timeStamp = ts()
        print('\n[%s] Done running \'downloadCidSyns\' function.\nTotal elapsed time was %s (h:mm:ss)' % (str(timeStamp), str(TicSum)))

    return requestResults
def makeCpiDic(DIR, dataDir, outDir, fname='9606.protein_chemical.links.v5.0.tsv', verbose=0, quickMode=False, quickModeLimit=.1):
    '''
    Read a whitespace-delimited links file and group the second column by
    the first.

    The code is a clone of makeCidList, but the if-block inside the for loop
    is different.

    INPUT:
        DIR, unused. Kept for interface compatibility.
        dataDir, string. Prefix of the input path; the file opened is
            dataDir + fname (DIR is not part of the path).
        outDir, unused. Kept for interface compatibility.
        fname, string. Name of the input file. Its first line is skipped as
            a header.
        verbose, integer. Values greater than 0 print progress messages, a
            progress bar and timings.
        quickMode, boolean. If true, at most quickModeLimit lines (header
            included) are read.
        quickModeLimit, passed together with the file's line count through
            getQuickModeLimit() before use (presumably converting a fraction
            into a number of lines; verify against that helper).

    OUTPUT:
        cpiDic, a dictionary mapping each first-column value to the list of
        second-column values found on its rows.

    RAISES:
        ValueError, if a non-blank row does not have the number of columns
        expected for the file type.
    '''
    from itertools import islice

    beVerbose = verbose > 0

    # verbose start
    if beVerbose:
        timeStamp = ts()
        print('\n[%s] Running prototype for \'makeCpiDic\' function.' % str(timeStamp))
        TicSum = datetime.timedelta(0, 0, 0)
        Tic = tic()
    fpath = dataDir + fname

    # Get progress bar length
    if beVerbose:
        timeStamp = ts()
        print('\n[%s] Getting progress bar length.' % str(timeStamp))
        Tic = tic()
    # The line count feeds both the progress bar and the quick-mode limit, so
    # it is also needed for silent quick-mode runs.
    count = None
    if beVerbose or quickMode:
        count = getNumLines(fpath, verbose=0)
    if beVerbose:
        Toc = toc(Tic)
        TicSum += Toc

    # Validate quickModeLimit
    if quickMode:
        quickModeLimit = getQuickModeLimit(quickModeLimit, count)

    # The loop depends on the file being used.
    # fileOrigin = getFileOrigin(fname) # not implemented
    fileOrigin = 'simpleLinks'
    expectedColumns = {'simpleLinks': 3, 'detailedLinks': 7, 'actions': 6}

    cpiDic = {}
    # NOTE(review): the reverse dictionary is filled but never returned or
    # saved by this function; confirm whether callers are meant to get it.
    cpiRDic = {}

    # Main block
    with open(fpath, 'r') as f:
        # islice stops at end of file instead of yielding empty strings
        lineSource = islice(f, quickModeLimit) if quickMode else f
        next(lineSource, None)  # skip the header line

        # Verbose prelude to if block
        if beVerbose:
            timeStamp = ts()
            print('\n[%s] Splitting lines.' % str(timeStamp))
            Tic = tic()
            bar = ChargingBar('', max=quickModeLimit if quickMode else count)

        numColumns = expectedColumns.get(fileOrigin)
        for line in lineSource if numColumns is not None else ():
            fields = line.split()
            if fields:  # blank lines carry no link
                if len(fields) != numColumns:
                    raise ValueError('Expected %d columns for a \'%s\' file but found %d: %r'
                                     % (numColumns, fileOrigin, len(fields), line))
                first, second = fields[0], fields[1]
                if fileOrigin == 'actions':
                    # Either column may hold the CID in an actions file.
                    if isCid(first):
                        cid, partner = first, second
                    elif isCid(second):
                        cid, partner = second, first
                    else:
                        raise ValueError('Neither column holds a CID: %r' % line)
                else:
                    cid, partner = first, second
                cpiDic.setdefault(cid, []).append(partner)
                cpiRDic.setdefault(partner, []).append(cid)
            if beVerbose:
                bar.next()

    if beVerbose:
        bar.finish()
        Toc = toc(Tic)
        TicSum += Toc

    # Verbose finish
    if beVerbose:
        timeStamp = ts()
        print('\n[%s] Done running protoype for \'makeCpiDic\' function.\nTotal elapsed time was %s (h:mm:ss)' % (str(timeStamp), str(TicSum)))

    # return statement
    return cpiDic
def loadFile(DIR, dataDir, fname, withHeaders=False, verbose=0, quickMode=False, quickModeLimit = 10):
    '''
    Reads tab-delimited file and returns it as a list of lists.
    Scroll to bottom of Docstring for loading times.

    INPUT:
        DIR, string, the directory where the STITCH datasets folder are
            located. The file opened is DIR + dataDir + fname (plain string
            concatenation, so separators must be part of the arguments).
        dataDir, string, appended to DIR.
        fname, string, appended to DIR + dataDir.
        withHeaders, boolean. If true, the first line of the file is
            returned as the first row, as a one-element list holding the
            unsplit line.
        verbose, integer. 0 -> no verbose output, any value greater than 0
            will print progress feedback, and including a progress bar (if
            not in quickMode)
        quickMode, boolean. If true, will run a shorter number of iterations
            as determined by 'quickModeLimit'.
        quickModeLimit, integer. The number of lines from the file to read,
            header line included. Default 10.

    OUTPUT:
        lists, a list with one list of strings per data line (the line split
        on tabs). The first line of the file is treated as a header and is
        only included when withHeaders is true. An empty file gives [].

    Loading times:
    ============================================================================
    9606.actions.v5.0.tsv
    ---------------------
    Takes 16:20 (mm:ss) to load all 22 million human chemical-protein
    interactions.
    -------------------------
    protein.aliases.v10.5.txt
    -------------------------
    Takes 48:21 (mm:ss) to load all 48 million protein aliases.
    ============================================================================

    Note: the whole file is held in memory. A possible guard is to compare
    psutil.virtual_memory().available against a threshold (for example
    100 * 1024 * 1024 bytes) and warn before loading.
    '''
    from itertools import islice

    beVerbose = verbose > 0
    showBar = bool(not quickMode and verbose)

    # verbose start
    if beVerbose:
        timeStamp = ts()
        print('\n[%s] Running \'loadFile\' function.' % str(timeStamp))
        TicSum = datetime.timedelta(0, 0, 0)
        Tic = tic()
    fpath = DIR + dataDir + fname

    # Get progress bar length
    if beVerbose:
        timeStamp = ts()
        print('\n[%s] Getting progres bar length.' % str(timeStamp))
    if showBar:
        with open(fpath) as f:
            count = sum(1 for _ in f)
    if beVerbose:
        Toc = toc(Tic)
        TicSum += Toc
        Tic = tic()  # restart so the counting phase is not timed twice

    # Main block
    with open(fpath) as f:
        # Read file
        if beVerbose:
            timeStamp = ts()
            print('\n[%s] Reading file.' % str(timeStamp))
        # Iterate the file object directly. Calling f.readline() inside
        # "for line in f" consumed two lines per item and silently dropped
        # every other line of the file.
        rawLines = islice(f, max(quickModeLimit, 0)) if quickMode else f
        lines = (raw.rstrip('\r\n') for raw in rawLines)
        headers = next(lines, '')
        if beVerbose:
            Toc = toc(Tic)
            TicSum += Toc

        # Split lines
        if beVerbose:
            timeStamp = ts()
            print('\n[%s] Splitting lines' % str(timeStamp))
            Tic = tic()
        if showBar:
            # the header line is counted but never advances the bar
            bar = ChargingBar('', max=max(count - 1, 0))
        lists = []
        for line in lines:
            lists.append(line.split('\t'))
            if showBar:
                bar.next()
        if showBar:
            bar.finish()

    if beVerbose:
        Toc = toc(Tic)
        TicSum += Toc
        timeStamp = ts()
        print('\n[%s] Done running \'loadFile\' function.\nTotal elapsed time was %s (h:mm:ss)' % (str(timeStamp), str(TicSum)))
    if showBar:
        beep()

    # return statement
    if withHeaders:
        return [[headers]] + lists
    return lists
def _countRoundTripMisses(sourceDic, targetDic, contextTemplate):
    '''
    Count the elements of sourceDic that cannot be mapped back through
    targetDic.

    For every (key, elements) entry of sourceDic, each element is converted
    to a name with getName() and looked up in targetDic. The look-up is a
    miss when none of the names stored under it equals key; a name absent
    from targetDic is a miss as well. On every miss the user is asked
    whether to print the context and whether to stop the test.

    INPUT:
        sourceDic, a dictionary of lists, the direction being tested.
        targetDic, a dictionary of lists, used to map each name back.
        contextTemplate, a string with three %s slots, filled with the
            looked-up name, the expected key and the list of names found.

    OUTPUT:
        (misses, tested), the number of misses and the number of sourceDic
        entries visited before the test finished or was stopped.
    '''
    bar = ChargingBar('', max=len(sourceDic))
    misses = 0
    tested = 0
    stopRequested = False
    for key, elements in sourceDic.items():
        tested += 1
        for element in elements:
            linkedName = getName(element)
            # .get() turns a missing name into a miss instead of a KeyError
            foundNames = [getName(other) for other in targetDic.get(linkedName, [])]
            if key in foundNames:
                continue
            misses += 1
            Input = input('A reverse lookup failed. Show context? y/n')
            if Input.lower() == 'y':
                # Report everything that was found; with an empty list there
                # is no single "result" to show.
                print(contextTemplate % (linkedName, key, foundNames))
            Input = input('Exit test? y/n')
            if Input.lower() == 'y':
                stopRequested = True
                break
        bar.next()
        if stopRequested:
            break
    bar.finish()
    return misses, tested


def testDicCompleteness_full(forwardDic, reverseDic):
    '''
    Interactively check that two look-up dictionaries mirror each other.

    Every element stored in forwardDic is mapped back through reverseDic,
    and then every element stored in reverseDic is mapped back through
    forwardDic. Failed look-ups are counted and the user is prompted on each
    one; a summary is printed for both directions.

    INPUT:
        forwardDic, a dictionary mapping each entity to a list of elements
            from which getName() extracts an alias.
        reverseDic, a dictionary mapping each alias to a list of elements
            from which getName() extracts an entity.

    OUTPUT:
        None. Results are printed.
    '''
    # forwardDic
    numEntities = len(forwardDic)
    timeStamp = ts()
    print('\n[%s] Testing all forward lookup dictionary contents' % str(timeStamp))
    forwardMisses, entityCount = _countRoundTripMisses(
        forwardDic, reverseDic,
        'While mapping alias %s back to entity %s, the result was instead %s')
    print('%d look-ups failed.' % forwardMisses)
    print('The forward look-up dictionary had %d entries. %d were tested.' % (numEntities, entityCount))

    # reverseDic
    numAliases = len(reverseDic)
    timeStamp = ts()
    print('\n[%s] Testing all reverse lookup dictionary contents' % str(timeStamp))
    reverseMisses, aliasCount = _countRoundTripMisses(
        reverseDic, forwardDic,
        'While mapping entity %s back to alias %s, the result was instead %s')
    print('%d reverse look-ups failed.' % reverseMisses)
    print('The reverse look-up dictionary has %d entries. %d were tested.' % (numAliases, aliasCount))

    if (forwardMisses + reverseMisses) == 0:
        print( 'The tested dictionaries are complete in both the forward and backward directions.' )
    else:
        print( 'Warning. The dictionaries failed some forward or reverse lookups.' )