Ejemplo n.º 1
0
def makeLinksDic(DIR, dataDir='STITCH Data/', fname='9606.actions.v5.0.tsv', verbose=0, quickMode=False, quickModeLimit=1000):
    '''
    Build look-up dictionaries of interactions ('links') and pickle them.

    Each row returned by loadFile is read as six columns:
    (A, B, link, action, flag, score). Rows whose first column satisfies
    isCid() are filed in the CID-to-proteins dictionary; every other row is
    filed in the protein-to-CIDs dictionary.

    Inputs (6):
      1. DIR, string prefix used to locate the data and to write the pickles,
      2. dataDir, string appended to DIR to locate the data file,
      3. fname, name of the file to load,
      4. verbose, values greater than 0 print progress and timing,
      5. quickMode, passed on to loadFile,
      6. quickModeLimit, passed on to loadFile,

    Output (3):
      Saves three dictionary objects as pickles ('ptocDic.pickle',
      'ctopDic.pickle', 'pairsToLinksDic.pickle', each prefixed with DIR)
      and returns them in this order:
      1. ctopDic, maps A -> list of B for rows where isCid(A) is true,
      2. ptocDic, maps A -> list of B for all other rows,
      3. pairsToLinksDic, maps (A, B) -> list of (link, action, flag, score).
    '''
    # Accumulated elapsed time; only incremented when verbose > 0.
    TicSum = datetime.timedelta(0, 0, 0)

    timeStamp = ts()
    print('\n[%s] Running prototype for \'makeLinksDic\' function.' % str(timeStamp))

    # Load file
    listOfLinks = loadFile(DIR, dataDir, fname, withHeaders=False, verbose=verbose, quickMode=quickMode, quickModeLimit=quickModeLimit)

    # Populate all three dictionaries in a single pass. setdefault creates
    # the empty list on first sight of a key, so no separate priming pass
    # over pre-computed key sets is needed.
    if verbose > 0:
        timeStamp = ts()
        print('\n[%s] Populating protein-to-CIDs, CID-to-proteins, and (CID-protein pair)-to-(link type) dictionaries.' % str(timeStamp))
        bar = ChargingBar('', max=len(listOfLinks))
        Tic = tic()
    ptocDic = {}
    ctopDic = {}
    pairsToLinksDic = {}
    for line in listOfLinks:
        A, B, link, action, flag, score = line[0], line[1], line[2], line[3], line[4], line[5]
        target = ctopDic if isCid(A) else ptocDic
        target.setdefault(A, []).append(B)
        pairsToLinksDic.setdefault((A, B), []).append((link, action, flag, score))
        if verbose > 0:
            bar.next()
    if verbose > 0:
        bar.finish()
        TicSum += toc(Tic)

    # Pickle dictionaries
    if verbose > 0:
        timeStamp = ts()
        print('\n[%s] Pickling dictionaries.' % str(timeStamp))
        Tic = tic()
    toPickle = (
        ('ptocDic.pickle', ptocDic),
        ('ctopDic.pickle', ctopDic),
        ('pairsToLinksDic.pickle', pairsToLinksDic),
    )
    for pickleName, dic in toPickle:
        with open(DIR + pickleName, 'wb') as handle:
            pickle.dump(dic, handle, protocol=pickle.HIGHEST_PROTOCOL)
    if verbose > 0:
        # Tic only exists in verbose mode, so the timing must be guarded too.
        TicSum += toc(Tic)

    # Verbose exit
    if verbose > 0:
        timeStamp = ts()
        print('\n[%s] Done running \'makeLinksDic\' function.\nTotal elapsed time was %s (h:mm:ss)' % (str(timeStamp), str(TicSum)))
    if not quickMode and verbose:
        beep()

    # Return statement
    return ctopDic, ptocDic, pairsToLinksDic
Ejemplo n.º 2
0
def makeProtSynsDic(DIR, dataDir='STITCH Data', fname='/9606.protein.aliases.v10.5.txt', verbose=0, quickMode=False, quickModeLimit=250000):
    '''
    Build forward and reverse protein-synonym dictionaries and pickle them.

    Each row returned by loadFile is read as three columns:
    (name, alias, source).

    INPUT:  DIR, string prefix used to locate the data and to write the pickles.
            dataDir, string appended to DIR to locate the data file.
            fname, string appended to DIR + dataDir to name the file.
            verbose, values greater than 0 print progress and timing.
            quickMode, quickModeLimit, passed on to loadFile.
    OUTPUT: (protSynsDic, protSynsRDic)
            protSynsDic maps name -> list of (alias, source).
            protSynsRDic maps alias -> list of (name, source).
            Both are also saved as 'protSynsDic.pickle' and
            'protSynsRDic.pickle', each prefixed with DIR.
    '''
    # Accumulated elapsed time; only incremented when verbose > 0.
    TicSum = datetime.timedelta(0, 0, 0)

    # Verbose start
    if verbose > 0:
        timeStamp = ts()
        print('\n[%s] Running \'makeProtSynsDic\' function.' % str(timeStamp))

    protsyns = loadFile(DIR=DIR, dataDir=dataDir, fname=fname, withHeaders=False, verbose=verbose, quickMode=quickMode, quickModeLimit=quickModeLimit)
    # v10.5.txt has 48,366,210 lines that are read in 1h 15m.

    # Populate both dictionaries in a single pass. setdefault creates the
    # empty list on first sight of a key, so no priming pass is needed.
    if verbose > 0:
        timeStamp = ts()
        print('\n[%s] Populating protein synonym and alias dictionaries.' % str(timeStamp))
        bar = ChargingBar('', max=len(protsyns))
        Tic = tic()
    protSynsDic = {}
    protSynsRDic = {}
    for line in protsyns:
        name, alias, source = line[0], line[1], line[2]  # line format is : name alias source
        protSynsDic.setdefault(name, []).append((alias, source))
        protSynsRDic.setdefault(alias, []).append((name, source))
        if verbose > 0:
            bar.next()
    if verbose > 0:
        bar.finish()
        TicSum += toc(Tic)

    # Pickle dictionaries
    if verbose > 0:
        timeStamp = ts()
        print('\n[%s] Pickling dictionaries.' % str(timeStamp))
        Tic = tic()
    for pickleName, dic in (('protSynsDic.pickle', protSynsDic), ('protSynsRDic.pickle', protSynsRDic)):
        with open(DIR + pickleName, 'wb') as handle:
            pickle.dump(dic, handle, protocol=pickle.HIGHEST_PROTOCOL)
    if verbose > 0:
        # Tic only exists in verbose mode, so the timing must be guarded too.
        TicSum += toc(Tic)

    # Verbose exit
    if verbose > 0:
        timeStamp = ts()
        print('\n[%s] Done running \'makeProtSynsDic\' function.\nTotal elapsed time was %s (h:mm:ss)' % (str(timeStamp), str(TicSum)))
    if not quickMode and verbose:
        beep()

    return protSynsDic, protSynsRDic
Ejemplo n.º 3
0
def downloadCidSyns(cidList, mineType='synonyms', alarm=alarm1, verbose=0):
    '''
    Download synonyms or stereo-count properties for a list of PubChem Compound ID (CID) numbers using PubChem's PUG REST utility. It takes less than 10 minutes to download all the synonyms for the STITCH CPI database on residential broadband.

    INPUT:  cidList, a list of CID strings. The last 8 characters of each entry are converted to an integer CID.
            mineType, a string (case-insensitive), either 'synonyms' or 'properties'.
            alarm, a callable (default: alarm = alarm1). It is called once all requests are finished, but only when verbose > 1.
            verbose, an integer. Values greater than 0 print progress and timing.
    OUTPUT: requestResults, a list of the elements found in each parsed response ('information' tags for synonyms, 'properties' tags for properties).
    RAISES: ValueError if mineType is neither 'synonyms' nor 'properties'.
    '''
    # Resolve the request flavour up front so an unsupported mineType fails
    # with a clear error instead of an unbound URL inside the loop.
    mineType = mineType.lower()
    if mineType == 'synonyms':
        urlSuffix = '/synonyms/XML'
        tagName = 'information'
    elif mineType == 'properties':
        urlSuffix = '/property/AtomStereoCount,DefinedAtomStereoCount,UndefinedAtomStereoCount,BondStereoCount,DefinedBondStereoCount,UndefinedBondStereoCount/XML'
        tagName = 'properties'
    else:
        raise ValueError('mineType must be \'synonyms\' or \'properties\', got %r' % mineType)
    urlPrefix = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/'

    # The rate limiter below adds to TicSum regardless of verbosity, so it
    # must always be initialised.
    TicSum = datetime.timedelta(0, 0, 0)

    # verbose start
    if verbose > 0:
        timeStamp = ts()
        print('\n[%s] Running \'downloadCidSyns\' function.' % str(timeStamp))

    UserAgent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1 Safari/605.1.15'
    Headers = {'User-Agent': UserAgent}

    cidList = [int(cid[-8:]) for cid in cidList]

    chunkSize = 190 # chunkSize is the number of length-9 CIDs (plus comma) that can fit into a URL, minus the approximately 100 other characters for the PUG request to PubChem servers.
    numChunks = -(-len(cidList) // chunkSize) # integer ceiling division

    requestResults = []

    # Loop miscellanea
    if verbose > 0:
        timeStamp = ts()
        print('\n[%s] Downloading CID information.' % str(timeStamp))
        bar = ChargingBar('', max=numChunks)
    i = 0 # number of server requests made in the current timing window
    Tic = tic()

    # Loop
    for start in range(0, len(cidList), chunkSize):
        cidChunk = ','.join(str(cid) for cid in cidList[start:start + chunkSize])
        URL = urlPrefix + cidChunk + urlSuffix
        R = requests.get(URL, headers=Headers)
        soup = BeautifulSoup(R.text, 'lxml')
        requestResults.extend(soup.find_all(tagName)) # collect results for processing later

        # PubChem requires that no more than 5 requests be made per second
        i += 1
        Toc = toc(Tic, mute=True)
        if Toc.total_seconds() > 1:
            # The window has already expired: open a new one and count this
            # request as its first.
            i = 1
            Tic = tic()
            TicSum += Toc
        elif i >= 5:
            # Five requests inside one second: pause, then open a new window.
            time.sleep(1)
            i = 0
            Tic = tic()
            TicSum += Toc

        # Progress bar
        if verbose > 0:
            bar.next()
    if verbose > 0:
        bar.finish()
        Toc = toc(Tic)
        TicSum += Toc
    if verbose > 1:
        alarm()

    # Verbose finish
    if verbose > 0:
        timeStamp = ts()
        print('\n[%s] Done running \'downloadCidSyns\' function.\nTotal elapsed time was %s (h:mm:ss)' % (str(timeStamp), str(TicSum)))

    return requestResults
Ejemplo n.º 4
0
def makeCpiDic(DIR, dataDir, outDir, fname='9606.protein_chemical.links.v5.0.tsv', verbose=0, quickMode=False, quickModeLimit=.1):
    '''
    Read a whitespace-delimited links file and group the second column by
    the first. The code is a clone of makeCidList, but the per-line handling
    is different.

    The first line of the file is treated as a header and skipped. Blank
    lines are ignored.

    INPUT:  DIR, accepted for signature compatibility; not used here.
            dataDir, string prefix; the file is opened at dataDir + fname.
            outDir, accepted for signature compatibility; not used here.
            fname, name of the file to read.
            verbose, values greater than 0 print progress and timing.
            quickMode, boolean. If true, only the first quickModeLimit lines
                (header included) are read.
            quickModeLimit, converted by getQuickModeLimit(quickModeLimit,
                count) before use, where count comes from getNumLines.
    OUTPUT: cpiDic, a dictionary mapping the first column of each row to the
            list of second-column values seen with it, in file order.
    RAISES: ValueError if a non-blank line does not have the number of
            columns expected for the file layout.
    '''
    # Accumulated elapsed time; only incremented when verbose > 0.
    TicSum = datetime.timedelta(0, 0, 0)

    # verbose start
    if verbose > 0:
        timeStamp = ts()
        print('\n[%s] Running prototype for \'makeCpiDic\' function.' % str(timeStamp))
    fpath = dataDir + fname

    # The line count is needed for the progress bar and for resolving the
    # quick-mode limit; a plain non-verbose run iterates the file directly
    # and does not need it.
    count = None
    if verbose > 0 or quickMode:
        if verbose > 0:
            timeStamp = ts()
            print('\n[%s] Getting progress bar length.' % str(timeStamp))
            Tic = tic()
        count = getNumLines(fpath, verbose=0)
        if verbose > 0:
            TicSum += toc(Tic)

    # Validate quickModeLimit
    if quickMode:
        quickModeLimit = getQuickModeLimit(quickModeLimit, count)

    # The per-line handling depends on the file being used.
    # fileOrigin = getFileOrigin(fname) # not implemented
    fileOrigin = 'simpleLinks'

    # Verbose prelude to main loop
    if verbose > 0:
        timeStamp = ts()
        print('\n[%s] Splitting lines.' % str(timeStamp))
        Tic = tic()
        bar = ChargingBar('', max=quickModeLimit if quickMode else count)

    # Main block
    cpiDic = {}
    with open(fpath, 'r') as f:
        for lineNum, line in enumerate(f):
            if quickMode and lineNum >= quickModeLimit:
                break
            if lineNum == 0:
                continue # header line
            fields = line.split()
            if not fields:
                continue # blank line
            # Tuple unpacking enforces the expected column count per layout.
            if fileOrigin == 'simpleLinks':
                cid, partner, _ = fields
            elif fileOrigin == 'detailedLinks':
                cid, partner, _, _, _, _, _ = fields
            elif fileOrigin == 'actions':
                a, b, _, _, _, _ = fields
                if isCid(a):
                    cid, partner = a, b
                elif isCid(b):
                    cid, partner = b, a
                else:
                    raise ValueError('Neither %r nor %r is a CID on line %d of %s' % (a, b, lineNum + 1, fpath))
            cpiDic.setdefault(cid, []).append(partner)
            if verbose > 0:
                bar.next()
    if verbose > 0:
        bar.finish()
        TicSum += toc(Tic)

    # Verbose finish
    if verbose > 0:
        timeStamp = ts()
        print('\n[%s] Done running protoype for \'makeCpiDic\' function.\nTotal elapsed time was %s (h:mm:ss)' % (str(timeStamp), str(TicSum)))

    # return statement
    return cpiDic
Ejemplo n.º 5
0
def loadFile(DIR, dataDir, fname, withHeaders=False, verbose=0, quickMode=False, quickModeLimit = 10):
    '''
    Reads tab-delimited file and returns it as a list of lists. Scroll to bottom of Docstring for loading times.

    The first line of the file is treated as the header line; every
    following line is stripped of its line ending and split on tabs.

    INPUT:  DIR, string, the directory where the STITCH datasets folder are located.
            dataDir, string, appended to DIR.
            fname, string, appended to DIR + dataDir to give the file path.
            withHeaders, boolean. If true, the unsplit header line is prepended to the result as a one-element list.
            verbose, integer. 0 -> no verbose output, any value greater than 0 will print progress feedback, and including a progress bar (if not in quickMode)
            quickMode, boolean. If true, will run a shorter number of iterations as determined by \'quickModeLimit\'.
            quickModeLimit, integer. The number of lines from the file to read, header line included. Default 10.
    OUTPUT: lists, a list of lists of strings (one inner list per data line).

    Loading times:
    ============================================================================
    9606.actions.v5.0.tsv
    ---------------------
        Takes 16:20 (mm:ss) to load all 22 million human chemical-protein interactions.
    -------------------------
    protein.aliases.v10.5.txt
    -------------------------
        Takes 48:21 (mm:ss) to load all 48 million protein aliases.
    ============================================================================

    >>> import psutil # or psutils?
    >>> mem = psutil.virtual_memory()
    >>> THRESHOLD = 100 * 1024 * 1024  # 100MB
    >>> if mem.available <= THRESHOLD:
    ...     print("warning")
    '''
    # Accumulated elapsed time; only incremented when verbose > 0.
    TicSum = datetime.timedelta(0, 0, 0)
    # The progress bar (and the extra counting pass it needs) is only used
    # for full, verbose runs.
    showBar = not quickMode and verbose

    # verbose start
    if verbose > 0:
        timeStamp = ts()
        print('\n[%s] Running \'loadFile\' function.' % str(timeStamp))
        Tic = tic()
    fpath = DIR + dataDir + fname

    # Get progress bar length
    if verbose > 0:
        timeStamp = ts()
        print('\n[%s] Getting progres bar length.' % str(timeStamp))
    if showBar:
        with open(fpath) as f:
            count = sum(1 for _ in f)
    if verbose > 0:
        TicSum += toc(Tic)

    # Main block
    lists = []
    with open(fpath) as f:

        # Read header line
        if verbose > 0:
            timeStamp = ts()
            print('\n[%s] Reading file.' % str(timeStamp))
            Tic = tic()
        # readline() returns '' on an empty file, so headers is then ''.
        headers = f.readline().rstrip('\r\n')
        if verbose > 0:
            TicSum += toc(Tic)

        # Split lines
        if verbose > 0:
            timeStamp = ts()
            print('\n[%s] Splitting lines' % str(timeStamp))
            Tic = tic()
        if showBar:
            # count includes the header line, which is not a data row.
            bar = ChargingBar('', max=max(count - 1, 0))
        # Iterate the file object directly: calling f.readline() inside a
        # `for line in f` loop would consume two lines per iteration.
        # lineNum starts at 1 because the header was line 0.
        for lineNum, line in enumerate(f, start=1):
            if quickMode and lineNum >= quickModeLimit:
                break
            lists.append(line.rstrip('\r\n').split('\t'))
            if showBar:
                bar.next()
        if showBar:
            bar.finish()
        if verbose > 0:
            TicSum += toc(Tic)
            timeStamp = ts()
            print('\n[%s] Done running \'loadFile\' function.\nTotal elapsed time was %s (h:mm:ss)' % (str(timeStamp), str(TicSum)))
        if showBar:
            beep()

    # return statement
    if withHeaders:
        return [[headers]]+lists
    else:
        return lists
Ejemplo n.º 6
0
def _countLookupMisses(sourceDic, targetDic, contextTemplate):
    '''
    Check that every element of sourceDic maps back to its key via targetDic.

    For each (key, elements) pair in sourceDic, the name of each element
    (via getName) is looked up in targetDic, and the names of the entries
    found there are compared with key. A name that is absent from targetDic
    counts as a miss. On each miss the user is asked whether to print
    contextTemplate % (name, key, namesFound) and whether to stop.

    Returns (misses, tested): the number of failed look-ups and the number
    of sourceDic entries visited.
    '''
    misses = 0
    tested = 0
    bar = ChargingBar('', max=len(sourceDic))
    for key, elements in sourceDic.items():
        tested += 1
        stopRequested = False
        for element in elements:
            name = getName(element)
            namesFound = [getName(back) for back in targetDic.get(name, [])]
            if key in namesFound:
                continue
            misses += 1
            Input = input('A reverse lookup failed. Show context? y/n')
            if Input.lower() == 'y':
                print(contextTemplate % (name, key, namesFound))
            Input = input('Exit test? y/n')
            if Input.lower() == 'y':
                stopRequested = True
                break
        bar.next()
        if stopRequested:
            break
    bar.finish()
    return misses, tested


def testDicCompleteness_full(forwardDic, reverseDic):
    '''
    Interactively test that two look-up dictionaries mirror each other.

    Every element of every forwardDic entry is looked up in reverseDic and
    must lead back to the forwardDic key, and vice versa. Element names are
    extracted with getName. Progress, prompts on each failed look-up, and a
    final summary are written to the console.

    INPUT:  forwardDic, a dictionary mapping an entity to a list of elements.
            reverseDic, a dictionary mapping an alias to a list of elements.
    OUTPUT: None. Results are printed.
    '''
    # forwardDic
    numEntities = len(forwardDic)
    timeStamp = ts()
    print('\n[%s] Testing all forward lookup dictionary contents' %
          str(timeStamp))
    forwardMisses, entityCount = _countLookupMisses(
        forwardDic, reverseDic,
        'While mapping alias %s back to entity %s, the result was instead %s')
    print('%d look-ups failed.' % forwardMisses)
    print('The forward look-up dictionary had %d entries. %d were tested.' %
          (numEntities, entityCount))

    # reverseDic
    numAliases = len(reverseDic)
    timeStamp = ts()
    print('\n[%s] Testing all reverse lookup dictionary contents' %
          str(timeStamp))
    reverseMisses, aliasCount = _countLookupMisses(
        reverseDic, forwardDic,
        'While mapping entity %s back to alias %s, the result was instead %s')
    print('%d reverse look-ups failed.' % reverseMisses)
    print('The reverse look-up dictionary has %d entries. %d were tested.' %
          (numAliases, aliasCount))

    if (forwardMisses + reverseMisses) == 0:
        print(
            'The tested dictionaries are complete in both the forward and backward directions.'
        )
    else:
        print(
            'Warning. The dictionaries failed some forward or reverse lookups.'
        )