Beispiel #1
0
def copyStandardTrack(genome, oldTn, newTn):
    """genome oldTn newTn"""
    # NOTE(review): the docstring reads like a command-line argument list and
    # may be consumed at runtime (e.g. as usage text), so it is kept verbatim.
    #
    # Copies the directory that gcf.createOrigPath() resolves for oldTn to the
    # directory it resolves for newTn. Both track names are strings whose
    # parts are separated by '/' or ':'.
    #
    # Raises AssertionError if the source directory does not exist or if the
    # target path already exists.
    sourceParts = re.split('/|:', oldTn)
    targetParts = re.split('/|:', newTn)

    sourceDir = gcf.createOrigPath(genome, sourceParts)
    assert os.path.exists(sourceDir), \
        'ERROR: TN did not exist in stdTracks: ' + sourceDir

    print('(copying track in stdTracks..)')
    targetDir = gcf.createOrigPath(genome, targetParts)
    assert not os.path.exists(targetDir), \
        'ERROR: Target path already exists: ' + targetDir

    # qcf.ensurePathExists is defined elsewhere; presumably it prepares the
    # parent directories of the target -- TODO confirm.
    qcf.ensurePathExists(targetDir)
    shutil.copytree(sourceDir, targetDir)
Beispiel #2
0
def createAssemblyGapsFile(genome, assemblyChars='ACGTacgt'):
    """genome assemblyChars='ACGTacgt'"""
    # NOTE(review): the docstring reads like a command-line argument list and
    # may be consumed at runtime (e.g. as usage text), so it is kept verbatim.
    #
    # Writes 'assemblyGaps.bed' under the path returned by gcf.createOrigPath()
    # for the genome's 'gaps' property track. For every chromosome in
    # GenomeInfo.getExtendedChrList(genome), each maximal run of consecutive
    # positions whose sequence value is not one of assemblyChars becomes one
    # tab-separated line: chromosome, index of the first position in the run,
    # index of the last position in the run plus one.
    #
    # assemblyChars: iterable of the sequence characters that count as
    # assembled (i.e. not a gap).
    basePath = gcf.createOrigPath(genome, GenomeInfo.getPropertyTrackName(genome, 'gaps'), '')
    outFn = basePath + 'assemblyGaps.bed'
    qcf.ensurePathExists(outFn)

    anyGaps = False
    # The with-block guarantees the output file is closed even if reading one
    # of the chromosomes raises.
    with open(outFn, 'w') as outFile:
        seqTrack = PlainTrack(GenomeInfo.getSequenceTrackName(genome))

        for chrom in GenomeInfo.getExtendedChrList(genome):
            chrRegion = GenomeRegion(genome, chrom, 0, GenomeInfo.getChrLen(genome, chrom))
            seq = seqTrack.getTrackView(chrRegion).valsAsNumpyArray()

            # Positions that match none of the assembly characters.
            isAssembled = numpy.logical_or.reduce([seq == c for c in assemblyChars])
            gapIndexes = numpy.arange(len(seq))[numpy.logical_not(isAssembled)]

            # A difference of 1 between neighbouring gap positions means the
            # second position continues the run started earlier.
            gapIndexDiff = gapIndexes[1:] - gapIndexes[:-1]
            continuesRun = gapIndexDiff == 1
            diffPositions = numpy.arange(len(gapIndexDiff))

            # Run starts: drop every position that continues a run.
            gapBeginIndexes = numpy.delete(gapIndexes, (diffPositions + 1)[continuesRun])
            # Run ends (exclusive): drop every position that is followed by a
            # continuation.
            gapEndIndexes = numpy.delete(gapIndexes + 1, diffPositions[continuesRun])

            assert len(gapBeginIndexes) == len(gapEndIndexes)

            for gapBegin, gapEnd in zip(gapBeginIndexes, gapEndIndexes):
                anyGaps = True
                # '\n' rather than os.linesep: the file is opened in text mode,
                # which already translates the line terminator per platform.
                outFile.write('\t'.join([chrom, str(gapBegin), str(gapEnd)]) + '\n')

        if not anyGaps:
            # No gaps anywhere: write a single placeholder line on the first
            # chromosome (without a trailing line terminator). Presumably this
            # keeps the file from being empty -- TODO confirm.
            outFile.write('\t'.join([GenomeInfo.getExtendedChrList(genome)[0], '1', '1']))
def getCategorySetForSubTracks(genome, baseTrackName, shelveFn):
    """genome baseTrackName shelveFn"""
    # NOTE(review): the docstring reads like a command-line argument list and
    # may be consumed at runtime (e.g. as usage text), so it is kept verbatim.
    #
    # For every track name yielded by OrigTrackNameSource(genome,
    # baseTrackName) other than the base track itself, collects the distinct
    # values of the fourth whitespace-separated column of the track's single
    # file, and stores them in the shelf at shelveFn. The shelf key is the
    # track name relative to the base track, joined by ':'; the value is a
    # list of the distinct categories (in no particular order).
    #
    # Raises AssertionError if a track directory does not contain exactly one
    # (non-ignored) file. Errors while reading a file are reported on standard
    # out and then re-raised.
    baseTrackName = re.split('/|:', baseTrackName)
    mapping = {}
    for trackName in OrigTrackNameSource(genome, baseTrackName):
        if trackName == baseTrackName:
            continue

        subTrackName = trackName[len(baseTrackName):]

        basePath = gcf.createOrigPath(genome, trackName)
        # Entries whose name starts with ',' are ignored.
        # NOTE(review): ',' may have been meant to be '.', i.e. hidden files
        # -- confirm before changing.
        relFns = [x for x in os.listdir(basePath) if x[0] not in [',']]
        assert len(relFns) == 1, \
            'only tracks with single file is supported, thus not: ' + str(relFns)
        fn = basePath + os.sep + relFns[0]

        # 'line' is initialised up front so that the error report below never
        # fails with a NameError (e.g. when the file cannot be opened) and
        # never shows a stale line from a previously processed track.
        line = None
        categories = set()
        try:
            with open(fn) as trackFile:
                for line in trackFile:
                    if line.strip() != '':
                        categories.add(line.split()[3])
        except Exception:
            print('Error, at filename %s and current line: %s' % (fn, line))
            raise

        mapping[':'.join(subTrackName)] = list(categories)

    shelf = safeshelve.open(shelveFn)
    try:
        shelf.update(mapping)
    finally:
        # Close the shelf even if the update fails.
        shelf.close()
def mergeSubtypeTracksToNewCategoryTrack(genome, trackName, newTrackName=None):
    """genome trackName newTrackName"""
    # NOTE(review): the docstring reads like a command-line argument list and
    # may be consumed at runtime (e.g. as usage text), so it is kept verbatim.
    #
    # Used to create a union of the subtype tracks, with the category column
    # (4) used as key. If the same key has different locations, the first
    # location is used, and the others are printed to standard out. Has been
    # used to create a merged gene track with all genes from a set of subtypes.
    # To create a standard categorical track based on subtypes, see instead
    # SubtypesAsCategories in StandardizeTrackFiles.
    #
    # Reads every '*.bed' file found directly inside each sub-directory of the
    # track's path and writes 'merged.category.bed' under the path of
    # newTrackName. Each output line holds input columns 1-4, a literal '0',
    # and input column 6, separated by tabs.
    #
    # Raises TypeError if newTrackName is not given, and AssertionError if a
    # non-blank input line has fewer than six columns.
    if newTrackName is None:
        # The default of None was never usable (re.split() fails on it with an
        # unhelpful TypeError); fail with the same exception type but a clear
        # message.
        raise TypeError('newTrackName must be specified')

    trackName = re.split('/|:', trackName)
    newTrackName = re.split('/|:', newTrackName)

    basePath = gcf.createOrigPath(genome, trackName)
    newFn = gcf.createOrigPath(genome, newTrackName) + os.sep + 'merged.category.bed'

    # category -> [columns 0, 1, 2, 5] of the first line seen for that category
    categoryDict = {}
    qcf.ensurePathExists(newFn)
    with open(newFn, 'w') as outF:
        for subType in os.listdir(basePath):
            subPath = basePath + os.sep + subType
            if not os.path.isdir(subPath):
                continue

            fnPaths = [subPath + os.sep + x for x in os.listdir(subPath)]
            onlyFiles = [x for x in fnPaths if os.path.isfile(x) and x.endswith('.bed')]
            for fn in onlyFiles:
                with open(fn) as inF:
                    for line in inF:
                        cols = line.strip().split()
                        if not cols:
                            # Skip blank lines instead of failing on them.
                            continue
                        # Column index 5 is read below, so six columns are
                        # required (the previous check only guaranteed four).
                        assert len(cols) > 5, \
                            'Expected at least 6 columns in %s, got: %s' % (fn, line)

                        cat = cols[3]
                        location = [cols[x] for x in [0, 1, 2, 5]]
                        if cat in categoryDict:
                            if categoryDict[cat] != location:
                                print(cat + ': ' + str(categoryDict[cat]) +
                                      ' != ' + str(location))
                        else:
                            categoryDict[cat] = location
                            # '\n' rather than os.linesep: the file is opened
                            # in text mode, which already translates the line
                            # terminator per platform.
                            outF.write('\t'.join(cols[0:4] + ['0', cols[5]]) + '\n')
Beispiel #5
0
def copyAndFilterSubtypes(genome, trackName, newTrackName, filterFn):
    """genome trackName newTrackName filterFn"""
    # NOTE(review): the docstring reads like a command-line argument list and
    # may be consumed at runtime (e.g. as usage text), so it is kept verbatim.
    #
    # Copies those entries of the track's directory whose name is listed in
    # the file filterFn (one name per line) to the directory of newTrackName.
    # Names are compared with underscores replaced by spaces on both sides.
    # Symbolic links are copied as links.
    trackName = re.split('/|:', trackName)
    newTrackName = re.split('/|:', newTrackName)
    # The paths are concatenated directly with the entry names below, so the
    # extra '' argument presumably makes them end with a separator -- TODO
    # confirm against gcf.createOrigPath.
    origPath = gcf.createOrigPath(genome, trackName, '')
    newPath = gcf.createOrigPath(genome, newTrackName, '')

    filterFile = open(filterFn, 'r')
    wantedNames = set(line.strip().replace('_', ' ') for line in filterFile)
    filterFile.close()

    for subType in os.listdir(origPath):
        if subType.replace('_', ' ') not in wantedNames:
            continue
        print('copying %s' % subType)
        shutil.copytree(origPath + subType, newPath + subType, symlinks=True)
Beispiel #6
0
def createChromosomeFile(genome, chromNames, referToCollected=False):
    """genome chromNames"""
    # python quick/extra/CustomFuncCatalog.py CreateChromosomeFile mm9 'chr1, chr2, ...'"
    #
    # NOTE(review): the docstring reads like a command-line argument list and
    # may be consumed at runtime (e.g. as usage text), so it is kept verbatim.
    #
    # Writes 'chromosomes.category.bed' with one tab-separated line per
    # chromosome: name, '0', the length from GenomeInfo.getChrLen(), name.
    #
    # chromNames: comma-separated chromosome names; spaces are removed.
    # referToCollected: if true, the file is placed under the path from
    # createCollectedPath() instead of gcf.createOrigPath().

    chrList = chromNames.replace(' ', '').split(',')
    if referToCollected:
        from gold.util.CommonFunctions import createCollectedPath
        basePath = createCollectedPath(genome, GenomeInfo.getChrTrackName(genome))
    else:
        basePath = gcf.createOrigPath(genome, GenomeInfo.getChrTrackName(genome))

    # Why is this file a category.bed file?
    outFn = basePath + os.sep + 'chromosomes.category.bed'
    qcf.ensurePathExists(outFn)
    print('Creating: ' + outFn)

    # The with-block closes the file even if a chromosome length lookup fails.
    with open(outFn, 'w') as outFile:
        for chrom in chrList:
            chrLen = str(GenomeInfo.getChrLen(genome, chrom))
            # '\n' rather than os.linesep: the file is opened in text mode,
            # which already translates the line terminator per platform.
            outFile.write('\t'.join([chrom, '0', chrLen, chrom]) + '\n')
Beispiel #7
0
def compareRegulomeWithDirectPubmedTfData(pValThresholdReg, pValThresholdTf, useRowCount):
    "pValThresholdReg pValThresholdTf useRowCount"
    # NOTE(review): the docstring reads like a command-line argument list and
    # may be consumed at runtime (e.g. as usage text), so it is kept verbatim.
    #
    # Builds two boolean TF x disease tables -- one from a regulome p-value
    # table file, one from per-disease '.category.bed' files -- and prints the
    # (disease, TF) pairs set in both, followed by summary counts.
    #
    # pValThresholdReg: threshold (converted with float()) below which a cell
    #     of the regulome table counts as a hit.
    # pValThresholdTf: threshold (converted with float()) below which
    #     exp(column 5) of a TF file line counts as a hit.
    # useRowCount: the string 'True' or 'False'; selects the 'rowcount' or the
    #     'rowsum' variant of the regulome table.
    #
    # Raises AssertionError if useRowCount is neither 'True' nor 'False'.
    pValThresholdReg = float(pValThresholdReg)
    pValThresholdTf = float(pValThresholdTf)
    assert useRowCount in ['True', 'False']

    rowCountRegFn = STATIC_PATH + "/maps/final_tfbs_diseases_binary_average_euc_0.01_rowcount/data/Result_pval_table.txt"
    rowSumRegFn = STATIC_PATH + "/maps/final_tfbs_diseases_binary_average_euc_0.01_rowsum/data/Result_pval_table.txt"

    regFn = rowCountRegFn if ast.literal_eval(useRowCount) else rowSumRegFn

    # The comma after 'disease' was missing, which silently concatenated the
    # last two elements into a single track-name part.
    tfDir = gcf.createOrigPath('hg18', ['Private', 'disease', 'all diseases, hyperg., only TFs'])
    v2tfFn = '/projects/rrresearch/eivindto/v2tf-sort-ok.txt'

    # Each line of the v2tf file must hold exactly two whitespace-separated
    # tokens; the first is used as key of v2tf and the second as its value.
    with open(v2tfFn) as v2tfFile:
        v2tfPairs = [line.strip().split() for line in v2tfFile]
    v2tf = dict(v2tfPairs)
    tf2v = dict([(tf, v) for v, tf in v2tfPairs])

    sortedTfs = sorted(v2tf.values())
    tf2index = dict([(x, i) for i, x in enumerate(sortedTfs)])
    index2tf = dict([(i, x) for i, x in enumerate(sortedTfs)])

    # Lower-cased directory name -> actual directory name.
    tfDiseases = dict([(x.lower(), x) for x in os.listdir(tfDir)])

    with open(regFn) as regFile:
        # The first four lines are skipped; the fifth is the header line.
        for _ in range(4):
            regFile.readline()

        # The disease name of a header cell is the cell without its first and
        # last whitespace-separated token.
        regDiseases = [' '.join(x.split()[1:-1]).lower() for x in regFile.readline().strip().split('\t')]

        # Only diseases that also have a TF directory are kept. Because the
        # tables below have one column per *kept* disease, remember for every
        # column of the regulome file which table column it maps to (None if
        # the disease was dropped). Indexing the table directly by the file
        # column is only correct when no disease is dropped.
        combDiseases = []
        regCol2disIndex = []
        for regDisease in regDiseases:
            if regDisease in tfDiseases:
                regCol2disIndex.append(len(combDiseases))
                combDiseases.append(tfDiseases[regDisease])
            else:
                regCol2disIndex.append(None)

        disease2index = dict([(x, i) for i, x in enumerate(combDiseases)])
        index2disease = dict([(i, x) for i, x in enumerate(combDiseases)])

        regTable = numpy.zeros(shape=[len(v2tf), len(combDiseases)], dtype='bool')
        tfTable = numpy.zeros(shape=[len(v2tf), len(combDiseases)], dtype='bool')

        for line in regFile:
            line = line.strip()
            if line == '' or line[0] == '#':
                continue

            cols = line.split('\t')
            # NOTE(review): the row label is lower-cased before the lookup in
            # v2tf, whose keys are used as read from file -- confirm that the
            # keys in the v2tf file are lower-case.
            pwm = ' '.join(cols[0].split()[1:-1]).lower()
            if pwm not in v2tf:
                continue

            tfIndex = tf2index[v2tf[pwm]]
            for regCol, x in enumerate(cols[1:]):
                disIndex = regCol2disIndex[regCol]
                if disIndex is not None and float(x) < pValThresholdReg:
                    regTable[tfIndex, disIndex] = True

    for dis in combDiseases:
        with open(os.sep.join([tfDir, dis, dis + '.category.bed'])) as tfFile:
            for line in tfFile:
                cols = line.strip().split()
                # Column 5 is exponentiated before the comparison, so it is
                # presumably a natural-log p-value -- TODO confirm.
                if math.exp(float(cols[4])) < pValThresholdTf and cols[3] in tf2index:
                    tfTable[tf2index[cols[3]], disease2index[dis]] = True

    bothHits = regTable & tfTable
    regOnlyHits = regTable & ~tfTable
    tfOnlyHits = ~regTable & tfTable
    noHits = ~regTable & ~tfTable

    tfIndexes, disIndexes = numpy.where(bothHits)

    print('All hits in both:')
    for tfIndex, disIndex in zip(tfIndexes, disIndexes):
        print(index2disease[disIndex] + '\t' + tf2v[index2tf[tfIndex]] + '\t' + index2tf[tfIndex])

    print('#Diseases: %d' % len(combDiseases))
    print('#TFs: %d' % len(v2tf))
    print('#Hits in both: %d' % bothHits.sum())
    # Expected value is computed as (regulome only) * (TF only) / (none).
    print('#Hits in both (expected): %f' % (regOnlyHits.sum()*1.0*tfOnlyHits.sum()/noHits.sum()))
    print('#Hits in regulome only: %d' % regOnlyHits.sum())
    print('#Hits in TF & Pubgene only: %d' % tfOnlyHits.sum())
    print('#Hits in none: %d' % noHits.sum())