Python Fasta Exemples, chipsequtil.Fasta Python Exemples

Exemple #1

0

Afficher le fichier

Fichier : motif_fsa_scores.py Projet : hjanime/OmicsIntegrator

def main():
    '''
    Command-line entry point: score motifs over the sequences of one FASTA file.

    Parses the options, puts the directory given by --utilpath at the front of
    sys.path so that chipsequtil can be imported, loads the FASTA file named by
    the single positional argument (optionally reduced with reduce_fasta when
    --genemappingfile is given) and passes everything on to motif_matrix.
    '''
    # The chipsequtil modules are published as module-level names.
    global MotifTools
    global Fasta

    usage = "usage: %prog [opts] fasta_file"
    srcdir = os.path.join(progdir, '../src')

    # (flag, keyword arguments) pairs, registered in this order.
    # NOTE: a '--logistic' switch used to exist here and is disabled.
    option_table = [
        ("--motif", dict(
            dest="motif",
            default=os.path.join(progdir, "../data/matrix_files/vertebrates_clustered_motifs.tamo"),
            help='The .tamo formatted motif file to use for motif matching and scoring')),
        ('--scores', dict(
            dest='pkl',
            default=os.path.join(progdir, '../data/matrix_files/motif_thresholds.pkl'),
            help='PKL file of matrix score thresholds')),
        ('--ids', dict(
            dest='ids',
            default=os.path.join(progdir, '../data/matrix_files/vertebrates_clustered_motifs_mIDs.txt'),
            help='List of Exemplar motifs in motif cluster')),
        ('--genemappingfile', dict(
            dest='gene_file',
            default='',
            help='File indicating which regions are mapped to genes, enabling the reduction of the FASTA file for gene-relevant regions')),
        ("--genome", dict(
            dest="genome",
            default='mm9',
            help='The genome build that you are using, used to estimate binding site priors')),
        ('--utilpath', dict(
            dest='addpath',
            default=srcdir,
            help='Destination of chipsequtil library, Default=%default')),
        ('--genelist', dict(
            dest='genelist',
            default='',
            help='List of genes (will select first column if multiple) to include in scan based on --genemappingfile')),
        ("--outfile", dict(dest="outfile")),
        ('--threads', dict(
            dest='threads',
            type='string',
            default='4',
            help='Set number of threads if using logistic scoring')),
        ('--scale', dict(dest='typ', type='string', default='6')),
    ]

    parser = OptionParser(usage)
    for flag, settings in option_table:
        parser.add_option(flag, **settings)

    opts, args = parser.parse_args()
    if len(args) != 1:
        parser.error("incorrect number of arguments")
    fasta_path = args[0]

    ##make chipsequtil/TAMO importable before importing from it
    sys.path.insert(0, opts.addpath)
    from chipsequtil import motiftools as MotifTools
    from chipsequtil import Fasta

    # Identity key function: sequences are keyed by their header as given.
    sequences = Fasta.load(fasta_path, key_func=lambda header: header)
    if opts.gene_file != '':
        print('Reducing FASTA file to only contain sequences from ' + opts.gene_file)
        sequences = reduce_fasta(sequences, opts.gene_file, opts.genelist)

    # --threads and --scale arrive as strings and are converted here.
    motif_matrix(sequences,
                 opts.motif,
                 opts.outfile,
                 opts.genome,
                 opts.ids,
                 opts.pkl,
                 int(opts.threads),
                 typ=float(opts.typ))

Exemple #2

0

Afficher le fichier

Fichier : get_window_binding_matrix.py Projet : aabaker99/OmicsIntegrator

def _region_midpoint(region):
    '''
    Return 'chrom:midpoint' for one FASTA region identifier, or None.

    Two identifier layouts are handled: 'chrom:low-high' and the
    underscore-separated 'genome_chrom_low_high_strand'.  Identifiers without
    a ':' that contain 'random' are not parsed and yield None.
    '''
    if ':' in region:
        chrom, span = region.split(':')
        low, high = span.split('-')
    elif 'random' not in region:
        _genome, chrom, low, high, _strand = region.split('_')
    else:
        return None
    low = int(low)
    high = int(high)
    # Floor division keeps the midpoint an integer on Python 2 and 3 alike.
    return chrom + ':' + str(low + (high - low) // 2)


def build_annotated_tgm(closest_gene_output,distance_to_tss,logistic_score_output,fasta_file,motif_ids,makeWindow=True,tgm_file='',do_pkl=True):
    '''
    Build a TF-by-gene score matrix from a motif score file and FASTA headers.

    For every motif (row of the score file) and every gene named in a FASTA
    header, the matrix keeps the score with the largest absolute value among
    the regions assigned to that gene.  The matrix, the gene ids and the TF
    ids are written next to each other as <tgm>, <tgm>_geneids.txt and
    <tgm>_tfids.txt.

    Parameters:
      closest_gene_output   -- not read by this version (the code that parsed
                               it is disabled); kept so callers keep working
      distance_to_tss       -- string, only used to build the default output
                               file name
      logistic_score_output -- text matrix read with np.loadtxt; one row per
                               motif, indexed by position in the FASTA regions
      fasta_file            -- FASTA file; a header with three ';'-separated
                               (or, without ';', whitespace-separated) fields
                               supplies the gene name as its third field
      motif_ids             -- comma-separated list of files holding one motif
                               name per line; names of additional files are
                               appended with '.' when the line counts match
      makeWindow            -- selects the '_bpWindow' or '_bpUpstream' suffix
                               of the default output file name
      tgm_file              -- output matrix path; '' derives it from
                               logistic_score_output
      do_pkl                -- when true, run zipTgms.py on the three output
                               files and return the .pkl path

    Returns the .pkl path when do_pkl is true, otherwise the matrix path.
    '''
    from chipsequtil import Fasta
    import subprocess

    ##get fasta file events, since these are columns in the logistic_score matrix
    seq_ids = Fasta.load(fasta_file, key_func=lambda x: x)

    ##region midpoints in the order the headers are iterated; the position in
    ##this list is used as the column index into the score file
    # NOTE(review): skipped 'random' regions shift later positions relative to
    # the FASTA order -- confirm the score file skips the same regions.
    seq_mids = []
    filtered_events = {}  ##gene name attached to each region midpoint
    for header in seq_ids.keys():
        fields = header.split(';')
        if len(fields) == 1:
            fields = header.split()
        region_mid = _region_midpoint(fields[0])
        if region_mid is None:
            # Skipped region: previously the gene name of such a header was
            # filed under the midpoint left over from the preceding header.
            continue
        seq_mids.append(region_mid)
        if len(fields) == 3:
            filtered_events[region_mid] = fields[2]
    print('Found %d events, of which %d have gene names' % (len(seq_mids), len(filtered_events)))

    ##gene assignment from closest_gene_output is disabled in this version;
    ##gene names come from the FASTA headers only

    gene_names = list(set(filtered_events.values()))
    print(gene_names[0:10])

    ##first file names every element of the TAMO file
    mi_files = motif_ids.split(',')
    with open(mi_files[0], 'rU') as handle:
        all_tf_names = [line.strip() for line in handle]

    ##additional files contribute extra ids for the same elements
    for extra_path in mi_files[1:]:
        try:
            with open(extra_path, 'rU') as handle:
                extra_names = [line.strip() for line in handle]
        except (IOError, OSError):
            print('Error opening file: %s' % (sys.exc_info()[0],))
            print('Check to make sure file exists at %s' % (extra_path))
            raise
        if len(extra_names) == len(all_tf_names):
            all_tf_names = ['.'.join(pair) for pair in zip(all_tf_names, extra_names)]

    ##clean up TF names: drop empty parts and parts containing '$', collapse
    ##duplicates (part order follows set iteration order, as before)
    cleaned_tf_names = []
    for name in all_tf_names:
        parts = set([p for p in name.split('.') if '$' not in p and p != ''])
        if len(parts) == 0:
            parts = name.split('.')
        cleaned_tf_names.append('.'.join(parts))
    all_tf_names = cleaned_tf_names

    ##load motif matrix scanning output that maps matrices to regions
    print('Loading complete motif score file...')
    event_scores = np.loadtxt(logistic_score_output)
    print('\t...Loaded!')

    newmat = np.zeros((len(all_tf_names), len(gene_names)), dtype='float')

    if makeWindow:
        distance_to_tss = distance_to_tss + '_bpWindow'
    else:
        distance_to_tss = distance_to_tss + '_bpUpstream'

    # The dots are escaped so only a literal extension is rewritten.
    if tgm_file == '':
        tgm_file = re.sub(r'\.txt', '_' + distance_to_tss + '.tgm', os.path.basename(logistic_score_output))
    if do_pkl:
        pkl_file = re.sub(r'\.tgm', '.pkl', tgm_file)
    else:
        pkl_file = ''

    # Row of the first occurrence of each TF name / column of each gene,
    # matching what list.index returned, without the linear scans.
    tf_row = {}
    for row_index, name in enumerate(all_tf_names):
        tf_row.setdefault(name, row_index)
    gene_col = dict((gene, col_index) for col_index, gene in enumerate(gene_names))

    # (score column, matrix column) for every region that carries a gene name;
    # regions without one are left out instead of raising KeyError.
    mapped = [(k, gene_col[filtered_events[region_mid]])
              for k, region_mid in enumerate(seq_mids)
              if region_mid in filtered_events]

    ##populate matrix with greatest absolute score for each gene/tf combo
    for ind, motif_scores in enumerate(event_scores):
        row_index = tf_row[all_tf_names[ind]]
        for k, col_index in mapped:
            score = float(motif_scores[k])
            if np.abs(score) > np.abs(newmat[row_index, col_index]):
                newmat[row_index, col_index] = score

    ###save these intermediate files for debugging purposes
    np.savetxt(tgm_file, newmat)
    gin = re.sub(r'\.tgm', '_geneids.txt', tgm_file)
    tin = re.sub(r'\.tgm', '_tfids.txt', tgm_file)

    for out_path, names in ((gin, gene_names), (tin, all_tf_names)):
        try:
            with open(out_path, 'w') as handle:
                handle.writelines([n + '\n' for n in names])
        except (IOError, OSError):
            print('Error opening file: %s' % (sys.exc_info()[0],))
            print('Check to make sure file can be written at %s' % (out_path))
            raise

    if pkl_file != '':
        zip_script = os.path.join(progdir, 'zipTgms.py')
        zipcmd = 'python ' + zip_script + ' ' + tgm_file + ' ' + tin + ' ' + gin + ' --pkl=' + pkl_file
        print('Compressing matrix file into pkl')
        print(zipcmd)
        # Argument list instead of a shell string, so paths with spaces work.
        subprocess.call(['python', zip_script, tgm_file, tin, gin, '--pkl=' + pkl_file])
        return pkl_file
    return tgm_file

Exemple #3

0

Afficher le fichier

def _region_midpoint(region):
    '''
    Return 'chrom:midpoint' for one FASTA region identifier, or None.

    Two identifier layouts are handled: 'chrom:low-high' and the
    underscore-separated 'genome_chrom_low_high_strand'.  Identifiers without
    a ':' that contain 'random' are not parsed and yield None.
    '''
    if ':' in region:
        chrom, span = region.split(':')
        low, high = span.split('-')
    elif 'random' not in region:
        _genome, chrom, low, high, _strand = region.split('_')
    else:
        return None
    low = int(low)
    high = int(high)
    # Floor division keeps the midpoint an integer on Python 2 and 3 alike.
    return chrom + ':' + str(low + (high - low) // 2)


def build_annotated_tgm(closest_gene_output,
                        distance_to_tss,
                        logistic_score_output,
                        fasta_file,
                        motif_ids,
                        makeWindow=True,
                        tgm_file='',
                        do_pkl=True):
    '''
    Build a TF-by-gene score matrix from a motif score file and a gene mapping.

    Events of closest_gene_output that pass the distance test are matched, by
    'chrom:midpoint', to the regions of the FASTA file.  For every motif (row
    of the score file) and every gene, the matrix keeps the score with the
    largest absolute value among the matched regions of that gene.  The
    matrix, the gene ids and the TF ids are written as <tgm>,
    <tgm>_geneids.txt and <tgm>_tfids.txt.

    Parameters:
      closest_gene_output   -- tab-delimited file with a header row; columns
                               'geneSymbol', 'dist from feature' and 'score'
                               are located by name, while chromosome, start
                               and end are read from the 3rd-5th fields
      distance_to_tss       -- string; '' disables the distance test,
                               otherwise it is converted with int()
      logistic_score_output -- text matrix read with np.loadtxt; one row per
                               motif, indexed by position in the FASTA regions
      fasta_file            -- FASTA file whose headers name the regions
      motif_ids             -- file with one motif name per line
      makeWindow            -- compare the absolute distance (window) and
                               select the '_bpWindow' file name suffix
                               instead of '_bpUpstream'
      tgm_file              -- output matrix path; '' derives it from
                               logistic_score_output
      do_pkl                -- when true, run zipTgms.py on the three output
                               files and return the .pkl path

    Returns the .pkl path when do_pkl is true, otherwise the matrix path.
    '''
    from chipsequtil import Fasta
    import subprocess

    ##get fasta file events, since these are columns in the logistic_score matrix
    seq_ids = Fasta.load(fasta_file)

    ##region midpoints in the order the headers are iterated; the position in
    ##this list is used as the column index into the score file
    # NOTE(review): skipped 'random' regions shift later positions relative to
    # the FASTA order -- confirm the score file skips the same regions.
    seq_mids = []
    for header in seq_ids.keys():
        fields = header.split(';')
        if len(fields) == 1:
            fields = header.split(' ')
        region_mid = _region_midpoint(fields[0])
        if region_mid is not None:
            seq_mids.append(region_mid)

    # Position of the first occurrence of each midpoint (what list.index
    # returned), so the per-row lookup is no longer a linear scan.
    first_index = {}
    for position, region_mid in enumerate(seq_mids):
        first_index.setdefault(region_mid, position)

    filtered_fc = {}  ##score of events within window, not used further below
    filtered_events = {}  ##gene name of closest gene to event within window
    event_indexes = []  ##positions in seq_mids of the retained events

    with open(closest_gene_output, 'rU') as handle:
        cgo = handle.readlines()
    inds = cgo[0].strip().split('\t')
    for line in cgo[1:]:
        cells = line.strip().split('\t')
        gene = cells[inds.index('geneSymbol')]

        start = int(cells[3])
        end = int(cells[4])
        mid = cells[2] + ':' + str(start + (end - start) // 2)
        dist = cells[inds.index('dist from feature')]
        sv = cells[inds.index('score')]
        if sv == '':
            # Rows without a score are skipped.  The original bare `next` was
            # a no-op, so such rows reused the previous row's score.
            continue
        fc = float(sv)

        #check absolute distance if we're doing a window, or negative distance if we're looking upstream
        # NOTE(review): the last clause is not guarded by `not makeWindow`, so
        # in window mode events far downstream still pass -- confirm intent.
        if distance_to_tss == '' or (
                makeWindow and np.absolute(int(dist)) < int(distance_to_tss)
        ) or int(dist) > (-1 * int(distance_to_tss)):
            filtered_events[mid] = gene
            if mid in first_index:
                event_indexes.append(first_index[mid])
            filtered_fc[mid] = fc

    print('Got ' + str(len(filtered_events)) + ' events within ' + distance_to_tss + ' bases out of ' + str(len(cgo)))
    print('Got ' + str(len(event_indexes)) + ' of those events from fasta')

    gene_names = list(set(filtered_events.values()))

    with open(motif_ids, 'rU') as handle:
        all_tf_names = [entry.strip() for entry in handle]

    ##load motif matrix scanning output that maps matrices to regions
    print('Loading complete motif score file...')
    event_scores = np.loadtxt(logistic_score_output)
    print('Loaded motif score file')

    newmat = np.zeros((len(all_tf_names), len(gene_names)), dtype='float')

    if makeWindow:
        distance_to_tss = distance_to_tss + '_bpWindow'
    else:
        distance_to_tss = distance_to_tss + '_bpUpstream'

    # The dots are escaped so only a literal extension is rewritten.
    if tgm_file == '':
        tgm_file = re.sub(r'\.txt', '_' + distance_to_tss + '.tgm',
                          os.path.basename(logistic_score_output))
    if do_pkl:
        pkl_file = re.sub(r'\.tgm', '.pkl', tgm_file)
    else:
        pkl_file = ''

    event_indexes.sort()

    # Row of the first occurrence of each TF name / column of each gene,
    # matching what list.index returned, without the linear scans.
    tf_row = {}
    for row_index, name in enumerate(all_tf_names):
        tf_row.setdefault(name, row_index)
    gene_col = dict((gene, col_index) for col_index, gene in enumerate(gene_names))

    # (score column, matrix column) of every retained event, computed once.
    mapped = [(k, gene_col[filtered_events[seq_mids[k]]]) for k in event_indexes]

    ##populate matrix with greatest absolute score for each gene/tf combo
    for ind, motif_scores in enumerate(event_scores):
        row_index = tf_row[all_tf_names[ind]]
        for k, col_index in mapped:
            score = float(motif_scores[k])
            if np.abs(score) > np.abs(newmat[row_index, col_index]):
                newmat[row_index, col_index] = score

    ###save these intermediate files for debugging purposes
    np.savetxt(tgm_file, newmat)
    gin = re.sub(r'\.tgm', '_geneids.txt', tgm_file)
    tin = re.sub(r'\.tgm', '_tfids.txt', tgm_file)

    with open(gin, 'w') as handle:
        handle.writelines([g + '\n' for g in gene_names])
    with open(tin, 'w') as handle:
        handle.writelines([t + '\n' for t in all_tf_names])

    if pkl_file != '':
        zip_script = os.path.join(progdir, 'zipTgms.py')
        zipcmd = 'python ' + zip_script + ' ' + tgm_file + ' ' + tin + ' ' + gin + ' --pkl=' + pkl_file
        print('Compressing matrix file into pkl')
        print(zipcmd)
        # Argument list instead of a shell string, so paths with spaces work.
        subprocess.call(['python', zip_script, tgm_file, tin, gin, '--pkl=' + pkl_file])
        return pkl_file
    return tgm_file

Exemple #4

0

Afficher le fichier

Fichier : get_window_binding_matrix.py Projet : hjanime/OmicsIntegrator

def _region_midpoint(region):
    '''
    Return 'chrom:midpoint' for one FASTA region identifier, or None.

    Two identifier layouts are handled: 'chrom:low-high' and the
    underscore-separated 'genome_chrom_low_high_strand'.  Identifiers without
    a ':' that contain 'random' are not parsed and yield None.
    '''
    if ':' in region:
        chrom, span = region.split(':')
        low, high = span.split('-')
    elif 'random' not in region:
        _genome, chrom, low, high, _strand = region.split('_')
    else:
        return None
    low = int(low)
    high = int(high)
    # Floor division keeps the midpoint an integer on Python 2 and 3 alike.
    return chrom + ':' + str(low + (high - low) // 2)


def build_annotated_tgm(closest_gene_output,
                        distance_to_tss,
                        logistic_score_output,
                        fasta_file,
                        motif_ids,
                        makeWindow=True,
                        tgm_file='',
                        do_pkl=True):
    '''
    Build a TF-by-gene score matrix from a motif score file and FASTA headers.

    For every motif (row of the score file) and every gene named in a FASTA
    header, the matrix keeps the score with the largest absolute value among
    the regions assigned to that gene.  The matrix, the gene ids and the TF
    ids are written next to each other as <tgm>, <tgm>_geneids.txt and
    <tgm>_tfids.txt.

    Parameters:
      closest_gene_output   -- not read by this version (the code that parsed
                               it is disabled); kept so callers keep working
      distance_to_tss       -- string, only used to build the default output
                               file name
      logistic_score_output -- text matrix read with np.loadtxt; one row per
                               motif, indexed by position in the FASTA regions
      fasta_file            -- FASTA file; a header with three ';'-separated
                               (or, without ';', whitespace-separated) fields
                               supplies the gene name as its third field
      motif_ids             -- comma-separated list of files holding one motif
                               name per line; names of additional files are
                               appended with '.' when the line counts match
      makeWindow            -- selects the '_bpWindow' or '_bpUpstream' suffix
                               of the default output file name
      tgm_file              -- output matrix path; '' derives it from
                               logistic_score_output
      do_pkl                -- when true, run zipTgms.py on the three output
                               files and return the .pkl path

    Returns the .pkl path when do_pkl is true, otherwise the matrix path.
    '''
    from chipsequtil import Fasta
    import subprocess

    ##get fasta file events, since these are columns in the logistic_score matrix
    seq_ids = Fasta.load(fasta_file, key_func=lambda x: x)

    ##region midpoints in the order the headers are iterated; the position in
    ##this list is used as the column index into the score file
    # NOTE(review): skipped 'random' regions shift later positions relative to
    # the FASTA order -- confirm the score file skips the same regions.
    seq_mids = []
    filtered_events = {}  ##gene name attached to each region midpoint
    for header in seq_ids.keys():
        fields = header.split(';')
        if len(fields) == 1:
            fields = header.split()
        region_mid = _region_midpoint(fields[0])
        if region_mid is None:
            # Skipped region: previously the gene name of such a header was
            # filed under the midpoint left over from the preceding header.
            continue
        seq_mids.append(region_mid)
        if len(fields) == 3:
            filtered_events[region_mid] = fields[2]
    print('Found %d events, of which %d have gene names' % (
        len(seq_mids), len(filtered_events)))

    ##gene assignment from closest_gene_output is disabled in this version;
    ##gene names come from the FASTA headers only

    gene_names = list(set(filtered_events.values()))
    print(gene_names[0:10])

    ##first file names every element of the TAMO file
    mi_files = motif_ids.split(',')
    with open(mi_files[0], 'rU') as handle:
        all_tf_names = [line.strip() for line in handle]

    ##additional files contribute extra ids for the same elements
    for extra_path in mi_files[1:]:
        try:
            with open(extra_path, 'rU') as handle:
                extra_names = [line.strip() for line in handle]
        except (IOError, OSError):
            print('Error opening file: %s' % (sys.exc_info()[0],))
            print('Check to make sure file exists at %s' % (extra_path))
            raise
        if len(extra_names) == len(all_tf_names):
            all_tf_names = [
                '.'.join(pair) for pair in zip(all_tf_names, extra_names)
            ]

    ##clean up TF names: drop empty parts and parts containing '$', collapse
    ##duplicates (part order follows set iteration order, as before)
    cleaned_tf_names = []
    for name in all_tf_names:
        parts = set([p for p in name.split('.') if '$' not in p and p != ''])
        if len(parts) == 0:
            parts = name.split('.')
        cleaned_tf_names.append('.'.join(parts))
    all_tf_names = cleaned_tf_names

    ##load motif matrix scanning output that maps matrices to regions
    print('Loading complete motif score file...')
    event_scores = np.loadtxt(logistic_score_output)
    print('\t...Loaded!')

    newmat = np.zeros((len(all_tf_names), len(gene_names)), dtype='float')

    if makeWindow:
        distance_to_tss = distance_to_tss + '_bpWindow'
    else:
        distance_to_tss = distance_to_tss + '_bpUpstream'

    # The dots are escaped so only a literal extension is rewritten.
    if tgm_file == '':
        tgm_file = re.sub(r'\.txt', '_' + distance_to_tss + '.tgm',
                          os.path.basename(logistic_score_output))
    if do_pkl:
        pkl_file = re.sub(r'\.tgm', '.pkl', tgm_file)
    else:
        pkl_file = ''

    # Row of the first occurrence of each TF name / column of each gene,
    # matching what list.index returned, without the linear scans.
    tf_row = {}
    for row_index, name in enumerate(all_tf_names):
        tf_row.setdefault(name, row_index)
    gene_col = dict(
        (gene, col_index) for col_index, gene in enumerate(gene_names))

    # (score column, matrix column) for every region that carries a gene name;
    # regions without one are left out instead of raising KeyError.
    mapped = [(k, gene_col[filtered_events[region_mid]])
              for k, region_mid in enumerate(seq_mids)
              if region_mid in filtered_events]

    ##populate matrix with greatest absolute score for each gene/tf combo
    for ind, motif_scores in enumerate(event_scores):
        row_index = tf_row[all_tf_names[ind]]
        for k, col_index in mapped:
            score = float(motif_scores[k])
            if np.abs(score) > np.abs(newmat[row_index, col_index]):
                newmat[row_index, col_index] = score

    ###save these intermediate files for debugging purposes
    np.savetxt(tgm_file, newmat)
    gin = re.sub(r'\.tgm', '_geneids.txt', tgm_file)
    tin = re.sub(r'\.tgm', '_tfids.txt', tgm_file)

    for out_path, names in ((gin, gene_names), (tin, all_tf_names)):
        try:
            with open(out_path, 'w') as handle:
                handle.writelines([n + '\n' for n in names])
        except (IOError, OSError):
            print('Error opening file: %s' % (sys.exc_info()[0],))
            print('Check to make sure file can be written at %s' % (out_path))
            raise

    if pkl_file != '':
        zip_script = os.path.join(progdir, 'zipTgms.py')
        zipcmd = 'python ' + zip_script + ' ' + tgm_file + ' ' + tin + ' ' + gin + ' --pkl=' + pkl_file
        print('Compressing matrix file into pkl')
        print(zipcmd)
        # Argument list instead of a shell string, so paths with spaces work.
        subprocess.call(
            ['python', zip_script, tgm_file, tin, gin, '--pkl=' + pkl_file])
        return pkl_file
    return tgm_file

Exemple #5

0

Afficher le fichier

Fichier : get_window_binding_matrix.py Projet : sgosline/garnet-deprecated

def _region_midpoint(region):
    '''
    Return 'chrom:midpoint' for one FASTA region identifier, or None.

    Two identifier layouts are handled: 'chrom:low-high' and the
    underscore-separated 'genome_chrom_low_high_strand'.  Identifiers without
    a ':' that contain 'random' are not parsed and yield None.
    '''
    if ':' in region:
        chrom, span = region.split(':')
        low, high = span.split('-')
    elif 'random' not in region:
        _genome, chrom, low, high, _strand = region.split('_')
    else:
        return None
    low = int(low)
    high = int(high)
    # Floor division keeps the midpoint an integer on Python 2 and 3 alike.
    return chrom + ':' + str(low + (high - low) // 2)


def build_annotated_tgm(closest_gene_output,distance_to_tss,logistic_score_output,fasta_file,motif_ids,makeWindow=True,tgm_file='',do_pkl=True):
    '''
    Build a TF-by-gene score matrix from a motif score file and a gene mapping.

    Events of closest_gene_output that pass the distance test are matched, by
    'chrom:midpoint', to the regions of the FASTA file.  For every motif (row
    of the score file) and every gene, the matrix keeps the score with the
    largest absolute value among the matched regions of that gene.  The
    matrix, the gene ids and the TF ids are written as <tgm>,
    <tgm>_geneids.txt and <tgm>_tfids.txt.

    Parameters:
      closest_gene_output   -- tab-delimited file with a header row; columns
                               'geneSymbol', 'dist from feature' and 'score'
                               are located by name, while chromosome, start
                               and end are read from the 3rd-5th fields
      distance_to_tss       -- string; '' disables the distance test,
                               otherwise it is converted with int()
      logistic_score_output -- text matrix read with np.loadtxt; one row per
                               motif, indexed by position in the FASTA regions
      fasta_file            -- FASTA file whose headers name the regions
      motif_ids             -- file with one motif name per line
      makeWindow            -- compare the absolute distance (window) and
                               select the '_bpWindow' file name suffix
                               instead of '_bpUpstream'
      tgm_file              -- output matrix path; '' derives it from
                               logistic_score_output
      do_pkl                -- when true, run zipTgms.py on the three output
                               files and return the .pkl path

    Returns the .pkl path when do_pkl is true, otherwise the matrix path.
    '''
    from chipsequtil import Fasta
    import subprocess

    ##get fasta file events, since these are columns in the logistic_score matrix
    seq_ids = Fasta.load(fasta_file)

    ##region midpoints in the order the headers are iterated; the position in
    ##this list is used as the column index into the score file
    # NOTE(review): skipped 'random' regions shift later positions relative to
    # the FASTA order -- confirm the score file skips the same regions.
    seq_mids = []
    for header in seq_ids.keys():
        fields = header.split(';')
        if len(fields) == 1:
            fields = header.split(' ')
        region_mid = _region_midpoint(fields[0])
        if region_mid is not None:
            seq_mids.append(region_mid)

    # Position of the first occurrence of each midpoint (what list.index
    # returned), so the per-row lookup is no longer a linear scan.
    first_index = {}
    for position, region_mid in enumerate(seq_mids):
        first_index.setdefault(region_mid, position)

    filtered_fc = {}  ##score of events within window, not used further below
    filtered_events = {}  ##gene name of closest gene to event within window
    event_indexes = []  ##positions in seq_mids of the retained events

    with open(closest_gene_output, 'rU') as handle:
        cgo = handle.readlines()
    inds = cgo[0].strip().split('\t')
    for line in cgo[1:]:
        cells = line.strip().split('\t')
        gene = cells[inds.index('geneSymbol')]

        start = int(cells[3])
        end = int(cells[4])
        mid = cells[2] + ':' + str(start + (end - start) // 2)
        dist = cells[inds.index('dist from feature')]
        sv = cells[inds.index('score')]
        if sv == '':
            # Rows without a score are skipped.  The original bare `next` was
            # a no-op, so such rows reused the previous row's score.
            continue
        fc = float(sv)

        #check absolute distance if we're doing a window, or negative distance if we're looking upstream
        # NOTE(review): the last clause is not guarded by `not makeWindow`, so
        # in window mode events far downstream still pass -- confirm intent.
        if distance_to_tss == '' or (makeWindow and np.absolute(int(dist)) < int(distance_to_tss)) or int(dist) > (-1 * int(distance_to_tss)):
            filtered_events[mid] = gene
            if mid in first_index:
                event_indexes.append(first_index[mid])
            filtered_fc[mid] = fc

    print('Got ' + str(len(filtered_events)) + ' events within ' + distance_to_tss + ' bases out of ' + str(len(cgo)))
    print('Got ' + str(len(event_indexes)) + ' of those events from fasta')

    gene_names = list(set(filtered_events.values()))

    with open(motif_ids, 'rU') as handle:
        all_tf_names = [entry.strip() for entry in handle]

    ##load motif matrix scanning output that maps matrices to regions
    print('Loading complete motif score file...')
    event_scores = np.loadtxt(logistic_score_output)
    print('Loaded motif score file')

    newmat = np.zeros((len(all_tf_names), len(gene_names)), dtype='float')

    if makeWindow:
        distance_to_tss = distance_to_tss + '_bpWindow'
    else:
        distance_to_tss = distance_to_tss + '_bpUpstream'

    # The dots are escaped so only a literal extension is rewritten.
    if tgm_file == '':
        tgm_file = re.sub(r'\.txt', '_' + distance_to_tss + '.tgm', os.path.basename(logistic_score_output))
    if do_pkl:
        pkl_file = re.sub(r'\.tgm', '.pkl', tgm_file)
    else:
        pkl_file = ''

    event_indexes.sort()

    # Row of the first occurrence of each TF name / column of each gene,
    # matching what list.index returned, without the linear scans.
    tf_row = {}
    for row_index, name in enumerate(all_tf_names):
        tf_row.setdefault(name, row_index)
    gene_col = dict((gene, col_index) for col_index, gene in enumerate(gene_names))

    # (score column, matrix column) of every retained event, computed once.
    mapped = [(k, gene_col[filtered_events[seq_mids[k]]]) for k in event_indexes]

    ##populate matrix with greatest absolute score for each gene/tf combo
    for ind, motif_scores in enumerate(event_scores):
        row_index = tf_row[all_tf_names[ind]]
        for k, col_index in mapped:
            score = float(motif_scores[k])
            if np.abs(score) > np.abs(newmat[row_index, col_index]):
                newmat[row_index, col_index] = score

    ###save these intermediate files for debugging purposes
    np.savetxt(tgm_file, newmat)
    gin = re.sub(r'\.tgm', '_geneids.txt', tgm_file)
    tin = re.sub(r'\.tgm', '_tfids.txt', tgm_file)

    with open(gin, 'w') as handle:
        handle.writelines([g + '\n' for g in gene_names])
    with open(tin, 'w') as handle:
        handle.writelines([t + '\n' for t in all_tf_names])

    if pkl_file != '':
        zip_script = os.path.join(progdir, 'zipTgms.py')
        zipcmd = 'python ' + zip_script + ' ' + tgm_file + ' ' + tin + ' ' + gin + ' --pkl=' + pkl_file
        print('Compressing matrix file into pkl')
        print(zipcmd)
        # Argument list instead of a shell string, so paths with spaces work.
        subprocess.call(['python', zip_script, tgm_file, tin, gin, '--pkl=' + pkl_file])
        return pkl_file
    return tgm_file