Beispiel #1
0
def get_info_from_fimo_output(fimo_path, writeobj, fimo_dic,
                              remove_repeats=False):
    '''
    Summarise a tab-separated FIMO output file, one entry per pattern name.

    The file is not assumed to be sorted, so every row is inspected.
    For each pattern name, fimo_dic[pattern_name] is a dictionary holding:
        'q-value'            list of q-values (strings, as read), one per
                             distinct sequence name
        'sequence name'      list of the distinct sequence names, in the
                             order they were first seen
        'pattern_name'       the pattern name itself (easier to retrieve
                             later)
        'median_q_value'     stats_functions.median of the q-values
        'n_motif_occurences' number of q-values kept

    Only the first row seen for a given (pattern name, sequence name)
    pair is kept; later rows for the same pair are skipped, wherever they
    occur in the file.

    Arguments:
        fimo_path: path to the FIMO text output. Its header row must
            contain the columns '#pattern name', 'sequence name' and
            'q-value'.
        writeobj: accepted for interface compatibility, not used.
        fimo_dic: dictionary to fill. It is modified in place; entries
            already present take part in the de-duplication and have
            their summary values recomputed.
        remove_repeats: accepted for interface compatibility, not used.
            Repeats are always removed, whatever value is passed.

    Returns:
        fimo_dic, the same object that was passed in.

    Raises:
        ValueError: if the file is empty or a required column is missing
            from the header.
    '''
    import sys

    # Def colname constants
    pattern_name_colname = '#pattern name'
    qval_colname = 'q-value'
    seq_name_colname = 'sequence name'
    pattern_name_subkey = 'pattern_name'
    qval_med_subkey = 'median_q_value'
    n_occurs_subkey = 'n_motif_occurences'

    # The csv module wants a binary handle on Python 2 but a text handle
    # opened with newline='' on Python 3.
    if sys.version_info[0] < 3:
        readfile = open(fimo_path, 'rb')
    else:
        readfile = open(fimo_path, 'r', newline='')

    # pattern name -> set of sequence names already stored, so that the
    # duplicate check does not have to scan the stored list for every row.
    seen_seq_names = {}

    with readfile:
        myreader = csv.reader(readfile, delimiter='\t')
        header = next(myreader, None)
        if header is None:
            raise ValueError(
                'FIMO output file is empty (no header row): %s' % (fimo_path,))
        # Resolve the column positions once instead of once per row.
        seq_name_idx = header.index(seq_name_colname)
        pattern_name_idx = header.index(pattern_name_colname)
        qval_idx = header.index(qval_colname)

        for row in myreader:
            seq_name = row[seq_name_idx]
            pattern_name = row[pattern_name_idx]
            qval = row[qval_idx]
            # Create new key for pattern name if it doesnt exist in dic,
            # with empty q-value and sequence name lists.
            if pattern_name not in fimo_dic:
                fimo_dic[pattern_name] = {qval_colname: [],
                                          seq_name_colname: []}
            pattern_info = fimo_dic[pattern_name]
            # Seed the lookup set from whatever the dictionary already
            # holds, so entries from earlier calls are honoured too.
            if pattern_name not in seen_seq_names:
                seen_seq_names[pattern_name] = \
                    set(pattern_info[seq_name_colname])
            seen = seen_seq_names[pattern_name]
            # Sequence already recorded for this pattern: move to next row.
            if seq_name in seen:
                continue
            seen.add(seq_name)
            pattern_info[qval_colname].append(qval)
            pattern_info[seq_name_colname].append(seq_name)

    # Get median q-value, n_motif_occurences and pattern name to dictionary.
    for pat_name, pattern_info in fimo_dic.items():
        pattern_info[pattern_name_subkey] = pat_name
        # q-values are stored as strings; convert before taking the median.
        qval_list = [float(qval) for qval in pattern_info[qval_colname]]
        pattern_info[qval_med_subkey] = stats_functions.median(qval_list)
        pattern_info[n_occurs_subkey] = len(qval_list)
    return fimo_dic
def get_info_from_fimo_output(fimo_path, writeobj, fimo_dic, remove_repeats=False):
    """
    Summarise a tab-separated FIMO output file, one entry per pattern name.

    The file is not assumed to be sorted, so every row is inspected.
    For each pattern name, fimo_dic[pattern_name] is a dictionary holding:
        "q-value"            list of q-values (strings, as read), one per
                             distinct sequence name
        "sequence name"      list of the distinct sequence names, in the
                             order they were first seen
        "pattern_name"       the pattern name itself (easier to retrieve
                             later)
        "median_q_value"     stats_functions.median of the q-values
        "n_motif_occurences" number of q-values kept

    Only the first row seen for a given (pattern name, sequence name)
    pair is kept; later rows for the same pair are skipped, wherever they
    occur in the file.

    Arguments:
        fimo_path: path to the FIMO text output. Its header row must
            contain the columns "#pattern name", "sequence name" and
            "q-value".
        writeobj: accepted for interface compatibility, not used.
        fimo_dic: dictionary to fill. It is modified in place; entries
            already present take part in the de-duplication and have
            their summary values recomputed.
        remove_repeats: accepted for interface compatibility, not used.
            Repeats are always removed, whatever value is passed.

    Returns:
        fimo_dic, the same object that was passed in.

    Raises:
        ValueError: if the file is empty or a required column is missing
            from the header.
    """
    import sys

    # Def colname constants
    pattern_name_colname = "#pattern name"
    qval_colname = "q-value"
    seq_name_colname = "sequence name"
    pattern_name_subkey = "pattern_name"
    qval_med_subkey = "median_q_value"
    n_occurs_subkey = "n_motif_occurences"

    # The csv module wants a binary handle on Python 2 but a text handle
    # opened with newline="" on Python 3.
    if sys.version_info[0] < 3:
        readfile = open(fimo_path, "rb")
    else:
        readfile = open(fimo_path, "r", newline="")

    # pattern name -> set of sequence names already stored, so that the
    # duplicate check does not have to scan the stored list for every row.
    seen_seq_names = {}

    with readfile:
        myreader = csv.reader(readfile, delimiter="\t")
        header = next(myreader, None)
        if header is None:
            raise ValueError("FIMO output file is empty (no header row): %s" % (fimo_path,))
        # Resolve the column positions once instead of once per row.
        seq_name_idx = header.index(seq_name_colname)
        pattern_name_idx = header.index(pattern_name_colname)
        qval_idx = header.index(qval_colname)

        for row in myreader:
            seq_name = row[seq_name_idx]
            pattern_name = row[pattern_name_idx]
            qval = row[qval_idx]
            # Create new key for pattern name if it doesnt exist in dic,
            # with empty q-value and sequence name lists.
            if pattern_name not in fimo_dic:
                fimo_dic[pattern_name] = {qval_colname: [], seq_name_colname: []}
            pattern_info = fimo_dic[pattern_name]
            # Seed the lookup set from whatever the dictionary already
            # holds, so entries from earlier calls are honoured too.
            if pattern_name not in seen_seq_names:
                seen_seq_names[pattern_name] = set(pattern_info[seq_name_colname])
            seen = seen_seq_names[pattern_name]
            # Sequence already recorded for this pattern: move to next row.
            if seq_name in seen:
                continue
            seen.add(seq_name)
            pattern_info[qval_colname].append(qval)
            pattern_info[seq_name_colname].append(seq_name)

    # Get median q-value, n_motif_occurences and pattern name to dictionary.
    for pat_name, pattern_info in fimo_dic.items():
        pattern_info[pattern_name_subkey] = pat_name
        # q-values are stored as strings; convert before taking the median.
        qval_list = [float(qval) for qval in pattern_info[qval_colname]]
        pattern_info[qval_med_subkey] = stats_functions.median(qval_list)
        pattern_info[n_occurs_subkey] = len(qval_list)
    return fimo_dic
Beispiel #3
0
def get_info_from_fimo_output2(fimo_path, writeobj, fimo_dic,
                               convert_to_fraction=False):
    '''
    Summarise a tab-separated FIMO output file, one entry per gene name.

    Second attempt at getting info from fimo: a gene name is counted
    only ONCE for a given sequence. The gene name is the text before the
    first comma of the pattern name (pattern names are expected to look
    like GENENAME,MOTIF,IorD).

    For each gene name, fimo_dic[gene_name] is a dictionary holding:
        'q-value'            list of q-values (strings, as read), one per
                             distinct sequence name
        'sequence name'      list of the distinct sequence names, in the
                             order they were first seen
        'pattern_name'       the gene name
        'median_q_value'     stats_functions.median of the q-values
        'n_motif_occurences' number of q-values kept, or that number
                             divided by the sequence count read from
                             fimo.html when convert_to_fraction is True

    Arguments:
        fimo_path: path to the FIMO text output. Its header row must
            contain the columns '#pattern name', 'sequence name' and
            'q-value'.
        writeobj: accepted for interface compatibility, not used.
        fimo_dic: dictionary to fill. It is modified in place; entries
            already present take part in the de-duplication and have
            their summary values recomputed.
        convert_to_fraction: True to store occurrences as a fraction of
            the number of sequences reported by
            get_number_of_sequences_from_fimohtml for the fimo.html file
            next to fimo_path; False to store the plain count. For any
            other value a message is printed for each gene and
            'n_motif_occurences' is not written.

    Returns:
        fimo_dic, the same object that was passed in.

    Raises:
        ValueError: if the file is empty or a required column is missing
            from the header.
    '''
    import sys

    # Def colname constants
    pattern_name_colname = '#pattern name'
    qval_colname = 'q-value'
    seq_name_colname = 'sequence name'
    pattern_name_subkey = 'pattern_name'
    qval_med_subkey = 'median_q_value'
    n_occurs_subkey = 'n_motif_occurences'

    # The csv module wants a binary handle on Python 2 but a text handle
    # opened with newline='' on Python 3.
    if sys.version_info[0] < 3:
        readfile = open(fimo_path, 'rb')
    else:
        readfile = open(fimo_path, 'r', newline='')

    # gene name -> set of sequence names already stored, so that the
    # duplicate check does not have to scan the stored list for every row.
    seen_seq_names = {}

    with readfile:
        myreader = csv.reader(readfile, delimiter='\t')
        header = next(myreader, None)
        if header is None:
            raise ValueError(
                'FIMO output file is empty (no header row): %s' % (fimo_path,))
        # Resolve the column positions once instead of once per row.
        seq_name_idx = header.index(seq_name_colname)
        pattern_name_idx = header.index(pattern_name_colname)
        qval_idx = header.index(qval_colname)

        for row in myreader:
            seq_name = row[seq_name_idx]
            qval = row[qval_idx]
            # Expect pattern name to be GENENAME,MOTIF,IorD: keep the
            # part before the first comma.
            gene_name = row[pattern_name_idx].split(',')[0]

            # Create new key for gene name if it doesnt exist in dic,
            # with empty q-value and sequence name lists.
            if gene_name not in fimo_dic:
                fimo_dic[gene_name] = {qval_colname: [],
                                       seq_name_colname: []}
            gene_info = fimo_dic[gene_name]
            # Seed the lookup set from whatever the dictionary already
            # holds, so entries from earlier calls are honoured too.
            if gene_name not in seen_seq_names:
                seen_seq_names[gene_name] = set(gene_info[seq_name_colname])
            seen = seen_seq_names[gene_name]
            # Sequence already recorded for this gene: move to next row.
            if seq_name in seen:
                continue
            seen.add(seq_name)
            gene_info[qval_colname].append(qval)
            gene_info[seq_name_colname].append(seq_name)

    # Equality (not truthiness) is kept on purpose: 1/0 are accepted like
    # True/False, anything else falls through to the message below.
    want_fraction = convert_to_fraction == True
    want_count = convert_to_fraction == False

    # The sequence count is the same for every gene, so read fimo.html
    # once, and only when there is something to divide.
    n_fasta_seqs = None
    if want_fraction and fimo_dic:
        fimo_html_path = os.path.join(os.path.dirname(fimo_path), 'fimo.html')
        # NOTE(review): rownumber=42 is presumably the row of fimo.html
        # that holds the sequence count - confirm against the helper.
        n_fasta_seqs = \
            get_number_of_sequences_from_fimohtml(fimo_html_path, rownumber=42)

    # Get median q-value, n_motif_occurences and pattern name to dictionary.
    for gene_name, gene_info in fimo_dic.items():
        gene_info[pattern_name_subkey] = gene_name
        # q-values are stored as strings; convert before taking the median.
        qval_list = [float(qval) for qval in gene_info[qval_colname]]
        gene_info[qval_med_subkey] = stats_functions.median(qval_list)
        n_occurences = len(qval_list)
        if want_fraction:
            # n_occurences divided by total fasta inputs
            gene_info[n_occurs_subkey] = float(n_occurences) / n_fasta_seqs
        elif want_count:
            # Just the occurences, no dividing by fasta seqs
            gene_info[n_occurs_subkey] = n_occurences
        else:
            print('Expected convert to fraction '
                  'to be True or False. %s found.' % (convert_to_fraction,))
    return fimo_dic
def get_info_from_fimo_output2(fimo_path, writeobj, fimo_dic, convert_to_fraction=False):
    """
    Summarise a tab-separated FIMO output file, one entry per gene name.

    Second attempt at getting info from fimo: a gene name is counted
    only ONCE for a given sequence. The gene name is the text before the
    first comma of the pattern name (pattern names are expected to look
    like GENENAME,MOTIF,IorD).

    For each gene name, fimo_dic[gene_name] is a dictionary holding:
        "q-value"            list of q-values (strings, as read), one per
                             distinct sequence name
        "sequence name"      list of the distinct sequence names, in the
                             order they were first seen
        "pattern_name"       the gene name
        "median_q_value"     stats_functions.median of the q-values
        "n_motif_occurences" number of q-values kept, or that number
                             divided by the sequence count read from
                             fimo.html when convert_to_fraction is True

    Arguments:
        fimo_path: path to the FIMO text output. Its header row must
            contain the columns "#pattern name", "sequence name" and
            "q-value".
        writeobj: accepted for interface compatibility, not used.
        fimo_dic: dictionary to fill. It is modified in place; entries
            already present take part in the de-duplication and have
            their summary values recomputed.
        convert_to_fraction: True to store occurrences as a fraction of
            the number of sequences reported by
            get_number_of_sequences_from_fimohtml for the fimo.html file
            next to fimo_path; False to store the plain count. For any
            other value a message is printed for each gene and
            "n_motif_occurences" is not written.

    Returns:
        fimo_dic, the same object that was passed in.

    Raises:
        ValueError: if the file is empty or a required column is missing
            from the header.
    """
    import sys

    # Def colname constants
    pattern_name_colname = "#pattern name"
    qval_colname = "q-value"
    seq_name_colname = "sequence name"
    pattern_name_subkey = "pattern_name"
    qval_med_subkey = "median_q_value"
    n_occurs_subkey = "n_motif_occurences"

    # The csv module wants a binary handle on Python 2 but a text handle
    # opened with newline="" on Python 3.
    if sys.version_info[0] < 3:
        readfile = open(fimo_path, "rb")
    else:
        readfile = open(fimo_path, "r", newline="")

    # gene name -> set of sequence names already stored, so that the
    # duplicate check does not have to scan the stored list for every row.
    seen_seq_names = {}

    with readfile:
        myreader = csv.reader(readfile, delimiter="\t")
        header = next(myreader, None)
        if header is None:
            raise ValueError("FIMO output file is empty (no header row): %s" % (fimo_path,))
        # Resolve the column positions once instead of once per row.
        seq_name_idx = header.index(seq_name_colname)
        pattern_name_idx = header.index(pattern_name_colname)
        qval_idx = header.index(qval_colname)

        for row in myreader:
            seq_name = row[seq_name_idx]
            qval = row[qval_idx]
            # Expect pattern name to be GENENAME,MOTIF,IorD: keep the
            # part before the first comma.
            gene_name = row[pattern_name_idx].split(",")[0]

            # Create new key for gene name if it doesnt exist in dic,
            # with empty q-value and sequence name lists.
            if gene_name not in fimo_dic:
                fimo_dic[gene_name] = {qval_colname: [], seq_name_colname: []}
            gene_info = fimo_dic[gene_name]
            # Seed the lookup set from whatever the dictionary already
            # holds, so entries from earlier calls are honoured too.
            if gene_name not in seen_seq_names:
                seen_seq_names[gene_name] = set(gene_info[seq_name_colname])
            seen = seen_seq_names[gene_name]
            # Sequence already recorded for this gene: move to next row.
            if seq_name in seen:
                continue
            seen.add(seq_name)
            gene_info[qval_colname].append(qval)
            gene_info[seq_name_colname].append(seq_name)

    # Equality (not truthiness) is kept on purpose: 1/0 are accepted like
    # True/False, anything else falls through to the message below.
    want_fraction = convert_to_fraction == True
    want_count = convert_to_fraction == False

    # The sequence count is the same for every gene, so read fimo.html
    # once, and only when there is something to divide.
    n_fasta_seqs = None
    if want_fraction and fimo_dic:
        fimo_html_path = os.path.join(os.path.dirname(fimo_path), "fimo.html")
        # NOTE(review): rownumber=42 is presumably the row of fimo.html
        # that holds the sequence count - confirm against the helper.
        n_fasta_seqs = get_number_of_sequences_from_fimohtml(fimo_html_path, rownumber=42)

    # Get median q-value, n_motif_occurences and pattern name to dictionary.
    for gene_name, gene_info in fimo_dic.items():
        gene_info[pattern_name_subkey] = gene_name
        # q-values are stored as strings; convert before taking the median.
        qval_list = [float(qval) for qval in gene_info[qval_colname]]
        gene_info[qval_med_subkey] = stats_functions.median(qval_list)
        n_occurences = len(qval_list)
        if want_fraction:
            # n_occurences divided by total fasta inputs
            gene_info[n_occurs_subkey] = float(n_occurences) / n_fasta_seqs
        elif want_count:
            # Just the occurences, no dividing by fasta seqs
            gene_info[n_occurs_subkey] = n_occurences
        else:
            print("Expected convert to fraction " "to be True or False. %s found." % (convert_to_fraction,))
    return fimo_dic