Example 1
def main():
    parser = argparse.ArgumentParser(description='')
    parser.add_argument("-i",
                        "--input",
                        dest="infile",
                        required=True,
                        help="input file")
    parser.add_argument("-n",
                        "--name",
                        dest="name",
                        required=True,
                        help="name of motif")
    args = parser.parse_args()

    name = args.name
    with open(args.infile) as handle:
        m = motifs.read(handle, "pfm")
        pwm = m.counts.normalize(pseudocounts={
            "A": 0.6,
            "C": 0.4,
            "G": 0.4,
            "T": 0.6
        })
        print(">{0}\t{1}".format(str(pwm.consensus), name))
        for i in range(len(pwm[0])):
            print("{0:f}\t{1:f}\t{2:f}\t{3:f}".format(pwm[0][i], pwm[1][i],
                                                      pwm[2][i], pwm[3][i]))
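The script above just wires argparse around a handful of Biopython calls. A minimal sketch of those calls on their own (the motif.pfm path is hypothetical, and a single uniform pseudocount is used instead of the per-base values above):

from Bio import motifs

with open("motif.pfm") as handle:               # hypothetical 4-row PFM file
    m = motifs.read(handle, "pfm")
pwm = m.counts.normalize(pseudocounts=0.5)      # counts -> per-column probabilities
print(pwm.consensus)                            # most frequent base at each position
print(pwm["A"])                                 # probabilities of A along the motif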
Example 2
def process_data(data, data_type='counts', seq_type='dna'):
    if data_type == 'counts':
        pfm, total = count_to_pfm(data)
        ic = calc_relative_information(pfm, total)
    elif data_type in ['fasta', 'stockholm']:
        #motif, ic = read_alignment(data, data_type, seq_type)
        #pfm = motif.counts.normalize(pseudocounts=1)
        data, total = read_alignment(data, data_type, seq_type)
        pfm, _ = count_to_pfm(data)
        ic = calc_relative_information(pfm, total)
    elif data_type in [
            'alignace', 'meme', 'mast', 'transfac', 'pfm', 'sites', 'jaspar'
    ]:
        if data_type in ['jaspar', 'transfac']:
            motif = motifs.parse(open(data, 'r'), data_type.upper())[0]
            pfm = dict(motif.counts.normalize())
            total = sum(list(motif.counts.values())[0])
        else:
            motif = motifs.read(open(data, 'r'), data_type)
            try:
                pfm = motif.counts.normalize(pseudocounts=1)
            except Exception:
                pfm = motif.counts.normalize()
            total = motif.counts
        ic = calc_relative_information(pfm, total)
    return (format_matrix(pfm), format_matrix(ic))
Example 3
def load_motifs(motif_dir, pseudocounts=0.05, key='full'):
    '''
    read in motifs; motifs have to be in jaspar format as below:
    
        >MA0002.2       RUNX1
        A  [   287    234    123     57      0     87      0     17     10    131    500 ]
        C  [   496    485   1072      0     75    127      0     42    400    463    158 ]
        G  [   696    467    149      7   1872     70   1987   1848    251     81    289 ]
        T  [   521    814    656   1936     53   1716     13     93   1339   1325   1053 ]
    
    Parameters:
        motif_dir: folder that contains motif files; one file for individual motif
        pseudocounts: fraction of the average per-position total count that is added to every nucleotide when building the position weight matrix
        key: specify the way to name the motifs in the output dictionary
             options: 'full' (default), 'id'
    '''
    motif_dict = {}
    nuc = ['A', 'C', 'G', 'T']
    for mf in os.listdir(motif_dir):
        with open(motif_dir + mf) as f:
            m = motifs.read(f, 'jaspar')
            counts = np.array([m.counts[n] for n in nuc])
            avg_counts = counts.sum(axis=0).mean()
            m.pseudocounts = avg_counts*pseudocounts
            m.background = None
            if key == 'full':
                motif_dict[m.name+'$'+m.matrix_id] = m
            elif key == 'id':
                motif_dict[m.matrix_id] = m
            
    return motif_dict
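A minimal usage sketch for load_motifs, assuming a directory of single-motif JASPAR files (the directory name is hypothetical; note the function concatenates motif_dir and the file name directly, so the trailing slash matters):

motif_db = load_motifs("jaspar_pfms/", pseudocounts=0.05, key="id")
for matrix_id, m in motif_db.items():
    # the pseudocounts/background set inside load_motifs are applied when
    # the PSSM is built lazily from the counts
    print(matrix_id, m.name, len(m), m.pssm.max)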
 def __init__(self,matrixfile=None,pfmdir=None):
     annotationmatrixfile= 'SaccCereAnnotation.txt'
     matrixfile='SaccCereTFMATRIX.txt'
     annotationmatrixfile= path.join('Data', annotationmatrixfile)
     matrixfile= path.join('Data', matrixfile)
     TFmatrixFile= path.join(current_directory, matrixfile)
     annotationfile= path.join(current_directory, annotationmatrixfile)
     medline= open(annotationfile, 'r')
     medlinematrix = csv.reader(medline, dialect='excel-tab')
     self.medline_dict= {}
     for row in medlinematrix:
         if row[1] == 'medline':
             self.medline_dict[row[0]] = row[2]
     if pfmdir is None:
         pfmdir="SaccCerePFMFlatFileDir"
         pfm_folder= path.join(current_directory,'Data', pfmdir)
     else:
         pfm_folder=pfmdir
     TFmatrixReader= open(TFmatrixFile, 'r')
     TFmatrix = csv.reader(TFmatrixReader, dialect='excel-tab')
     self.motif_dict = dict()
     for row in TFmatrix:
         tf_accession= row[0]
         common_name= row[4]
         tf_pfm= "{}.{}.pfm".format(row[2],row[3])
         filename= path.join('Data',pfm_folder, tf_pfm)
         with open(filename, 'r') as handle:
             self.motif_dict[common_name] = motifs.read(handle,'pfm')
             try:
                 sgdid= idconverter.getgene(common_name).SGDID
                 self.motif_dict[sgdid]= self.motif_dict.pop(common_name)
                 self.motif_dict[sgdid].medline= self.medline_dict[tf_accession]
             except Exception:
                 self.motif_dict.pop(common_name, None)
                 continue 
def jaspar_to_pwm(output_dir="./"):
    """
    For each taxon, this function reformats all profiles from JASPAR to
    PWMScan format.
    """

    # Initialize
    # perl_script = os.path.join(os.path.dirname(os.path.realpath(__file__)),
    #     "jasparconvert.pl")

    # For each taxon...
    for taxon in taxons:

        # Initialize
        taxon_dir = os.path.join(os.path.abspath(output_dir), taxon)

        # For each profile...
        for f in os.listdir(taxon_dir):

            # Skip non-JASPAR profiles
            if not f.endswith(".jaspar"):
                continue

            # JASPAR to PWMScan
            with open(os.path.join(taxon_dir, f)) as handle:
                m = motifs.read(handle, "jaspar")
            m.pseudocounts = motifs.jaspar.calculate_pseudocounts(m)
            pwm = list(map(list, zip(*[m.pssm[nt] for nt in "ACGT"])))
            pwm_file = os.path.join(taxon_dir, f"{f[:8]}.pwm")
            if not os.path.exists(pwm_file):
                with open(pwm_file, "w") as handle:
                    for i in pwm:
                        s = " ".join(
                            ["{:7d}".format(round(j * 100)) for j in i])
                        handle.write("%s\n" % s)
Example 6
def motif2pssm(path2motif, format):
    if format == "ppm":
        ppm = np.loadtxt(path2motif)
        print "PPM:"
        print ppm
        print ""

        pssm = np.log2((ppm + 1E-9) / 0.25)
        return pssm
    else:
        with open(path2motif) as handle:
            m = motifs.read(handle, format)

            pfm = m.counts
            print "PFM:"
            print pfm

            ppm = pfm.normalize(pseudocounts=C_PSEUDOCOUNTS)
            print "PPM:"
            print ppm

            pssm = ppm.log_odds(background=C_BACKGROUND)
            np_pssm = np.zeros(shape=(4, pssm.length))
            for i, nt in enumerate(['A', 'C', 'G', 'T']):
                np_pssm[i] = pssm[nt]
            return np_pssm
Example 7
def read_motif(motif):
    """
    motif: a single .fm file within the directory.
    
    Returns a motif.jaspar.Motif (the motif matrix) from the .fm file.
    """
    return motifs.read(open(motif), "pfm")
Example 8
def readMotifFile(motifPath):
    with open(motifPath) as f:
        m = motifs.read(f, 'jaspar')
    name = m.name
    matrix = [m.pwm['A'], m.pwm['C'], m.pwm['G'], m.pwm['T']]
    matrix = np.array(matrix).T

    return (name, np.array(matrix), m.matrix_id)
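A possible call to readMotifFile (the file path is illustrative); the returned matrix is a motif-length x 4 NumPy array of probabilities in A, C, G, T column order:

name, matrix, matrix_id = readMotifFile("MA0139.1.jaspar")  # hypothetical path
print(name, matrix_id, matrix.shape)                        # shape == (motif_length, 4)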
Example 9
    def __init__(self, input_file_name, pseudocounts, precision, fpr,
                 thresholds):
        """ 
        Initializes Motif.

        Variables:
        pfm -- Position Frequency Matrix.
        pwm -- Position Weight Matrix.
        pssm -- Position Specific Scoring Matrix.
        threshold -- Motif matching threshold.
        len -- Length of the motif.
        max -- Maximum PSSM score possible.
        is_palindrome -- True if consensus is biologically palindromic.
        """

        # Initializing error handler
        err = ErrorHandler()

        # Initializing name
        self.name = ".".join(basename(input_file_name).split(".")[:-1])
        repository = input_file_name.split("/")[-2]

        # Creating PFM & PWM
        input_file = open(input_file_name, "r")
        self.pfm = motifs.read(input_file, "pfm")
        self.pwm = self.pfm.counts.normalize(pseudocounts)
        input_file.close()
        self.len = len(self.pfm)

        # Creating PSSM
        background = {'A': 0.25, 'C': 0.25, 'G': 0.25, 'T': 0.25}
        self.pssm = self.pwm.log_odds(background)
        self.pssm_list = [self.pssm[e] for e in ["A", "C", "G", "T"]]
        self.max = self.pssm.max

        # Evaluating threshold
        try:
            if pseudocounts != 0.1 or precision != 10000:
                raise ValueError()
            self.threshold = thresholds.dict[repository][self.name][fpr]
        except Exception:
            err.throw_warning(
                "DEFAULT_WARNING",
                add_msg="Parameters not matching pre-computed Fpr data. "
                "Recalculating (might take a while)..")
            try:
                distribution = self.pssm.distribution(background=background,
                                                      precision=precision)
            except Exception:
                err.throw_error("MM_PSEUDOCOUNT_0")
            self.threshold = distribution.threshold_fpr(fpr)

        # Evaluating if motif is palindromic
        if str(self.pfm.consensus) == str(
                self.pfm.consensus.reverse_complement()):
            self.is_palindrome = True
        else:
            self.is_palindrome = False
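The constructor above leans on rgt-specific helpers (ErrorHandler, the precomputed thresholds object); the Biopython part of the recipe can be sketched on its own, assuming a stand-alone PFM file and an illustrative false-positive rate:

from Bio import motifs

with open("example.pfm") as handle:                      # hypothetical PFM file
    pfm = motifs.read(handle, "pfm")
pwm = pfm.counts.normalize(pseudocounts=0.1)             # same default as above
background = {'A': 0.25, 'C': 0.25, 'G': 0.25, 'T': 0.25}
pssm = pwm.log_odds(background)
dist = pssm.distribution(background=background, precision=10000)
threshold = dist.threshold_fpr(0.0001)                   # illustrative FPR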
Example 10
 def yield_motifs(path):
     with open(path) as handle:
         for key, lines in groupby(handle, methodcaller('startswith', '>')):
             if key:
                 name = next(lines).strip().split()[-1].lower()
             else:
                 tmp = ''.join(lines)
                 mot = motifs.read(StringIO(tmp), 'pfm')
                 yield name, mot
                 yield name+'-R', true_motif_rev_complement(mot)
Example 11
def MotifToBP(motif,name):
    motifStr = '>' + name + '\n'
    motifStr += 'A ' + str(motif['PWM']['A']).replace(',','') + '\n'
    motifStr += 'C ' + str(motif['PWM']['C']).replace(',','') + '\n'
    motifStr += 'G ' + str(motif['PWM']['G']).replace(',','') + '\n'
    motifStr += 'T ' + str(motif['PWM']['T']).replace(',','') + '\n'

    handle = StringIO(motifStr)
    motif = motifs.read(handle, 'jaspar')
    return motif
def pearsonpwm(pwm1, pwm2):
    """Compute the Pearson correlation between two PWMs."""
    cisbpmat = motifs.read(open(pwm1), "pfm")
    cisbpmat.pseudocounts = 3.0
    pwmnp1 = np.loadtxt(pwm1, skiprows=1)
    tf1 = cisbpmat.pssm
    cisbpmat2 = motifs.read(open(pwm2), "pfm")
    cisbpmat2.pseudocounts = 3.0
    pwmnp2 = np.loadtxt(pwm2, skiprows=1)
    tf2 = cisbpmat2.pssm
    distance, offset = tf2.dist_pearson(tf1)
    return ([pwm1, 1 - distance, math.fabs(offset), len(np.transpose(pwmnp1))],
            [pwm2, 1 - distance, math.fabs(offset), len(np.transpose(pwmnp2))])
Example 13
    def pfm(self):
        s = ''
        for l in ['a', 'c', 'g', 't']:
            for n in self.PFM[l]:
                s += str(n) + ' '
            s += '\n'

        sio = StringIO(s)
        m = motifs.read(sio, 'pfm')

        return m
Example 14
 def __init__(self, input_file_name):
   input_file = open(input_file_name,"r")
   self.pfm = motifs.read(input_file, "pfm")
   self.pwm = self.pfm.counts.normalize(0.0001)
   input_file.close()
   self.len = len(self.pfm)
   background = {'A':0.25,'C':0.25,'G':0.25,'T':0.25}
   self.pssm = self.pwm.log_odds(background)
   self.pssm_list = [self.pssm[e] for e in ["A","C","G","T"]]
   self.max = self.pssm.max
   self.min = self.pssm.min
Example 15
def motif_reader(path_name):
    motif_list = []
    for filename in os.listdir(path_name):
        print(filename)
        with open(path_name + filename) as handle:
            word = motifs.read(handle, "pfm")
        motif = str(word.consensus)
        print(motif)
        motif_list.append(motif)

    return motif_list
Example 16
def read_jaspar_motif_file(motifPath, pseudocount):
    '''
    reads jaspar motif file
    inputs: path to a jaspar motif file
    outputs: a tuple representing a motif
    '''
    with open(motifPath) as f:
        m = motifs.read(f, 'jaspar')
        default_pseudocount = motifs.jaspar.calculate_pseudocounts(m)
        scaled_pseudocount = pseudocount/0.01 * default_pseudocount['A']
        m.pseudocounts = int(scaled_pseudocount)
    return (m.name, m)
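A hedged usage sketch for read_jaspar_motif_file (file name and pseudocount value are illustrative); the pseudocounts assigned inside the function are picked up when the PSSM is later derived from the motif:

name, m = read_jaspar_motif_file("MA0004.1.jaspar", pseudocount=0.01)
pssm = m.pssm          # built with the pseudocounts set inside read_jaspar_motif_file
print(name, m.matrix_id, pssm.max)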
Example 17
    def __init__(self, input_file_name, pseudocounts, precision, fpr, thresholds):
        """ 
        Initializes Motif.

        Variables:
        pfm -- Position Frequency Matrix.
        pwm -- Position Weight Matrix.
        pssm -- Position Specific Scoring Matrix.
        threshold -- Motif matching threshold.
        len -- Length of the motif.
        max -- Maximum PSSM score possible.
        is_palindrome -- True if consensus is biologically palindromic.
        """

        # Initializing error handler
        err = ErrorHandler()
 
        # Initializing name
        self.name = ".".join(basename(input_file_name).split(".")[:-1])
        repository = input_file_name.split("/")[-2]

        # Creating PFM & PWM
        input_file = open(input_file_name, "r")
        self.pfm = motifs.read(input_file, "pfm")
        self.pwm = self.pfm.counts.normalize(pseudocounts)
        input_file.close()
        self.len = len(self.pfm)

        # Creating PSSM
        background = {'A': 0.25, 'C': 0.25, 'G': 0.25, 'T': 0.25}
        self.pssm = self.pwm.log_odds(background)
        self.pssm_list = [self.pssm[e] for e in ["A", "C", "G", "T"]]
        self.max = self.pssm.max

        # Evaluating threshold
        try:
            if pseudocounts != 0.1 or precision != 10000:
                raise ValueError()
            self.threshold = thresholds.dict[repository][self.name][fpr]
        except Exception:
            err.throw_warning("DEFAULT_WARNING", add_msg="Parameters not matching pre-computed Fpr data. "
                                                         "Recalculating (might take a while)..")
            try:
                distribution = self.pssm.distribution(background=background, precision=precision)
            except Exception:
                err.throw_error("MM_PSEUDOCOUNT_0")
            self.threshold = distribution.threshold_fpr(fpr)

        # Evaluating if motif is palindromic
        if str(self.pfm.consensus) == str(self.pfm.consensus.reverse_complement()):
            self.is_palindrome = True
        else:
            self.is_palindrome = False
def read_pfm(filename):
    """Facilitates readings a Bio.motif object with set parameters. The output is a Bio.motif object that can quickly be trasnformed to a 
	PWM, or a PSSM using the associated arguments (.pwm , .pssm). 

	The expected PFM file format can be found in 'docs/ex_pfmfile.txt' 
	Additional information can be found on the Biopython motifs page"""

    with open(filename, "r") as handle:
        motif = motifs.read(handle, "pfm")
    motif.pseudocounts = .25
    motif.background = {'A': 0.3, 'C': 0.2, 'G': 0.2, 'T': 0.3}

    return motif
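A short sketch of how read_pfm might be used, following its docstring (the scoring sequence is made up):

from Bio.Seq import Seq

motif = read_pfm("docs/ex_pfmfile.txt")
pwm = motif.pwm       # probabilities, with the 0.25 pseudocounts applied
pssm = motif.pssm     # log-odds scores against the AT-rich background set above
for position, score in pssm.search(Seq("ACGTGTGCACGTTTTT"), threshold=3.0):
    print(position, score)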
Example 19
  def __init__(self, input_file_name):

    # Standardize input file to be only the nucleotide frequencies

    input_file = open(input_file_name,"r")
    self.pfm = motifs.read(input_file, "pfm")
    self.pwm = self.pfm.counts.normalize(0.1)
    input_file.close()
    self.len = len(self.pfm)
    background = {'A':0.25,'C':0.25,'G':0.25,'T':0.25}
    self.pssm = self.pwm.log_odds(background)
    self.pssm_list = [self.pssm[e] for e in ["A","C","G","T"]]
    self.max = self.pssm.max
    self.min = self.pssm.min
Example 20
    def get_motifs(self, transcription_factor):
        motifs = []
        # Try Jaspar
        JASPAR_dir = '../data/preprocess/JASPAR/'
        for f in os.listdir(JASPAR_dir):
            if transcription_factor.upper() in f.upper():
                with open(os.path.join(JASPAR_dir, f)) as handle:
                    motif = read(handle, 'pfm')
                    print "motif found in JASPAR", f
                    motifs.append(motif)

        # Try SELEX
        SELEX_dir = '../data/preprocess/SELEX_PWMs_for_Ensembl_1511_representatives/'
        for f in os.listdir(SELEX_dir):
            if f.upper().startswith(transcription_factor.upper()):
                with open(os.path.join(SELEX_dir, f)) as handle:
                    motif = read(handle, 'pfm')
                    print "motif found in SELEX", f
                    motifs.append(motif)

        # Try Factorbook

        return motifs
Example 21
def read_motif(motif_filename, verb=0):
    """Reads a motif as a collection of sites from a file

    Reads a motif and uses the biopython.motifs class to store it. If the motif
    is in FASTA format, it uses the parser directly. Otherwise, it loads and 
    reads a concatenated text file and creates the motif.
    
    File type is determined by extension: 
        * FASTA for .fas, .fasta and .fa files
        * One-per-line text file otherwise
    
    Input:
        * The motif filename; required
        * Verbose mode (default=0)
    Returns:
        * the read motif
    """

    #create file handler for reading file
    try:
        motif_file = open(motif_filename, "r")
    except (IOError, OSError) as file_open_exception:
        print "*** The file name provided:", motif_filename, " does not exist"
        print "*** Error: ", file_open_exception.errno, " - ",\
                             file_open_exception.strerror
        sys.exit()

    #Figure out file type based on extension, read sites and create motif
    extension = motif_filename.split('.')[-1]

    if extension not in ['fas', 'fasta', 'fa']:
        if verb:
            print('Reading motif... raw text sequence mode assumed '
                  '(one site per line, not FASTA parsing)')

        sites = []
        for line in motif_file:
            sites.append(Seq(line.rstrip('\n\r'), IUPAC.unambiguous_dna))
        mot = motifs.create(sites)
        if verb: print(mot.degenerate_consensus)
    else:
        if verb: print('Reading motif... attempting to parse FASTA file')
        mot = motifs.read(motif_file, 'sites')

    motif_file.close()
    return mot
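A sketch of calling read_motif on the two kinds of input its docstring describes (file names are hypothetical):

mot = read_motif("sites.txt", verb=1)        # one site per line, raw text mode
mot_fa = read_motif("sites.fasta", verb=1)   # parsed with the JASPAR "sites" reader
print(mot.degenerate_consensus, len(mot))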
    def get_observed_result(self, protein):
        """ Find the DNA binding motif for a given protein

        Args:
            protein (:obj:`models.ProteinSubunit`): protein subunit to find data for

        Returns:
            :obj:`list` of :obj:`data_model.Observable`: list of observables

        """
        versions = self.get_DNA_by_protein(protein)

        index = 0
        observed_result = []
        for motif in versions:
            binding_matrix = []
            for position in motif.all():
                binding_matrix.append([
                    position.frequency_a, position.frequency_c,
                    position.frequency_g, position.frequency_t
                ])
            binding_matrix = map(list, zip(*binding_matrix))
            self.cache_dirname = tempfile.mkdtemp()
            with open(self.cache_dirname + '/data.pfm', 'w') as pfm:
                for items in binding_matrix:
                    writer = csv.writer(pfm, delimiter='\t')
                    writer.writerow(items)

            m = motifs.read(open(self.cache_dirname + '/data.pfm'), 'pfm')
            dna_specie = data_model.DnaSpecie(binding_matrix=m.counts,
                                              sequence=str(m.counts.consensus))

            metadata = self.metadata_dump(motif.all()[0].dataset)
            observed_result.append(
                data_model.ObservedSpecie(specie=dna_specie,
                                          metadata=metadata))

            for position in motif.all():
                observed_result[index].specie.cross_references = data_model.Resource(namespace ='pubmed',\
                id = position.dataset._metadata.resource[0]._id),
                break

            shutil.rmtree(self.cache_dirname)
            index += 1

        return observed_result
    def get_protein_by_DNA_sequence(self,
                                    sequence,
                                    select=models.ProteinSubunit):
        """

        NOTE: Currently there are no Gene objects in common schema models. When added this
        query will be updated to input models.Gene and output data_model.ProteinSpecie


        Args:
            sequence (:obj:`data_model.DnaSpecie.sequence`): sequence of DNA segment

        Returns:
            :obj:`list` of :obj:`tuple`: Returns the query for a protein, sequence position, and score

        """
        #TODO: Make more efficient. Add gene location filter

        ans = []

        size = len(sequence)
        all_matricies = self.data_source.session.query(
            models.DNABindingDataset).all()
        all_sequences = []
        for matricies in all_matricies:
            if len(matricies.dna_binding_data) == size:
                binding_matrix = []
                for position in matricies.dna_binding_data:
                    binding_matrix.append([
                        position.frequency_a, position.frequency_c,
                        position.frequency_g, position.frequency_t
                    ])
                binding_matrix = map(list, zip(*binding_matrix))
                self.cache_dirname = tempfile.mkdtemp()
                with open(self.cache_dirname + '/data.pfm', 'w') as pfm:
                    for items in binding_matrix:
                        writer = csv.writer(pfm, delimiter='\t')
                        writer.writerow(items)
                m = motifs.read(open(self.cache_dirname + '/data.pfm'), 'pfm')
                my_seq = Seq(sequence, IUPAC.unambiguous_dna)
                ##TODO: Include selective threshold
                # distribution = m.pssm.distribution().threshold_paster()
                # print distribution.threshold_paster()
                for position, score in m.pssm.search(my_seq, threshold=2):
                    ans.append((matricies.subunit, position, score))
        return ans
Example 25
def _get_profiles(profiles_dir, latest=False, profile=[], taxon=taxons):

    # Initialize
    profiles = []
    profiles_dict = {}

    # For each taxon...
    for t in taxon:

        # Initialize
        taxon_dir = os.path.join(os.path.abspath(profiles_dir), t)

        # For each profile...
        for profile_file in sorted(os.listdir(taxon_dir), reverse=True):

            # Skip wrong profiles
            if len(profile) > 0:
                if profile_file[:8] not in profile:
                    continue

            # Load profile
            with open(os.path.join(taxon_dir, profile_file)) as f:
                p = motifs.read(f, "jaspar")

            # Initialize key
            key = profile_file[:6]
            profiles_dict.setdefault(key, [])

            # Skip profile if only using the latest version of each profile
            if latest:
                if len(profiles_dict[key]) == 1:
                    continue

            # Add profile
            profiles_dict[key].append(p)

    # Create list of profiles
    for value_list in profiles_dict.values():
        for p in value_list:
            profiles.append(p)

    return (profiles)
Example 26
    def parse_cisBP_pwm(self):
        pwms_original_dir = os.path.join(self.cisBP_rna_dir, "pwms_all_motifs")
        pwms_jaspar_dir = os.path.join(self.cisBP_rna_dir, "pwms_all_motifs",
                                       "jaspar")
        pwms_info = os.path.join(self.cisBP_rna_dir,
                                 "RBP_information_all_motifs.txt")
        df = pd.read_csv(pwms_info, sep='\t', header=0)

        if not os.path.isdir(pwms_jaspar_dir):
            os.mkdir(pwms_jaspar_dir)

        if len(os.listdir(pwms_jaspar_dir)) == 0:
            self.create_jaspar_files(pwms_original_dir, pwms_jaspar_dir)

        for file in os.listdir(pwms_jaspar_dir):
            with open(os.path.join(pwms_jaspar_dir, file)) as handle:
                motif_ID = os.path.splitext(file)[0]
                pwm = motifs.read(handle, "pfm")
                if motif_ID not in self.pwms_dict:
                    self.pwms_dict[motif_ID] = pwm
Example 27
    def from_fasta(fasta, motifid, name=None):
        """
		Create motif from fasta.
		Will use capital letters as motif sites (see JASPAR sites format).

		Parameters:
			fasta (string): Path to fasta file.

			motifid (string): Unique id of the motif.

			name (string): Name of the motif. Defaults to 'None'.

		Returns:
			OneMotif object
		"""
        with open(fasta) as handle:
            motif = motifs.read(handle, "sites")

        return OneMotif(
            motifid=motifid,
            counts=[motif.counts[base] for base in ["A", "C", "G", "T"]],
            name=name)
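A minimal sketch for from_fasta, assuming it is exposed as a static method of OneMotif (the signature has no self) and that the sites file marks motif positions with upper-case letters as the docstring says; the file name and ids are illustrative:

one_motif = OneMotif.from_fasta("binding_sites.fa", motifid="M0001", name="ExampleTF")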
def get_names(output_dir="./"):
    """
    This function extracts the name of each JASPAR profile and saves them
    in a JSON file.
    """

    # Initialize
    names = {}

    # Skip if already done
    json_file = os.path.join(output_dir, "names.json")
    if not os.path.exists(json_file):

        # For each taxon...
        for taxon in taxons:

            # Initialize
            taxon_dir = os.path.join(os.path.abspath(output_dir), taxon)

            # For each profile...
            for f in os.listdir(taxon_dir):

                # Skip non-JASPAR profiles
                if not f.endswith(".jaspar"):
                    continue

                # Get profile name
                with open(os.path.join(taxon_dir, f)) as handle:
                    m = motifs.read(handle, "jaspar")
                if m.name.startswith(m.matrix_id):
                    name = m.name[len(m.matrix_id) + 1:]
                else:
                    name = m.name
                names.setdefault(m.matrix_id, name)

        # Write JSON
        with open(json_file, "w") as handle:
            json.dump(names, handle, sort_keys=True, indent=4)
Example 29
def line_plot(arguments):
    (mpbs_name, num_fp, signal_1, signal_2, factor1, factor2, condition1, condition2,
     pwm_dict, output_location, window_size, standardize) = arguments

    mpbs_name = mpbs_name.replace("(", "_")
    mpbs_name = mpbs_name.replace(")", "")
    mean_signal_1 = (signal_1 / num_fp) / factor1
    mean_signal_2 = (signal_2 / num_fp) / factor2

    # output signal
    signal_fname = os.path.join(output_location, "{}.txt".format(mpbs_name))
    with open(signal_fname, "w") as f:
        f.write(condition1 + "\t" + condition2 + "\n")
        for i in range(window_size):
            f.write(str(mean_signal_1[i]) + "\t" + str(mean_signal_2[i]) + "\n")

    if standardize:
        mean_signal_1, mean_signal_2 = standard(mean_signal_1, mean_signal_2)

    # Output PWM and create logo
    pwm_fname = os.path.join(output_location, "{}.pwm".format(mpbs_name))
    pwm_file = open(pwm_fname, "w")
    for e in ["A", "C", "G", "T"]:
        pwm_file.write(" ".join([str(int(f)) for f in pwm_dict[e]]) + "\n")
    pwm_file.close()

    logo_fname = os.path.join(output_location, "{}.logo.eps".format(mpbs_name))
    pwm = motifs.read(open(pwm_fname), "pfm")
    pwm.weblogo(logo_fname, format="eps", stack_width="large", stacks_per_line=str(window_size),
                color_scheme="color_classic", unit_name="", show_errorbars=False, logo_title="",
                show_xaxis=False, xaxis_label="", show_yaxis=False, yaxis_label="",
                show_fineprint=False, show_ends=False)

    start = -(window_size // 2)
    end = (window_size // 2) - 1
    x = np.linspace(start, end, num=window_size)

    plt.close('all')
    fig, ax = plt.subplots()
    ax.plot(x, mean_signal_2, color='red', label=condition2)
    ax.plot(x, mean_signal_1, color='blue', label=condition1)
    ax.text(0.15, 0.9, 'n = {}'.format(num_fp), verticalalignment='bottom', horizontalalignment='right',
            transform=ax.transAxes, fontweight='bold')

    ax.xaxis.set_ticks_position('bottom')
    ax.yaxis.set_ticks_position('left')
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['left'].set_position(('outward', 15))
    ax.tick_params(direction='out')
    ax.set_xticks([start, 0, end])
    ax.set_xticklabels([str(start), 0, str(end)])
    min_signal = min(min(mean_signal_1), min(mean_signal_2))
    max_signal = max(max(mean_signal_1), max(mean_signal_2))
    ax.set_yticks([min_signal, max_signal])
    ax.set_yticklabels([str(round(min_signal, 2)), str(round(max_signal, 2))], rotation=90)

    ax.set_title(mpbs_name, fontweight='bold')
    ax.set_xlim(start, end)
    ax.set_ylim([min_signal, max_signal])
    ax.legend(loc="upper right", frameon=False)
    ax.spines['bottom'].set_position(('outward', 70))

    figure_name = os.path.join(output_location, "{}.line.eps".format(mpbs_name))
    fig.tight_layout()
    fig.savefig(figure_name, format="eps", dpi=300)

    # Creating canvas and printing eps / pdf with merged results
    output_fname = os.path.join(output_location, "{}.eps".format(mpbs_name))

    c = pyx.canvas.canvas()
    c.insert(pyx.epsfile.epsfile(0, 0, figure_name, scale=1.0))
    c.insert(pyx.epsfile.epsfile(0.45, 0.8, logo_fname, width=16.5, height=3))
    c.writeEPSfile(output_fname)
    os.system(" ".join(["epstopdf", output_fname]))

    os.remove(figure_name)
    os.remove(logo_fname)
    os.remove(output_fname)
    os.remove(pwm_fname)
Example 30
output_logos_dir = path.join(curr_dir, "logos")
if not path.exists(output_logos_dir):
    mkdir(output_logos_dir)
for dir_name, subdir_list, file_list in walk(path.join(curr_dir, "motifs")):
    base_name = path.basename(dir_name)
    if ((options.hocomoco and base_name == "hocomoco") or
        (options.jaspar_vertebrates and base_name == "jaspar_vertebrates")
            or (options.uniprobe_primary and base_name == "uniprobe_primary")
            or
        (options.uniprobe_secondary and base_name == "uniprobe_secondary")):
        output_dir = path.join(curr_dir, "logos", base_name)
        if not path.exists(output_dir):
            mkdir(output_dir)
        else:
            continue
        print("Creating logos for " + base_name)
        for pwm_file_name in file_list:
            pwm_full_file_name = path.join(dir_name, pwm_file_name)
            if pwm_file_name.split(".")[-1] != "pwm":
                continue
            pwm_file = open(pwm_full_file_name, "r")
            logo_file_name = path.join(
                output_dir, ".".join(pwm_file_name.split(".")[:-1]) + ".png")
            pwm = motifs.read(pwm_file, "pfm")
            pwm.weblogo(logo_file_name,
                        format="png_print",
                        stack_width="medium",
                        color_scheme="color_classic")
            pwm_file.close()
        print("OK")
Example 31
def estimate_bias_pwm(args):
    # Parameters
    max_duplicates = 100

    # Initializing bam and fasta
    bamFile = Samfile(args.reads_file, "rb")
    genome_data = GenomeData(args.organism)
    fastaFile = Fastafile(genome_data.get_genome())
    regions = GenomicRegionSet("regions")
    regions.read(args.regions_file)

    obs_f_pwm_dict = dict([("A", [0.0] * args.k_nb), ("C", [0.0] * args.k_nb),
                           ("G", [0.0] * args.k_nb), ("T", [0.0] * args.k_nb), ("N", [0.0] * args.k_nb)])
    exp_f_pwm_dict = dict([("A", [0.0] * args.k_nb), ("C", [0.0] * args.k_nb),
                           ("G", [0.0] * args.k_nb), ("T", [0.0] * args.k_nb), ("N", [0.0] * args.k_nb)])
    obs_r_pwm_dict = dict([("A", [0.0] * args.k_nb), ("C", [0.0] * args.k_nb),
                           ("G", [0.0] * args.k_nb), ("T", [0.0] * args.k_nb), ("N", [0.0] * args.k_nb)])
    exp_r_pwm_dict = dict([("A", [0.0] * args.k_nb), ("C", [0.0] * args.k_nb),
                           ("G", [0.0] * args.k_nb), ("T", [0.0] * args.k_nb), ("N", [0.0] * args.k_nb)])

    # Iterating on HS regions
    for region in regions:
        # Initialization
        prev_pos = -1
        true_counter = 0

        # Evaluating observed frequencies
        # Fetching reads
        for r in bamFile.fetch(region.chrom, region.initial, region.final):
            # Calculating positions
            if not r.is_reverse:
                cut_site = r.pos + args.forward_shift - 1
                p1 = cut_site - int(floor(args.k_nb / 2))
            else:
                cut_site = r.aend + args.reverse_shift + 1
                p1 = cut_site - int(floor(args.k_nb / 2))
            p2 = p1 + args.k_nb

            # Verifying PCR artifacts
            if p1 == prev_pos:
                true_counter += 1
            else:
                prev_pos = p1
                true_counter = 0
            if true_counter > max_duplicates: continue

            # Fetching k-mer
            try:
                currStr = str(fastaFile.fetch(region.chrom, p1, p2)).upper()
            except Exception:
                continue
            if r.is_reverse: currStr = AuxiliaryFunctions.revcomp(currStr)

            # Counting k-mer in dictionary
            if not r.is_reverse:
                for i in range(0, len(currStr)):
                    obs_f_pwm_dict[currStr[i]][i] += 1
            else:
                for i in range(0, len(currStr)):
                    obs_r_pwm_dict[currStr[i]][i] += 1

        # Evaluating expected frequencies
        # Fetching whole sequence
        try:
            currStr = str(fastaFile.fetch(region.chrom, region.initial, region.final)).upper()
        except Exception:
            continue

        # Iterating on each sequence position
        s = None
        for i in range(0, len(currStr) - args.k_nb):
            # Counting k-mer in dictionary
            s = currStr[i:i + args.k_nb]
            for i in range(0, len(s)):
                exp_f_pwm_dict[s[i]][i] += 1

            # Counting k-mer in dictionary for reverse complement
            s = AuxiliaryFunctions.revcomp(s)
            for i in range(0, len(s)):
                exp_r_pwm_dict[s[i]][i] += 1

    # Closing files
    bamFile.close()
    fastaFile.close()

    # Output pwms
    os.system("mkdir -p " + os.path.join(args.output_location, "pfm"))
    pwm_dict_list = [obs_f_pwm_dict, obs_r_pwm_dict, exp_f_pwm_dict, exp_r_pwm_dict]
    pwm_file_list = []
    pwm_obs_f = os.path.join(args.output_location, "pfm", "obs_{}_f.pfm".format(str(args.k_nb)))
    pwm_obs_r = os.path.join(args.output_location, "pfm", "obs_{}_r.pfm".format(str(args.k_nb)))
    pwm_exp_f = os.path.join(args.output_location, "pfm", "exp_{}_f.pfm".format(str(args.k_nb)))
    pwm_exp_r = os.path.join(args.output_location, "pfm", "exp_{}_r.pfm".format(str(args.k_nb)))

    pwm_file_list.append(pwm_obs_f)
    pwm_file_list.append(pwm_obs_r)
    pwm_file_list.append(pwm_exp_f)
    pwm_file_list.append(pwm_exp_r)

    for i in range(len(pwm_dict_list)):
        with open(pwm_file_list[i], "w") as pwm_file:
            for e in ["A", "C", "G", "T"]:
                pwm_file.write(" ".join([str(int(f)) for f in pwm_dict_list[i][e]]) + "\n")

    motif_obs_f = motifs.read(open(pwm_obs_f), "pfm")
    motif_obs_r = motifs.read(open(pwm_obs_r), "pfm")
    motif_exp_f = motifs.read(open(pwm_exp_f), "pfm")
    motif_exp_r = motifs.read(open(pwm_exp_r), "pfm")

    # Output logos
    os.system("mkdir -p " + os.path.join(args.output_location, "logo"))
    logo_obs_f = os.path.join(args.output_location, "logo", "obs_{}_f.pdf".format(str(args.k_nb)))
    logo_obs_r = os.path.join(args.output_location, "logo", "obs_{}_r.pdf".format(str(args.k_nb)))
    logo_exp_f = os.path.join(args.output_location, "logo", "exp_{}_f.pdf".format(str(args.k_nb)))
    logo_exp_r = os.path.join(args.output_location, "logo", "exp_{}_r.pdf".format(str(args.k_nb)))

    motif_obs_f.weblogo(logo_obs_f, format="pdf", stack_width="large", color_scheme="color_classic",
                        yaxis_scale=0.2, yaxis_tic_interval=0.1)
    motif_obs_r.weblogo(logo_obs_r, format="pdf", stack_width="large", color_scheme="color_classic",
                        yaxis_scale=0.2, yaxis_tic_interval=0.1)
    motif_exp_f.weblogo(logo_exp_f, format="pdf", stack_width="large", color_scheme="color_classic",
                        yaxis_scale=0.02, yaxis_tic_interval=0.01)
    motif_exp_r.weblogo(logo_exp_r, format="pdf", stack_width="large", color_scheme="color_classic",
                        yaxis_scale=0.02, yaxis_tic_interval=0.01)

    # Creating bias dictionary
    alphabet = ["A", "C", "G", "T"]
    k_mer_comb = ["".join(e) for e in product(alphabet, repeat=args.k_nb)]
    bias_table_F = dict([(e, 0.0) for e in k_mer_comb])
    bias_table_R = dict([(e, 0.0) for e in k_mer_comb])
    for k_mer in k_mer_comb:
        obs_f = get_ppm_score(k_mer, motif_obs_f.pwm, args.k_nb)
        exp_f = get_ppm_score(k_mer, motif_exp_f.pwm, args.k_nb)
        bias_table_F[k_mer] = round(obs_f / exp_f, 6)
        obs_r = get_ppm_score(k_mer, motif_obs_r.pwm, args.k_nb)
        exp_r = get_ppm_score(k_mer, motif_exp_r.pwm, args.k_nb)
        bias_table_R[k_mer] = round(obs_r / exp_r, 6)

    write_table(args.output_location, args.output_prefix, [bias_table_F, bias_table_R])
Example 32
  #threshold= math.pow(10,-float(sys.argv[2])) # P-val, 5 means -log(n,10)
  out = open(out_file, 'w')
  out.write("Sequence\tMotif\tMotif_consensus\tHit_Seq\tHit_position\tHit_score\tThreshold\n")

  # Specify if you only want to include certain pwm's in the directory
  if include == "all":
    include_if = os.listdir(f)
  else:
    include_if = open(include, 'r').read().splitlines()


  if f[-4:] == ".pwm":
    #Make PSSM for mapping
    motif_name = f.strip().split("/")[-1]
    m = motifs.read(open(f), "pfm")
    motif_len = len(m)
    pssm, consensus, threshold = make_pssm(m, ATbias, GCbias, p)
    hits = mapping(seq_file, pssm, threshold, consensus, motif_name, motif_len)

  else:
    for motif_file in os.listdir(f):
      if motif_file in include_if:
        motif_name = motif_file.strip().split("/")[-1]
        print(motif_name)
        path = f + motif_file
        m = motifs.read(open(path), "pfm")
        motif_len = len(m)
        pssm, consensus, threshold = make_pssm(m, ATbias, GCbias, p)
        hits = mapping(seq_file, pssm, threshold, consensus, motif_name, motif_len)
      else:
        continue  # body truncated in the original excerpt
Example 33
    def line(self):
        signal = GenomicSignal(self.bam_file)
        signal.load_sg_coefs(slope_window_size=9)
        bias_table = BiasTable()
        bias_table_list = self.bias_table.split(",")
        table = bias_table.load_table(table_file_name_F=bias_table_list[0],
                                      table_file_name_R=bias_table_list[1])
        genome_data = GenomeData(self.organism)
        fasta = Fastafile(genome_data.get_genome())
        pwm_dict = dict([("A", [0.0] * self.window_size), ("C", [0.0] * self.window_size),
                        ("G", [0.0] * self.window_size), ("T", [0.0] * self.window_size),
                        ("N", [0.0] * self.window_size)])


        mean_raw_signal = np.zeros(self.window_size)
        mean_bc_signal = np.zeros(self.window_size)
        mean_raw_signal_f = np.zeros(self.window_size)
        mean_bc_signal_f = np.zeros(self.window_size)
        mean_raw_signal_r = np.zeros(self.window_size)
        mean_bc_signal_r = np.zeros(self.window_size)

        mean_bias_signal_f = np.zeros(self.window_size)
        mean_bias_signal_r = np.zeros(self.window_size)
        num_sites = 0

        mpbs_regions = GenomicRegionSet("Motif Predicted Binding Sites")
        mpbs_regions.read_bed(self.motif_file)

        total_nc_signal = 0
        total_nl_signal = 0
        total_nr_signal = 0

        for region in mpbs_regions:
            if str(region.name).split(":")[-1] == "Y":
                num_sites += 1
                # Extend by 50 bp
                mid = (region.initial + region.final) // 2
                p1 = mid - (self.window_size // 2)
                p2 = mid + (self.window_size // 2)

                if not self.strands_specific:
                    # Fetch raw signal
                    raw_signal, _ = signal.get_signal(ref=region.chrom, start=p1, end=p2,
                                                  downstream_ext=self.atac_downstream_ext,
                                                  upstream_ext=self.atac_upstream_ext,
                                                  forward_shift=self.atac_forward_shift,
                                                  reverse_shift=self.atac_reverse_shift,
                                                  genome_file_name=genome_data.get_genome())

                    mean_raw_signal = np.add(mean_raw_signal, raw_signal)

                    # Fetch bias correction signal
                    bc_signal, _ = signal.get_signal(ref=region.chrom, start=p1, end=p2,
                                                     bias_table=table,
                                                     downstream_ext=self.atac_downstream_ext,
                                                     upstream_ext=self.atac_upstream_ext,
                                                     forward_shift=self.atac_forward_shift,
                                                     reverse_shift=self.atac_reverse_shift,
                                                     genome_file_name=genome_data.get_genome())

                    mean_bc_signal = np.add(mean_bc_signal, bc_signal)
                else:
                    raw_signal_f, _, raw_signal_r, _ =  signal.get_signal_per_strand(ref=region.chrom, start=p1, end=p2,
                                                                        downstream_ext=self.atac_downstream_ext,
                                                                        upstream_ext=self.atac_upstream_ext,
                                                                        forward_shift=self.atac_forward_shift,
                                                                        reverse_shift=self.atac_reverse_shift,
                                                                        genome_file_name=genome_data.get_genome())
                    mean_raw_signal_f = np.add(mean_raw_signal_f, raw_signal_f)
                    mean_raw_signal_r = np.add(mean_raw_signal_r, raw_signal_r)

                    bc_signal_f, _, bc_signal_r, _ = signal.get_signal_per_strand(ref=region.chrom, start=p1, end=p2,
                                                                                  bias_table=table,
                                                                                  downstream_ext=self.atac_downstream_ext,
                                                                                  upstream_ext=self.atac_upstream_ext,
                                                                                  forward_shift=self.atac_forward_shift,
                                                                                  reverse_shift=self.atac_reverse_shift,
                                                                                  genome_file_name=genome_data.get_genome())
                    mean_bc_signal_f = np.add(mean_bc_signal_f, bc_signal_f)
                    mean_bc_signal_r = np.add(mean_bc_signal_r, bc_signal_r)

                # Update pwm
                aux_plus = 1
                dna_seq = str(fasta.fetch(region.chrom, p1, p2)).upper()
                if (region.final - region.initial) % 2 == 0:
                    aux_plus = 0
                dna_seq_rev = AuxiliaryFunctions.revcomp(str(fasta.fetch(region.chrom,
                                                                         p1 + aux_plus, p2 + aux_plus)).upper())
                if region.orientation == "+":
                    for i in range(0, len(dna_seq)):
                        pwm_dict[dna_seq[i]][i] += 1
                elif region.orientation == "-":
                    for i in range(0, len(dna_seq_rev)):
                        pwm_dict[dna_seq_rev[i]][i] += 1

                # Create bias signal
                bias_table_f = table[0]
                bias_table_r = table[1]
                self.k_nb = len(list(bias_table_f.keys())[0])
                bias_signal_f = []
                bias_signal_r = []
                p1_wk = p1 - int(self.k_nb / 2)
                p2_wk = p2 + int(self.k_nb / 2)
                dna_seq = str(fasta.fetch(region.chrom, p1_wk, p2_wk - 1)).upper()
                dna_seq_rev = AuxiliaryFunctions.revcomp(str(fasta.fetch(region.chrom, p1_wk, p2_wk + 1)).upper())
                for i in range(int(self.k_nb / 2), len(dna_seq) - int(self.k_nb / 2) + 1):
                    fseq = dna_seq[i - int(self.k_nb / 2):i + int(self.k_nb / 2)]
                    rseq = dna_seq_rev[len(dna_seq) - int(self.k_nb / 2) - i:len(dna_seq) + int(self.k_nb / 2) - i]
                    try:
                        bias_signal_f.append(bias_table_f[fseq])
                    except Exception:
                        bias_signal_f.append(1)
                    try:
                        bias_signal_r.append(bias_table_r[rseq])
                    except Exception:
                        bias_signal_r.append(1)

                mean_bias_signal_f = np.add(mean_bias_signal_f, np.array(bias_signal_f))
                mean_bias_signal_r = np.add(mean_bias_signal_r, np.array(bias_signal_r))

                if self.protection_score:
                    # signal in the center of the MPBS
                    p1 = region.initial
                    p2 = region.final
                    nc_signal, _ = signal.get_signal(ref=region.chrom, start=p1, end=p2,
                                                     bias_table=table,
                                                     downstream_ext=self.atac_downstream_ext,
                                                     upstream_ext=self.atac_upstream_ext,
                                                     forward_shift=self.atac_forward_shift,
                                                     reverse_shift=self.atac_reverse_shift,
                                                     genome_file_name=genome_data.get_genome())
                    total_nc_signal += sum(nc_signal)
                    p1 = region.final
                    p2 = 2 * region.final - region.initial
                    nr_signal, _ = signal.get_signal(ref=region.chrom, start=p1, end=p2,
                                                     bias_table=table,
                                                     downstream_ext=self.atac_downstream_ext,
                                                     upstream_ext=self.atac_upstream_ext,
                                                     forward_shift=self.atac_forward_shift,
                                                     reverse_shift=self.atac_reverse_shift,
                                                     genome_file_name=genome_data.get_genome())
                    total_nr_signal += sum(nr_signal)
                    p1 = 2 * region.initial - region.final
                    p2 = region.final
                    nl_signal, _ = signal.get_signal(ref=region.chrom, start=p1, end=p2,
                                                     bias_table=table,
                                                     downstream_ext=self.atac_downstream_ext,
                                                     upstream_ext=self.atac_upstream_ext,
                                                     forward_shift=self.atac_forward_shift,
                                                     reverse_shift=self.atac_reverse_shift,
                                                     genome_file_name=genome_data.get_genome())
                    total_nl_signal += sum(nl_signal)


        mean_raw_signal = mean_raw_signal / num_sites
        mean_bc_signal = mean_bc_signal / num_sites

        mean_raw_signal_f = mean_raw_signal_f / num_sites
        mean_raw_signal_r = mean_raw_signal_r / num_sites
        mean_bc_signal_f = mean_bc_signal_f / num_sites
        mean_bc_signal_r = mean_bc_signal_r / num_sites

        mean_bias_signal_f = mean_bias_signal_f / num_sites
        mean_bias_signal_r = mean_bias_signal_r / num_sites

        protection_score = (total_nl_signal + total_nr_signal - 2 * total_nc_signal) / (2 * num_sites)

        # Output PWM and create logo
        pwm_fname = os.path.join(self.output_loc, "{}.pwm".format(self.motif_name))
        pwm_file = open(pwm_fname,"w")
        for e in ["A","C","G","T"]:
            pwm_file.write(" ".join([str(int(f)) for f in pwm_dict[e]])+"\n")
        pwm_file.close()

        logo_fname = os.path.join(self.output_loc, "{}.logo.eps".format(self.motif_name))
        pwm = motifs.read(open(pwm_fname), "pfm")
        pwm.weblogo(logo_fname, format="eps", stack_width="large", stacks_per_line="100",
                    color_scheme="color_classic", unit_name="", show_errorbars=False, logo_title="",
                    show_xaxis=False, xaxis_label="", show_yaxis=False, yaxis_label="",
                    show_fineprint=False, show_ends=False)

        # Output the raw, bias corrected signal and protection score
        output_fname = os.path.join(self.output_loc, "{}.txt".format(self.motif_name))
        output_file = open(output_fname, "w")
        if not self.strands_specific:
            output_file.write("raw signal: \n" + np.array_str(mean_raw_signal) + "\n")
            output_file.write("bias corrected signal: \n" + np.array_str(mean_bc_signal) + "\n")
        else:
            output_file.write("raw forward signal: \n" + np.array_str(mean_raw_signal_f) + "\n")
            output_file.write("bias corrected forward signal: \n" + np.array_str(mean_bc_signal_f) + "\n")
            output_file.write("raw reverse signal: \n" + np.array_str(mean_raw_signal_r) + "\n")
            output_file.write("bias reverse corrected signal: \n" + np.array_str(mean_bc_signal_r) + "\n")
        output_file.write("forward bias signal: \n" + np.array_str(mean_bias_signal_f) + "\n")
        output_file.write("reverse bias signal: \n" + np.array_str(mean_bias_signal_r) + "\n")
        if self.protection_score:
            output_file.write("protection score: \n" + str(protection_score) + "\n")
        output_file.close()

        if self.strands_specific:
            fig, (ax1, ax2, ax3) = plt.subplots(3, figsize=(12.0, 10.0))
        else:
            fig, (ax1, ax2) = plt.subplots(2)
        x = np.linspace(-50, 49, num=self.window_size)

        ax1.plot(x, mean_bias_signal_f, color='red', label='Forward')
        ax1.plot(x, mean_bias_signal_r, color='blue', label='Reverse')

        ax1.xaxis.set_ticks_position('bottom')
        ax1.yaxis.set_ticks_position('left')
        ax1.spines['top'].set_visible(False)
        ax1.spines['right'].set_visible(False)
        ax1.spines['left'].set_position(('outward', 15))
        ax1.spines['bottom'].set_position(('outward', 5))
        ax1.tick_params(direction='out')

        ax1.set_xticks([-50, -40, -30, -20, -10, 0, 10, 20, 30, 40, 49])
        ax1.set_xticklabels(['-50', '-40', '-30', '-20', '-10', '0', '10', '20', '30', '40', '49'])
        min_bias_signal = min(min(mean_bias_signal_f), min(mean_bias_signal_r))
        max_bias_signal = max(max(mean_bias_signal_f), max(mean_bias_signal_r))
        ax1.set_yticks([min_bias_signal, max_bias_signal])
        ax1.set_yticklabels([str(round(min_bias_signal,2)), str(round(max_bias_signal,2))], rotation=90)

        ax1.text(-48, max_bias_signal, '# Sites = {}'.format(str(num_sites)), fontweight='bold')
        ax1.set_title(self.motif_name, fontweight='bold')
        ax1.set_xlim(-50, 49)
        ax1.set_ylim([min_bias_signal, max_bias_signal])
        ax1.legend(loc="upper right", frameon=False)
        ax1.set_ylabel("Average Bias \nSignal", rotation=90, fontweight='bold')

        if not self.strands_specific:
            mean_raw_signal = self.standardize(mean_raw_signal)
            mean_bc_signal = self.standardize(mean_bc_signal)
            ax2.plot(x, mean_raw_signal, color='red', label='Uncorrected')
            ax2.plot(x, mean_bc_signal, color='green', label='Corrected')
        else:
            mean_raw_signal_f = self.standardize(mean_raw_signal_f)
            mean_raw_signal_r = self.standardize(mean_raw_signal_r)
            mean_bc_signal_f = self.standardize(mean_bc_signal_f)
            mean_bc_signal_r = self.standardize(mean_bc_signal_r)
            ax2.plot(x, mean_raw_signal_f, color='red', label='Forward')
            ax2.plot(x, mean_raw_signal_r, color='green', label='Reverse')
            ax3.plot(x, mean_bc_signal_f, color='red', label='Forward')
            ax3.plot(x, mean_bc_signal_r, color='green', label='Reverse')

        ax2.xaxis.set_ticks_position('bottom')
        ax2.yaxis.set_ticks_position('left')
        ax2.spines['top'].set_visible(False)
        ax2.spines['right'].set_visible(False)
        ax2.spines['left'].set_position(('outward', 15))
        ax2.tick_params(direction='out')
        ax2.set_xticks([-50, -40, -30, -20, -10, 0, 10, 20, 30, 40, 49])
        ax2.set_xticklabels(['-50', '-40', '-30', '-20', '-10', '0', '10', '20', '30', '40', '49'])
        ax2.set_yticks([0, 1])
        ax2.set_yticklabels([str(0), str(1)], rotation=90)
        ax2.set_xlim(-50, 49)
        ax2.set_ylim([0, 1])

        if not self.strands_specific:
            ax2.spines['bottom'].set_position(('outward', 40))
            ax2.set_xlabel("Coordinates from Motif Center", fontweight='bold')
            ax2.set_ylabel("Average ATAC-seq \nSignal", rotation=90, fontweight='bold')
            ax2.legend(loc="center", frameon=False, bbox_to_anchor=(0.85, 0.06))
        else:
            ax2.spines['bottom'].set_position(('outward', 5))
            ax2.set_ylabel("Average ATAC-seq \n Uncorrected Signal", rotation=90, fontweight='bold')
            ax2.legend(loc="lower right", frameon=False)

            ax3.xaxis.set_ticks_position('bottom')
            ax3.yaxis.set_ticks_position('left')
            ax3.spines['top'].set_visible(False)
            ax3.spines['right'].set_visible(False)
            ax3.spines['left'].set_position(('outward', 15))
            ax3.tick_params(direction='out')
            ax3.set_xticks([-50, -40, -30, -20, -10, 0, 10, 20, 30, 40, 49])
            ax3.set_xticklabels(['-50', '-40', '-30', '-20', '-10', '0', '10', '20', '30', '40', '49'])
            ax3.set_yticks([0, 1])
            ax3.set_yticklabels([str(0), str(1)], rotation=90)
            ax3.set_xlim(-50, 49)
            ax3.set_ylim([0, 1])
            ax3.legend(loc="lower right", frameon=False)
            ax3.spines['bottom'].set_position(('outward', 40))
            ax3.set_xlabel("Coordinates from Motif Center", fontweight='bold')
            ax3.set_ylabel("Average ATAC-seq \n Corrected Signal", rotation=90, fontweight='bold')
            ax3.text(-48, 0.05, '# K-mer = {}\n# Forward Shift = {}'.format(str(self.k_nb), str(self.atac_forward_shift)),
                     fontweight='bold')

        figure_name = os.path.join(self.output_loc, "{}.line.eps".format(self.motif_name))
        fig.subplots_adjust(bottom=.2, hspace=.5)
        fig.tight_layout()
        fig.savefig(figure_name, format="eps", dpi=300)

        # Creating canvas and printing eps / pdf with merged results
        output_fname = os.path.join(self.output_loc, "{}.eps".format(self.motif_name))
        c = pyx.canvas.canvas()
        c.insert(pyx.epsfile.epsfile(0, 0, figure_name, scale=1.0))
        if self.strands_specific:
            c.insert(pyx.epsfile.epsfile(2.76, 1.58, logo_fname, width=27.2, height=2.45))
        else:
            c.insert(pyx.epsfile.epsfile(2.5, 1.54, logo_fname, width=16, height=1.75))
        c.writeEPSfile(output_fname)
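        # The EPS outputs are converted to PDF with the external 'epstopdf' utility,
        # which must be available on the PATH for the calls below to succeed.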
        os.system("epstopdf " + figure_name)
        os.system("epstopdf " + logo_fname)
        os.system("epstopdf " + output_fname)
Example n. 34
0
    test_seq = Seq(str(record.seq), m.alphabet)

    out.write(str(record.id) + "\t" + file1 + "\t" + str(record.seq) + "\n")

    if check:

        #list = open(sys.argv[7]).readlines()
        #print(list)

        for l in list:
            for filename in glob.glob("/home/ngs/vivek/python/scripts/matrices/converted/Homo_sapiens/" + l.rstrip() + "_top10align_pfm.txt"):
                #print(filename)

                fname = os.path.basename(filename)
                fname = re.sub(r"_top10align_pfm.txt", "", str(fname))
                test_pwm = motifs.read(open(filename), "pfm")
                pwm = test_pwm.counts.normalize(pseudocounts=0.5)
                pssm = pwm.log_odds(background)
                IUPAC = test_pwm.counts.degenerate_consensus

                for position, score in pssm.search(test_seq, threshold=float(sys.argv[2])):
                    print(str(record.id) + "\t" + str(abs(position)) + "\t" + str(abs(position) + len(IUPAC)) + "\t" + ("+" if position > 0 else "-") + "\t" + str(fname) + "\t" + str(data[fname]) + "\t" + str(IUPAC) + "\t" + str(score) + "\t" + str(file1) + "\t" + str(now) + "\t" + str(file1))

    else:
        test_pwm = motifs.read(open(sys.argv[7]), "pfm")
        #print(sys.argv[7])
        pwm = test_pwm.counts.normalize(pseudocounts=0.5)
        pssm = pwm.log_odds(background)
        IUPAC = test_pwm.counts.degenerate_consensus
		
Example n. 35
0
# Import
import sys
from Bio import motifs
from glob import glob
from copy import deepcopy

# Reading input
inList = ["./ZBT7B_M00405.pfm", "./MA0138.2.REST.pfm", "./MA0527.1.ZBTB33.pfm"]

# Execution
for inFileName in inList:
    inFile = open(inFileName, "r")
    outFileName = inFileName[:-3] + "pdf"
    pwm = motifs.read(inFile, "pfm")

    # Reverse complement
    tempA = deepcopy(pwm.counts["A"][::-1])
    pwm.counts["A"] = pwm.counts["T"][::-1]
    pwm.counts["T"] = tempA
    tempC = deepcopy(pwm.counts["C"][::-1])
    pwm.counts["C"] = pwm.counts["G"][::-1]
    pwm.counts["G"] = tempC

    # Complement
    #tempA = deepcopy(pwm.counts["A"])
    #pwm.counts["A"] = pwm.counts["T"]
    #pwm.counts["T"] = tempA
    #tempC = deepcopy(pwm.counts["C"])
    #pwm.counts["C"] = pwm.counts["G"]
    #pwm.counts["G"] = tempC
Example n. 36
0
from Bio import SeqIO
from Bio.Alphabet import IUPAC
chp_list = list(SeqIO.parse("srf_chip.fasta", "fasta", IUPAC.unambiguous_dna))

count = 0
for dna in chp_list:
	match = dna.seq.upper().count('GCCCATATATGG')  # .upper() must be called as a method, not just referenced as .upper
	count = count + match

import re

chp_match = 0
for dna in chp_list:
	if re.search(r'[GT][CA]CC[AT]TATA[AT]GG', str(dna.seq)):  # re needs a plain string, so use str(dna.seq) rather than the record itself
		chp_match = chp_match+1

from Bio import motifs

srf_m = motifs.read(open("MA0083.1.sites"), "sites")
srf_m.pseudocounts = 1
srf_m.background = 0.4

for dna in chp_list:  # loop over all ChIP sequences
	for pos, score in srf_m.pssm.search(dna.seq, threshold=7.0):  # search the .seq attribute
		print("Position %d: score = %5.2f" % (pos, score))
Example n. 37
0
#! /usr/bin/env python3

from Bio import SeqIO	
from Bio.Alphabet import IUPAC		
from Bio.Seq import Seq
from Bio import motifs			
from Bio import SeqUtils


with open("sites/MA0106.1.sites") as handle:
     p53 = motifs.read(handle, "sites")

motif = p53.degenerate_consensus

with open("output/motif_result_p53.txt","w") as f:
	for seq_record in SeqIO.parse('input/gencode.v26.lncRNA_transcripts.fa','fasta'):
		f.write(">" + str(seq_record.id) + "\n")
		result=SeqUtils.nt_search(str(seq_record), m)
		f.write(str(result) + "\n")

##

with open("sites/MA0001.1.sites") as handle:
     AGL3 = motifs.read(handle, "sites")

motif = AGL3.degenerate_consensus

with open("output/motif_result_AGL3.txt","w") as f:
	for seq_record in SeqIO.parse('input/gencode.v26.lncRNA_transcripts.fa','fasta'):
		f.write(">" + str(seq_record.id) + "\n")
		result=SeqUtils.nt_search(str(seq_record), motif)
Example n. 38
0
import sys
import os
from Bio import SeqIO, motifs
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import IUPAC, generic_dna, generic_protein
from collections import defaultdict

#####################
## sys Inputs - to do
#####################

## read in alignment and motif 
try:
    alignment = list(SeqIO.parse(sys.argv[1], "fasta"))
except Exception:
    print("ERROR: this is not a FASTA alignment file")
    sys.exit()
try:
    motif = motifs.read(open(sys.argv[2]), "pfm")
except Exception:
    print("ERROR: this is not a pfm file")
    sys.exit()
try:
    threshold = float(sys.argv[3])
except IndexError:
    threshold = -10000

# Used later when marking output file
alignment_file_name = os.path.basename(sys.argv[1])
motif_file_name = os.path.basename(sys.argv[2])

print("alignment file: " + alignment_file_name)
print("motif file: " + motif_file_name)
Example n. 39
0
    #threshold = math.pow(10, -float(sys.argv[2]))  # P-value cutoff; e.g. an argument of 5 means 10**-5
    out = open(out_file, 'w')
    out.write(
        "Sequence\tMotif\tMotif_consensus\tHit_Seq\tHit_position\tHit_score\tThreshold\n"
    )

    # Specify if you only want to include certain PWMs from the directory
    if include == "all":
        include_if = os.listdir(f)
    else:
        include_if = open(include, 'r').read().splitlines()

    if f[-4:] == ".pwm":
        #Make PSSM for mapping
        motif_name = f.strip().split("/")[-1]
        m = motifs.read(open(f), "pfm")
        motif_len = len(m)
        pssm, consensus, threshold = make_pssm(m, ATbias, GCbias, p)
        hits = mapping(seq_file, pssm, threshold, consensus, motif_name,
                       motif_len)

    else:
        for motif_file in os.listdir(f):
            if motif_file in include_if:
                motif_name = motif_file.strip().split("/")[-1]
                print(motif_name)
                path = f + motif_file
                m = motifs.read(open(path), "pfm")
                motif_len = len(m)
                pssm, consensus, threshold = make_pssm(m, ATbias, GCbias, p)
                hits = mapping(seq_file, pssm, threshold, consensus,
Example n. 40
0
    repositories = set(repositories)
    query = set(args.folders)

    if not repositories.issuperset(query):
        print("ERROR: query repositories %s do not exist" % str(list(query.difference(repositories))))
        exit(1)

    repositories = args.folders

print(">>> CREATING logos for", repositories)

for repo in repositories:
    dir_name = path.join(curr_dir, "motifs", repo)
    for _, _, file_list in walk(dir_name):
        output_dir = path.join(curr_dir, "logos", repo)

        if not path.exists(output_dir):
            mkdir(output_dir)

        print(">>", repo)

        for pwm_file_name in file_list:
            pwm_full_file_name = path.join(dir_name, pwm_file_name)
            if pwm_file_name.split(".")[-1] != "pwm":
                continue
            pwm_file = open(pwm_full_file_name, "r")
            logo_file_name = path.join(output_dir, ".".join(pwm_file_name.split(".")[:-1]) + ".png")
            pwm = motifs.read(pwm_file, "pfm")
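            # weblogo() submits the matrix to the WebLogo web service, so network
            # access is required when generating the PNG below.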
            pwm.weblogo(logo_file_name, format="png_print", stack_width="medium", color_scheme="color_classic")
            pwm_file.close()
Example n. 41
0
fprList = [0.005, 0.001, 0.0005, 0.0001, 0.00005, 0.00001]
pseudocounts = 0.1
background = {'A':0.25,'C':0.25,'G':0.25,'T':0.25}
precision = 10000
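# precision controls how finely pssm.distribution() below approximates the score
# distribution when estimating the FPR thresholds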

# Creating output file
outFile = open(outFileName,"w")
outFile.write("\t".join(["MOTIF"]+[str(e) for e in fprList])+"\n")

# Iterating on all PWMs
for pwmFileName in glob(inFolder+"*.pwm"):

  # Creating PSSM
  name = ".".join(basename(pwmFileName).split(".")[:-1])
  input_file = open(pwmFileName,"r")
  pfm = motifs.read(input_file, "pfm")
  pwm = pfm.counts.normalize(pseudocounts)
  input_file.close()
  pssm = pwm.log_odds(background)
  pssm_list = [pssm[e] for e in ["A","C","G","T"]]
  distribution = pssm.distribution(background=background, precision=precision)

  # Evaluating thresholds
  resVec = [name]
  for fpr in fprList:
    resVec.append(str(distribution.threshold_fpr(fpr)))
    
  # Writing results
  outFile.write("\t".join(resVec)+"\n")
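
  # Sketch (not part of the original script): thresholds computed this way are
  # typically passed straight to pssm.search() when scanning sequences, e.g.
  # ('some_seq' is a placeholder):
  #   for position, score in pssm.search(some_seq, threshold=distribution.threshold_fpr(0.0001)):
  #       print(position, score)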

Example n. 42
0
#    tf_class, tf_family: structural class & family of motif
#    species: as taxonomy IDs
#    tax_group: taxonomic supergroup of motif
#    acc: accession number of transcription factor protein
#    data_type: type of data used to construct the motif
#    medline: PubMed ID of the literature supporting the motif
#    pazar_id: reference ID to PAZAR DB
#    comment: text, notes about motif
# JASPAR stores motifs in 3 main formats: 2 flat files & an SQL DB
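
# Sketch (not in the original notes): reading a flat file in 'jaspar' format fills
# matrix_id and name on each Motif; the richer metadata listed above is retrieved
# from the SQL database via Bio.motifs.jaspar.db.JASPAR5.
# 'pfm_all.jaspar' below is a placeholder file name.
with open('pfm_all.jaspar') as handle:
    for m in motifs.parse(handle, 'jaspar'):
        print(m.matrix_id, m.name)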

# JASPAR sites format
#>ID name count
#seqseqMOTIFMOTIFseqseq
# no added meta info
with open('Arnt.sites') as handle:
    arnt = motifs.read(handle, 'sites')    #motif format 'sites'
print(len(arnt.instances), arnt.instances)
print(arnt.counts)

# JASPAR pfm format
#2 9 0 1 32 4
#1 33 4 51 1 0
#9 3 10 0 0 0
#20 0 31 0 0 50
# only count profile matrix
with open('SRF.pfm') as handle:
    srf = motifs.read(handle, 'pfm')    # motif format 'pfm'
print(srf.counts)
print(srf.instances)    # the pfm format stores only the count matrix, so no instances were saved
print(arnt.counts.consensus, srf.counts.consensus)
from sys import argv
import numpy as np
from Bio import motifs

name = str(argv[1]).split('.')[0]
name_pfm = name + '.pfm'
name_transfac = name + '.transfac'
mat = np.genfromtxt(argv[1])
mat_trans = mat.transpose()

t = ''
for i in range(len(mat_trans)):
	for j in mat_trans[i]:
		t += str(int(j))
		t += ' '
	t += '\n'

with open(name_pfm, 'w') as pfm_out:
	pfm_out.write(t)

mot = motifs.read(open(name_pfm), 'pfm')
print(mot.format("transfac"))

# with open(name_transfac, 'w') as transfac_out:
# 	transfac_out.write(mot.format("transfac"))