Example #1
0
def readFastaSequences(fileName, out_type='DataSet'):
    """
    Reads a file in fasta format and returns the sequences in a data set object.

    Lines matching '>' at the start are treated as record headers; the text
    after the '>' becomes the sequence ID. All other lines are concatenated
    into the sequence of the current record. A trailing '//' terminator is
    removed and sequences are converted to upper case. Text that precedes the
    first header is discarded when a header follows it.

    @param fileName: Name of the input file
    @param out_type: type of output object: 'DataSet' or 'ConstrainedDataSet'
    @return: mixture.DataSet or mixture.ConstrainedDataSet filled via fromList()
    @raise TypeError: if out_type is not one of the two supported names
    """
    nameReg = re.compile(r"^>(.*)")

    seqM = []
    nameList = []
    chunks = []  # pieces of the record currently being read
    seenHeader = False

    def storeRecord():
        # Assemble the collected pieces once (avoids repeated string copies).
        partSeq = "".join(chunks)
        if partSeq.endswith("//"):
            partSeq = partSeq[:-2]
        # upper case letters by convention
        seqM.append(list(partSeq.upper()))

    # The context manager guarantees the file is closed, also on errors.
    with open(fileName, "r") as f:
        for line in f:
            s = nameReg.search(line)
            if s:
                # A new header closes the previous record, if there was one.
                if seenHeader:
                    storeRecord()
                del chunks[:]
                seenHeader = True
                nameList.append(mixture.chomp(s.group(1)))
            else:
                chunks.append(mixture.chomp(line))

    # The last record is only terminated by the end of the file. As before,
    # this also stores whatever was read when the file had no header at all.
    storeRecord()

    if out_type == 'DataSet':
        data = mixture.DataSet()
    elif out_type == 'ConstrainedDataSet':
        data = mixture.ConstrainedDataSet()
    else:
        raise TypeError('Invalid output type ' + str(out_type))

    data.fromList(seqM, IDs=nameList)

    return data
Example #2
0
def readFastaSequences(fileName, out_type='DataSet'):
    """
    Reads a file in fasta format and returns the sequences in a data set object.

    Lines matching '>' at the start are treated as record headers; the text
    after the '>' becomes the sequence ID. All other lines are concatenated
    into the sequence of the current record. A trailing '//' terminator is
    removed and sequences are converted to upper case. Text that precedes the
    first header is discarded when a header follows it.

    @param fileName: Name of the input file
    @param out_type: type of output object: 'DataSet' or 'ConstrainedDataSet'
    @return: mixture.DataSet or mixture.ConstrainedDataSet filled via fromList()
    @raise TypeError: if out_type is not one of the two supported names
    """
    nameReg = re.compile(r"^>(.*)")

    seqM = []
    nameList = []
    chunks = []  # pieces of the record currently being read
    seenHeader = False

    def storeRecord():
        # Assemble the collected pieces once (avoids repeated string copies).
        partSeq = "".join(chunks)
        if partSeq.endswith("//"):
            partSeq = partSeq[:-2]
        # upper case letters by convention
        seqM.append(list(partSeq.upper()))

    # The context manager guarantees the file is closed, also on errors.
    with open(fileName, "r") as f:
        for line in f:
            s = nameReg.search(line)
            if s:
                # A new header closes the previous record, if there was one.
                if seenHeader:
                    storeRecord()
                del chunks[:]
                seenHeader = True
                nameList.append(mixture.chomp(s.group(1)))
            else:
                chunks.append(mixture.chomp(line))

    # The last record is only terminated by the end of the file. As before,
    # this also stores whatever was read when the file had no header at all.
    storeRecord()

    if out_type == 'DataSet':
        data = mixture.DataSet()
    elif out_type == 'ConstrainedDataSet':
        data = mixture.ConstrainedDataSet()
    else:
        raise TypeError('Invalid output type ' + str(out_type))

    data.fromList(seqM, IDs=nameList)

    return data
Example #3
0
def readUCSCPrior(filename):
    """
    Reads files in the UCSC Dirichlet Mixture prior (DMP) format (http://www.soe.ucsc.edu/compbio/dirichlets/)
    and converts them into PyMix DirichletMixturePrior objects.

    Note that the alphabet in the UCSC priors does not contain the gap symbol. For the output DirichletMixturePrior
    the gap symbol is introduced with a parameter value of 0.01 in all components.

    @param filename: file in UCSC DMP format

    @return: tuple (alphabet, prior) of a mixture.Alphabet including the gap
        symbol and the corresponding mixture.DirichletMixturePrior
    @raise ValueError: if the file contains no 'Order' line defining the alphabet
    """
    ex1 = re.compile(r'Mixture=\s(\d+.\d+)')
    ex2 = re.compile(r'Order\s*=\s+([A-Z\s]+)')
    ex3 = re.compile(r'Alpha=\s+([\d+.\d+\s+,\d+e-]+)')

    # Dimension handed to the prior constructors: the symbols of the UCSC
    # alphabet plus the added gap (presumably 20 amino acids + '-').
    dim = 21

    pi = []
    sigma = None
    alpha_mat = []

    # The context manager guarantees the file is closed, also on errors.
    with open(filename, 'r') as f:
        for l in f:
            l = mixture.chomp(l)

            m1 = ex1.match(l)
            if m1:
                pi.append(float(m1.group(1)))

            m2 = ex2.match(l)
            if m2:
                # split() without argument ignores repeated/trailing blanks,
                # which would otherwise end up as empty alphabet symbols
                sigma = m2.group(1).split()

            m3 = ex3.match(l)
            if m3:
                alpha = [float(x) for x in m3.group(1).split()]
                # first entry is the sum of the others -> remove
                alpha_mat.append(alpha[1:])

    if sigma is None:
        raise ValueError("No 'Order' line found in " + str(filename))

    # integrate gap character '-' into the alphabet
    sigma.append('-')
    alphabet = mixture.Alphabet(sigma)

    dComp = []
    for alpha in alpha_mat:
        alpha.append(0.01)  # add hyper parameter for '-'
        dComp.append(mixture.DirichletPrior(dim, alpha))

    prior = mixture.DirichletMixturePrior(len(dComp), dim, pi, dComp)
    return alphabet, prior
def readUCSCPrior(filename):
    """
    Reads files in the UCSC Dirichlet Mixture prior (DMP) format (http://www.soe.ucsc.edu/compbio/dirichlets/)
    and converts them into PyMix DirichletMixturePrior objects.

    Note that the alphabet in the UCSC priors does not contain the gap symbol. For the output DirichletMixturePrior
    the gap symbol is introduced with a parameter value of 0.01 in all components.

    @param filename: file in UCSC DMP format

    @return: tuple (alphabet, prior) of a mixture.Alphabet including the gap
        symbol and the corresponding mixture.DirichletMixturePrior
    @raise ValueError: if the file contains no 'Order' line defining the alphabet
    """
    ex1 = re.compile(r'Mixture=\s(\d+.\d+)')
    ex2 = re.compile(r'Order\s*=\s+([A-Z\s]+)')
    ex3 = re.compile(r'Alpha=\s+([\d+.\d+\s+,\d+e-]+)')

    # Dimension handed to the prior constructors: the symbols of the UCSC
    # alphabet plus the added gap (presumably 20 amino acids + '-').
    dim = 21

    pi = []
    sigma = None
    alpha_mat = []

    # The context manager guarantees the file is closed, also on errors.
    with open(filename, 'r') as f:
        for l in f:
            l = mixture.chomp(l)

            m1 = ex1.match(l)
            if m1:
                pi.append(float(m1.group(1)))

            m2 = ex2.match(l)
            if m2:
                # split() without argument ignores repeated/trailing blanks,
                # which would otherwise end up as empty alphabet symbols
                sigma = m2.group(1).split()

            m3 = ex3.match(l)
            if m3:
                alpha = [float(x) for x in m3.group(1).split()]
                # first entry is the sum of the others -> remove
                alpha_mat.append(alpha[1:])

    if sigma is None:
        raise ValueError("No 'Order' line found in " + str(filename))

    # integrate gap character '-' into the alphabet
    sigma.append('-')
    alphabet = mixture.Alphabet(sigma)

    dComp = []
    for alpha in alpha_mat:
        alpha.append(0.01)  # add hyper parameter for '-'
        dComp.append(mixture.DirichletPrior(dim, alpha))

    prior = mixture.DirichletMixturePrior(len(dComp), dim, pi, dComp)
    return alphabet, prior
Example #5
0
def readSites(fileName):
    """
    Flat file parser for the JASPAR .sites  format. The files are essentially fasta but
    there is a count matrix at the end of the file.

    Each header line ('>' at the start) is split at tabs and the second and
    third field are joined with '_' to form the sequence ID. Parsing stops at
    the first line that looks like the start of the count matrix.

    @param fileName: File name of .sites file
    @return: DataSet object
    @raise IndexError: if a header line has fewer than three tab separated fields
    """
    seq_head = re.compile(r"^>(.*)")
    end = re.compile(r"^[A,C,G,T,a,c,g,t]\s*\[")

    seq = []
    ids = []

    # The context manager guarantees the file is closed, also on errors.
    with open(fileName, "r") as f:
        for line in f:
            line = mixture.chomp(line)
            header = seq_head.search(line)
            if header:
                tl = header.group(1).split('\t')
                ids.append(str(tl[1]) + '_' + str(tl[2]))
            elif end.search(line):
                # start of the count matrix, no more sequences follow
                break
            elif len(line) > 0:
                # remove lower case letters, only upper case letters are part of the
                # binding site
                seq.append([c for c in line if c.isupper()])

    data = mixture.DataSet()
    data.fromList(seq, IDs=ids)

    return data
Example #6
0
def readSites(fileName):
    """
    Flat file parser for the JASPAR .sites  format. The files are essentially fasta but
    there is a count matrix at the end of the file.

    Each header line ('>' at the start) is split at tabs and the second and
    third field are joined with '_' to form the sequence ID. Parsing stops at
    the first line that looks like the start of the count matrix.

    @param fileName: File name of .sites file
    @return: DataSet object
    @raise IndexError: if a header line has fewer than three tab separated fields
    """
    seq_head = re.compile(r"^>(.*)")
    end = re.compile(r"^[A,C,G,T,a,c,g,t]\s*\[")

    seq = []
    ids = []

    # The context manager guarantees the file is closed, also on errors.
    with open(fileName, "r") as f:
        for line in f:
            line = mixture.chomp(line)
            header = seq_head.search(line)
            if header:
                tl = header.group(1).split('\t')
                ids.append(str(tl[1]) + '_' + str(tl[2]))
            elif end.search(line):
                # start of the count matrix, no more sequences follow
                break
            elif len(line) > 0:
                # remove lower case letters, only upper case letters are part of the
                # binding site
                seq.append([c for c in line if c.isupper()])

    data = mixture.DataSet()
    data.fromList(seq, IDs=ids)

    return data
Example #7
0
def readAlnData(fn, reg_str=None, out_type='DataSet'):
    """
    Parses a CLUSTALW format .aln multiple alignment file and returns a mixture.DataSet object.

    The first line of the file is skipped. Every following line that matches
    the parsing expression contributes group 1 as sequence ID and group 2 as
    a sequence fragment; fragments with the same ID are concatenated.
    Sequences are returned in the order in which their IDs first appear.

    @param fn: name of the .aln input file
    @param reg_str: regular expression for sequence parsing, must define two
        groups (ID, sequence fragment). If not given a default pattern is used.
    @param out_type: type of output object: 'DataSet' or 'ConstrainedDataSet'
    @return: DataSet object (or ConstrainedDataSet, depending on out_type)
    @raise TypeError: if out_type is not one of the two supported names
    """
    if reg_str:
        parse = re.compile(reg_str)
    else:
        parse = re.compile(r"(\w+\|\w+)\s+([\w,\-,.]+)")

    d = {}     # sequence ID -> list of sequence fragments
    sIDs = []  # sequence IDs in order of first appearance

    # The context manager guarantees the file is closed, also on errors.
    with open(fn, 'r') as f:
        f.readline()  # remove first line

        for l in f:
            pat = parse.search(mixture.chomp(l))
            if not pat:
                continue
            k = pat.group(1)
            if k in d:
                d[k].append(pat.group(2))
            else:
                d[k] = [pat.group(2)]
                sIDs.append(k)

    if out_type == 'DataSet':
        data = mixture.DataSet()
    elif out_type == 'ConstrainedDataSet':
        data = mixture.ConstrainedDataSet()
    else:
        raise TypeError('Invalid output type ' + str(out_type))

    # One row per sequence, rows and IDs share the same explicit order.
    dMatrix = [list("".join(d[k])) for k in sIDs]

    data.fromList(dMatrix, IDs=sIDs)

    return data
Example #8
0
def readAlnData(fn, reg_str=None, out_type='DataSet'):
    """
    Parses a CLUSTALW format .aln multiple alignment file and returns a mixture.DataSet object.

    The first line of the file is skipped. Every following line that matches
    the parsing expression contributes group 1 as sequence ID and group 2 as
    a sequence fragment; fragments with the same ID are concatenated.
    Sequences are returned in the order in which their IDs first appear.

    @param fn: name of the .aln input file
    @param reg_str: regular expression for sequence parsing, must define two
        groups (ID, sequence fragment). If not given a default pattern is used.
    @param out_type: type of output object: 'DataSet' or 'ConstrainedDataSet'
    @return: DataSet object (or ConstrainedDataSet, depending on out_type)
    @raise TypeError: if out_type is not one of the two supported names
    """
    if reg_str:
        parse = re.compile(reg_str)
    else:
        parse = re.compile(r"(\w+\|\w+)\s+([\w,\-,.]+)")

    d = {}     # sequence ID -> list of sequence fragments
    sIDs = []  # sequence IDs in order of first appearance

    # The context manager guarantees the file is closed, also on errors.
    with open(fn, 'r') as f:
        f.readline()  # remove first line

        for l in f:
            pat = parse.search(mixture.chomp(l))
            if not pat:
                continue
            k = pat.group(1)
            if k in d:
                d[k].append(pat.group(2))
            else:
                d[k] = [pat.group(2)]
                sIDs.append(k)

    if out_type == 'DataSet':
        data = mixture.DataSet()
    elif out_type == 'ConstrainedDataSet':
        data = mixture.ConstrainedDataSet()
    else:
        raise TypeError('Invalid output type ' + str(out_type))

    # One row per sequence, rows and IDs share the same explicit order.
    dMatrix = [list("".join(d[k])) for k in sIDs]

    data.fromList(dMatrix, IDs=sIDs)

    return data
Example #9
0
def readJASPAR(fileName):
    """
    Reads a flat file of JASPAR binding sites matrices. JASPAR files are
    essentially fasta, but only upper case letters are part of the binding site proper.
    Lower case letters are discarded.

    Header lines ('>' at the start) are not parsed; the sequences get the
    generated IDs 'seq1', 'seq2', ... in order of appearance. Parsing stops
    at the first line that looks like the start of a count matrix.

    @param fileName: File name of the JASPAR flat file
    @return: DataSet object
    """
    seq_head = re.compile(r"^>(.*)")
    end = re.compile(r"^[A,C,G,T,a,c,g,t]\s*\[")

    seq = []
    ids = []

    # The context manager guarantees the file is closed, also on errors.
    with open(fileName, "r") as f:
        for line in f:
            line = mixture.chomp(line)
            if seq_head.search(line):
                # running number of the header, starting at 1
                ids.append('seq' + str(len(ids) + 1))
            elif end.search(line):
                # start of the count matrix, no more sequences follow
                break
            elif len(line) > 0:
                # remove lower case letters, only upper case letters are part of the
                # binding site
                seq.append([c for c in line if c.isupper()])

    data = mixture.DataSet()
    data.fromList(seq, IDs=ids)

    return data
Example #10
0
def readJASPAR(fileName):
    """
    Reads a flat file of JASPAR binding sites matrices. JASPAR files are
    essentially fasta, but only upper case letters are part of the binding site proper.
    Lower case letters are discarded.

    Header lines ('>' at the start) are not parsed; the sequences get the
    generated IDs 'seq1', 'seq2', ... in order of appearance. Parsing stops
    at the first line that looks like the start of a count matrix.

    @param fileName: File name of the JASPAR flat file
    @return: DataSet object
    """
    seq_head = re.compile(r"^>(.*)")
    end = re.compile(r"^[A,C,G,T,a,c,g,t]\s*\[")

    seq = []
    ids = []

    # The context manager guarantees the file is closed, also on errors.
    with open(fileName, "r") as f:
        for line in f:
            line = mixture.chomp(line)
            if seq_head.search(line):
                # running number of the header, starting at 1
                ids.append('seq' + str(len(ids) + 1))
            elif end.search(line):
                # start of the count matrix, no more sequences follow
                break
            elif len(line) > 0:
                # remove lower case letters, only upper case letters are part of the
                # binding site
                seq.append([c for c in line if c.isupper()])

    data = mixture.DataSet()
    data.fromList(seq, IDs=ids)

    return data