def paragraph(self, **kwds):
        """Create a Paragraph, store it in ``self.contents`` and return it.

        All keyword arguments are forwarded unchanged to the ``Paragraph``
        constructor.
        """
        # Imported at call time, exactly as the original did.
        from Paragraph import Paragraph

        new_item = Paragraph(**kwds)
        self.contents.append(new_item)
        return new_item
Example #2
0
    def paragraph(self, paragraph=None):
        """Register an existing paragraph, or hand out a fresh one.

        With an argument, the paragraph is appended to ``self._entries`` and
        ``None`` is returned.  Without one, a new ``Paragraph`` is created and
        returned; it is NOT appended to ``self._entries``.
        """
        if paragraph is not None:
            self._entries.append(paragraph)
            return

        # Imported lazily, only on the factory path.
        from Paragraph import Paragraph
        return Paragraph()
Example #3
0
 def __init__(self,paragraph_text,background_string,overshooting=True,
              background_scaling_factor=100,
              ncr_size=32):
     """Initialize an organism from a paragraph and a background.

     paragraph_text            -- text split on newlines and handed to Paragraph
     background_string         -- stored as ``self.background``
     overshooting              -- stored as ``self.overshooting``
     background_scaling_factor -- stored as ``self.background_scaling_factor``
     ncr_size                  -- stored as ``self.ncr_size`` (default 32)
     """
     paragraph_lines = paragraph_text.split("\n")
     self.paragraph = Paragraph(paragraph_lines)
     self.targets = [seq.tProb for seq in self.paragraph.seq_data]
     self.motif = self.paragraph.motif()
     self.original_motif = self.motif[:] #copy for backup against mutations
     self.recognizer = self.paragraph.recognizer
     self.background = background_string
     self.memo_background_Z = None
     self.overshooting = overshooting
     self.background_scaling_factor = background_scaling_factor
     # Honour the caller's value; this used to be hard-coded to 32, which
     # silently discarded the ncr_size argument.
     self.ncr_size = ncr_size
Example #4
0
def _strip_title_lines(txt):
    """Remove every span running from a '##' marker to the end of its line.

    The newline itself is kept.  A title on the last line (no trailing
    newline) is removed up to the end of the text.
    """
    start = txt.find('##')
    while start != -1:
        end = txt.find('\n', start)
        if end == -1:
            # find() returns -1 at end of text; slicing with -1 would keep
            # the last character and drop nothing else.
            end = len(txt)
        txt = txt[:start] + txt[end:]
        # Nothing before `start` can contain '##', so resume from there.
        start = txt.find('##', start)
    return txt


def _expand_links(txt):
    """Replace each '$<link> <int>' span with scrap.get_from_url(link, int).

    The span runs from '$' to the end of its line; the text after the first
    space is converted with int().  Scanning resumes after the inserted text,
    so a '$' inside scraped content is not treated as another link.

    Raises ValueError when the text after the first space is not an integer
    (including when the line contains no space at all).
    """
    start = txt.find('$')
    while start != -1:
        line_end = txt.find('\n', start)
        if line_end == -1:
            line_end = len(txt)  # link on the last line, no trailing newline
        link, _, arg = txt[start + 1:line_end].partition(' ')
        website = scrap.get_from_url(link, int(arg))
        txt = txt[:start] + website + txt[line_end:]
        start = txt.find('$', start + len(website))
    return txt


def compile(input_names, output_name, folder='Biology'):
    """Merge several note files into one grouped, de-duplicated document.

    For each file in ``input_names``: title spans ('##' to end of line) are
    removed, '$link n' lines are replaced by scraped text, and the result is
    split on blank lines into Paragraph objects.  Paragraphs are then grouped
    with ``Paragraph.eq``, near-duplicates inside a group (similarity >= 0.75
    to an earlier member) are dropped, and the groups are written to
    ``output_name`` under a '# <folder>' heading.

    input_names -- iterable of input file paths
    output_name -- path of the file to write (overwritten)
    folder      -- text used for the top-level heading
    """
    print(input_names)
    para = []
    for file_name in input_names:
        # `with` guarantees the handle is closed even if scraping fails.
        with open(file_name, 'r') as source:
            txt = source.read()

        txt = _strip_title_lines(txt)
        txt = _expand_links(txt)

        # extract paragraphs
        for i, p in enumerate(txt.split('\n\n')):
            p = p.replace('\n', '').replace('\t', '')
            para.append(Paragraph(file_name, i, keywords(p), p))

    # Group paragraphs: each round takes the first remaining paragraph and
    # collects copies of every later one that it reports as eq().
    newPara = []
    while para:
        head = para[0]
        sameKeyWrdPara = [
            Paragraph(head.file, head.nb, head.keywords, head.text)
        ]
        remaining = []
        for p in para[1:]:
            if head.eq(p):
                sameKeyWrdPara.append(
                    Paragraph(p.file, p.nb, p.keywords, p.text))
            else:
                remaining.append(p)
        # Partition by position instead of list.remove(), which depends on
        # == and could delete a different element than the one examined.
        para = remaining
        newPara.append(sameKeyWrdPara)

    # Inside each group, drop paragraphs too similar to an earlier survivor.
    for paras in newPara:
        j = 0
        while j < len(paras):
            kept = paras[:j + 1]
            for candidate in paras[j + 1:]:
                if similarity(paras[j].text, candidate.text) >= 0.75:
                    continue
                kept.append(candidate)
            paras[:] = kept
            j += 1

    with open(output_name, 'w') as output:
        output.write('# ' + folder + '\n')
        for paras in newPara:
            if len(paras) > 1:
                output.write(
                    '\n## ' + " ".join(paras[0].common(paras[1])) + '\n')
            elif len(paras[0].keywords) > 0:
                output.write('\n## ' + paras[0].keywords[0] + '\n')
            for p in paras:
                # Very short fragments (5 characters or fewer) are skipped.
                if len(p.text) > 5:
                    output.write("\t" + p.text + "\n")
Example #5
0
class Organism(object):
    """Emulate an ESTReMo-style organism.

    An organism bundles a motif (list of site strings), one target value per
    site, a recognizer exposing ``binding_energy(site)``, and a background
    string.  All of these except the background come from a ``Paragraph``.
    """

    def __init__(self,paragraph_text,background_string,overshooting=True,
                 background_scaling_factor=100,
                 ncr_size=32):
        """Initialize an organism from a paragraph and a background.

        paragraph_text            -- text split on newlines and handed to Paragraph
        background_string         -- sequence used by background_Z()
        overshooting              -- when False, site_fitness() caps fitness at
                                     its optimum once occupancy exceeds it
        background_scaling_factor -- background length is divided by this to
                                     get the number of sampled windows
        ncr_size                  -- length each site is padded to in serialize()
        """
        paragraph_lines = paragraph_text.split("\n")
        self.paragraph = Paragraph(paragraph_lines)
        self.targets = [seq.tProb for seq in self.paragraph.seq_data]
        self.motif = self.paragraph.motif()
        self.original_motif = self.motif[:] #copy for backup against mutations
        self.recognizer = self.paragraph.recognizer
        self.background = background_string
        self.memo_background_Z = None
        self.overshooting = overshooting
        # samples * scaling factor = effective background
        self.background_scaling_factor = background_scaling_factor
        # Previously hard-coded to 32, which ignored the argument.
        self.ncr_size = ncr_size

    def fitness(self):
        """Return the summed site_fitness over all motif sites.

        Each site's occupancy is exp(-beta * energy) divided by the sum of
        the same weights over the sites plus background_Z().
        """
        binding_energy = self.recognizer.binding_energy
        site_energies = [binding_energy(site) for site in self.motif]
        # The per-window background energies that used to be computed here
        # were never read; background_Z() supplies the background term.
        foreground_Z = sum([exp(-beta * energy) for energy in site_energies])
        Z = foreground_Z + self.background_Z()
        site_occupancies = [exp(-beta*energy)/Z for energy in site_energies]
        print("site occupancies: %s" % (site_occupancies,))
        return sum(self.site_fitness(occ,target)
                   for (occ,target) in zip(site_occupancies,self.targets))

    def background_Z(self):
        """Return the memoized background term, computing it on first use.

        The estimate samples len(background) // background_scaling_factor
        random windows of motif width and scales their summed weights back
        up by the scaling factor.
        """
        if self.memo_background_Z is not None:
            return self.memo_background_Z
        w = len(self.motif[0])
        n = len(self.background)
        # Explicit floor division: same result as Python 2 `/` on ints.
        samples = n // self.background_scaling_factor
        print("samples: %s" % (samples,))
        be = self.recognizer.binding_energy
        bkgd_energies = [be(random_substring(self.background,w))
                         for _ in range(samples)]
        self.memo_background_Z = sum([exp(-beta * energy)
                                      for energy in bkgd_energies]) * self.background_scaling_factor
        return self.memo_background_Z

    def site_fitness(self,occupancy,target):
        """Return the fitness contribution of one site.

        occupancy -- occupancy of the site (see fitness())
        target    -- target value for the site

        When ``self.overshooting`` is False and the expression level exceeds
        the optimum, the fitness at the optimum is returned instead.
        """
        delta = 0.17
        eta = 0.02
        M = 1.8
        Ky = 0.4  #Params come from config file
        Z = occupancy * M
        L = Ky / ((delta / eta) * (1 - target)**2 - 1)
        g = delta * ((Z * L) / (Ky + L)) - eta * (Z / (1 - (Z/M)))
        if not self.overshooting: #i.e. if overshooting is not penalized...
            # compute optimum expression level
            Zopt = M * (1 - sqrt((eta / delta) * ((L + Ky) / L)))
            # and fitness corresponding to optimum expression
            gopt = delta * ((Zopt * L) / (Ky + L)) - eta * (Zopt / (1 - (Zopt/M)))
            if Z > Zopt:
                return gopt
        return g

    def mutate_site(self,site_number,position,base):
        """Replace one base of one motif site in place."""
        site = self.motif[site_number]
        self.motif[site_number] = string_replace(site,position,base)

    def reset_motif(self):
        """Restore the motif from a copy of ``self.original_motif``."""
        self.motif = self.original_motif[:]

    def explore_site_mutations(self):
        """Evaluate every single-base substitution of the current motif.

        Returns a list of (site_number, position, base, fitness) tuples.  The
        current motif becomes the new ``original_motif`` and is restored
        after each position has been explored.
        """
        print(self.motif)
        self.original_motif = self.motif[:]
        fit = self.fitness()
        mutations = []
        for site_number in range(len(self.motif)):
            for position in range(len(self.motif[0])):
                original_base = self.motif[site_number][position]
                for base in "ACTG":
                    if original_base == base:
                        continue #don't recompute original fitness
                    self.mutate_site(site_number,position,base)
                    fit_prime = self.fitness()
                    report_string = "Improvement" if fit_prime > fit else ""
                    diff = (fit_prime - fit)/fit
                    mutations.append((site_number,position,base,fit_prime))
                    # Reuse fit_prime rather than calling fitness() again:
                    # the second call repeated the whole computation and
                    # printed its own debug line in the middle of this one.
                    print("%s %s %s %s %s %s" % (site_number,position,base,
                                                 fit_prime,diff,report_string))
                self.reset_motif()
        return mutations

    def grad_descent(self):
        """Repeatedly apply the single-base mutation with the highest fitness.

        NOTE(review): the loop continues while any candidate mutation has a
        fitness greater than zero, not while it improves on the current
        fitness -- presumably an improvement test was intended; confirm.
        """
        iteration = 0
        mutations = None
        while (mutations is None or
               any(candidate[3] > 0 for candidate in mutations)):
            mutations = self.explore_site_mutations()
            site_idx, pos, base, fit = max(mutations,
                                           key=lambda tup:tup[3])#max by fitness
            self.mutate_site(site_idx,pos,base)
            print("iteration: %s %s %s %s %s" % (iteration,site_idx,pos,base,fit))
            iteration += 1

    def serialize(self,population_size):
        """Print self out per ESTReMo's population serialization format"""
        print(population_size)
        for i in range(population_size):
            print(i)
            print(len(self.motif))
            for site in self.motif:
                # TODO we should probably define the organism to
                # include the ncr.  For now, randomize the rest of the
                # ncr
                print(site + random_site(self.ncr_size - len(site)))
            print(1 if type(self.recognizer) is SLP else 2)
            # TODO needs to be generalized to MLPs!
            weights = concat(map(list,transpose(self.recognizer.input_layer)))
            print(len(weights))
            for weight in weights:
                print(weight)
Example #6
0
def buildParagraphs(tagSplit):
    """Build a list of Paragraph objects from a sequence of split tags.

    The work is done in three passes:
      1. group the tags of ``tagSplit`` into Paragraph objects,
      2. read paragraph properties (style, list indent/id, images) and cut
         the property span out of each paragraph's element list,
      3. replace each paragraph's raw tags by Element objects carrying the
         run formatting and text.

    Returns the list of paragraphs (empty when no paragraph end tag is seen).
    """
    def quoted_value(tag):
        # Text between the first and the last double quote in the tag.
        return tag[tag.index('"') + 1:tag.rfind('"')]

    paragraphList = []
    collecting = False

    # Pass 1: a tag containing both "<w:p" and "w:rsidRDefault" starts a
    # paragraph, "</w:p>" stores the current one, anything else is added
    # to the current paragraph once the first one has been started.
    for chunk in tagSplit:
        if "<w:p" in chunk and "w:rsidRDefault" in chunk:
            currentParagraph = Paragraph()
            currentParagraph.setType("Paragraph")
            collecting = True
        elif "</w:p>" in chunk:
            paragraphList.append(currentParagraph)
        elif collecting:
            currentParagraph.addElement(chunk)

    # Pass 2: paragraph-level properties.  The checks are independent `if`s
    # evaluated in this order for every tag.
    imageCounter = 0
    for para in paragraphList:
        insideProps = False
        propsStart = 0

        for position, tag in enumerate(para.getElements()):
            if "<w:pPr" in tag:
                insideProps = True
                propsStart = position
            if "</w:pPr" in tag:
                insideProps = False
                # Drop the whole property span, closing tag included.
                trimmed = para.getElements()
                del trimmed[propsStart:position + 1]
                para.clearElements()
                para.setElements(trimmed)
            if insideProps and "<w:pStyle w:val" in tag:
                para.setType(quoted_value(tag))
            if insideProps and "<w:ilvl w:val" in tag:
                para.setListIndent(quoted_value(tag))
            if insideProps and "<w:numId w:val" in tag:
                para.setListId(quoted_value(tag))
            if "<pic:nvPicPr" in tag:
                para.setImageIndex(imageCounter)
                para.setType("Image")
                imageCounter += 1

    # Pass 3: turn the remaining tags into Element objects.  The first
    # matching branch wins, so the order of the tests matters.
    for para in paragraphList:
        runs = []
        textBuffer = ""
        readingText = False
        for tag in para.getElements():
            if "<w:r" in tag:
                currentElement = Element()
            elif "<w:b/>" in tag:
                currentElement.bold = True
            elif "<w:strike/>" in tag:
                currentElement.setStrikethrough(True)
            elif '<w:vertAlign w:val="subscript"/>' in tag:
                currentElement.setSubscript(True)
            elif '<w:vertAlign w:val="superscript"/>' in tag:
                currentElement.setSuperscript(True)
            elif "<w:i/>" in tag:
                currentElement.italics = True
            elif "<w:u" in tag:
                currentElement.underline = True
            elif "<w:t>" in tag or '<w:t xml:space=' in tag:
                readingText = True
            elif "</w:r>" in tag:
                runs.append(currentElement)
            elif "</w:t>" in tag and readingText:
                readingText = False
                currentElement.setText(textBuffer)
                textBuffer = ""
            elif readingText:
                textBuffer += tag

        para.clearElements()
        para.setElements(runs)

    return paragraphList
Example #7
0
 def CalculateFeatures(self, filename, paragraphpos, headername, text,
                       nofpars):
     featurerow = filename + "," + paragraphpos + ',' + headername
     pg = Paragraph(text, paragraphpos, headername, filename, nofpars,
                    self.readmedesc, self.infile)
     self.ParagraphList.append(pg)