def paragraph(self, **kwds):
    """Build a new Paragraph from *kwds*, register it in self.contents,
    and hand it back to the caller."""
    from Paragraph import Paragraph
    created = Paragraph(**kwds)
    self.contents.append(created)
    return created
def paragraph(self, paragraph=None):
    """Factory/registrar for Paragraph objects.

    With no argument: create and return a fresh Paragraph WITHOUT
    registering it.  With an argument: append it to self._entries and
    return None.

    NOTE(review): the two branches behave very differently — a freshly
    created Paragraph is never appended to self._entries.  Presumably the
    caller is expected to fill it in and pass it back; confirm against
    call sites.
    """
    if paragraph is None:
        # Imported lazily, matching the sibling factory method's style.
        from Paragraph import Paragraph
        paragraph = Paragraph()
        return paragraph
    self._entries.append(paragraph)
    return
def __init__(self, paragraph_text, background_string, overshooting=True,
             background_scaling_factor=100, ncr_size=32):
    """Initialize an organism from a paragraph and a background.

    paragraph_text            -- raw text of a paragraph file, split on
                                 newlines and parsed by Paragraph
    background_string         -- background sequence string
    overshooting              -- whether expression above the optimum is
                                 penalized (consumed by the fitness code)
    background_scaling_factor -- samples * scaling factor = effective background
    ncr_size                  -- length of the non-coding region
    """
    paragraph_lines = paragraph_text.split("\n")
    self.paragraph = Paragraph(paragraph_lines)
    self.targets = [seq.tProb for seq in self.paragraph.seq_data]
    self.motif = self.paragraph.motif()
    self.original_motif = self.motif[:]  # copy for backup against mutations
    self.recognizer = self.paragraph.recognizer
    self.background = background_string
    self.memo_background_Z = None  # lazily-computed cache
    self.overshooting = overshooting
    self.background_scaling_factor = background_scaling_factor
    # BUG FIX: was hard-coded `self.ncr_size = 32`, silently ignoring the
    # ncr_size parameter; default unchanged, so callers are unaffected.
    self.ncr_size = ncr_size
def compile(input_names, output_name, folder='Biology'):
    """Merge several note files into one deduplicated output file.

    For each input file: strips '##' title lines, replaces '$<link> <arg>'
    directives with text scraped from the link, splits on blank lines into
    paragraphs, and wraps each in a Paragraph.  Paragraphs whose keywords
    match (Paragraph.eq) are grouped; within a group, texts with
    similarity >= 0.75 are dropped as near-duplicates.  Survivors are
    written to *output_name* under a '# <folder>' header.

    NOTE(review): the name shadows the builtin `compile`; kept unchanged
    so existing callers keep working.
    """
    print(input_names)
    para = []
    for file_name in input_names:
        # `with` guarantees the handle is closed even if parsing raises
        # (original used open()/close() and shadowed the builtin `file`).
        with open(file_name, 'r') as fh:
            txt = fh.read()
        # delete the titles ('##' up to end of line)
        b = txt.find('##')
        while b != -1:
            e = txt.find('\n', b)
            txt = txt[:b] + txt[e:]
            b = txt.find('##')
        # replace '$<link> <arg>' directives by the scraped website text
        b = txt.find('$')
        while b != -1:
            e1 = txt.find('\n', b)
            e2 = txt.find(' ', b)
            link = txt[b + 1:e2]
            arg = txt[e2 + 1:e1]
            website = scrap.get_from_url(link, int(arg))
            txt = txt[:b] + website + txt[e1:]
            b = txt.find('$')
        # extract paragraphs (blank-line separated), flattening whitespace
        for i, p in enumerate(txt.split('\n\n')):
            p = p.replace('\n', '').replace('\t', '')
            para.append(Paragraph(file_name, i, keywords(p), p))
    # group paragraphs with equal keywords (Paragraph.eq)
    newPara = []
    while para:
        first = para[0]
        sameKeyWrdPara = [
            Paragraph(first.file, first.nb, first.keywords, first.text)
        ]
        i = 1
        while i < len(para):
            p = para[i]
            if first.eq(p):
                sameKeyWrdPara.append(
                    Paragraph(p.file, p.nb, p.keywords, p.text))
                # delete by index: same element the original .remove() hit
                del para[i]
            else:
                i += 1
        del para[0]
        newPara.append(sameKeyWrdPara)
    # within each group, drop near-duplicate texts
    for paras in newPara:
        j = 0
        while j < len(paras):
            i = j + 1
            while i < len(paras):
                if similarity(paras[j].text, paras[i].text) >= 0.75:
                    del paras[i]
                else:
                    i += 1
            j += 1
    with open(output_name, 'w') as output:
        output.write('# ' + folder + '\n')
        for paras in newPara:
            if len(paras) > 1:
                output.write('\n## ' + " ".join(paras[0].common(paras[1])) + '\n')
            elif len(paras[0].keywords) > 0:
                output.write('\n## ' + paras[0].keywords[0] + '\n')
            for p in paras:
                if len(p.text) > 5:
                    output.write("\t" + p.text + "\n")
class Organism(object): """Emulate an ESTReMo-style organism""" def __init__(self,paragraph_text,background_string,overshooting=True, background_scaling_factor=100, ncr_size=32): """Initialize an organism from a paragraph and a background""" paragraph_lines = paragraph_text.split("\n") self.paragraph = Paragraph(paragraph_lines) self.targets = [seq.tProb for seq in self.paragraph.seq_data] self.motif = self.paragraph.motif() self.original_motif = self.motif[:] #copy for backup against mutations self.recognizer = self.paragraph.recognizer self.background = background_string self.memo_background_Z = None self.overshooting = overshooting self.background_scaling_factor = background_scaling_factor self.ncr_size = 32 # samples * scaling factor = effective background def fitness(self): site_energies = [self.recognizer.binding_energy(site) for site in self.motif] w = len(self.motif[0]) n = len(self.background) background_energies = [self.recognizer.binding_energy(self.background[i:i+w]) for i in range(n-w+1)] foreground_Z = sum([exp(-beta * energy) for energy in site_energies]) Z = foreground_Z + self.background_Z() site_occupancies = [exp(-beta*energy)/Z for energy in site_energies] print "site occupancies:",site_occupancies return sum(self.site_fitness(occ,target) for (occ,target) in zip(site_occupancies,self.targets)) def background_Z(self): if not self.memo_background_Z is None: return self.memo_background_Z else: w = len(self.motif[0]) n = len(self.background) samples = n / self.background_scaling_factor print "samples:",samples be = self.recognizer.binding_energy bkgd_energies = [be(random_substring(self.background,w)) for i in range(samples)] self.memo_background_Z = sum([exp(-beta * energy) for energy in bkgd_energies]) * self.background_scaling_factor return self.memo_background_Z def site_fitness(self,occupancy,target): # print "occupancy:",occupancy # print "target:",target delta = 0.17 eta = 0.02 M = 1.8 Ky = 0.4 #Params come from config file Z = occupancy * M 
L = Ky / ((delta / eta) * (1 - target)**2 - 1) g = delta * ((Z * L) / (Ky + L)) - eta * (Z / (1 - (Z/M))) if not self.overshooting: #i.e. if overshooting is not penalized... # compute optimum expression level Zopt = M * (1 - sqrt((eta / delta) * ((L + Ky) / L))) # and fitness corresponding to optimum expression gopt = delta * ((Zopt * L) / (Ky + L)) - eta * (Zopt / (1 - (Zopt/M))) if Z > Zopt: return gopt #print g return g def mutate_site(self,site_number,position,base): site = self.motif[site_number] self.motif[site_number] = string_replace(site,position,base) def reset_motif(self): self.motif = self.original_motif[:] def explore_site_mutations(self): print self.motif self.original_motif = self.motif[:] fit = self.fitness() mutations = [] for site_number in range(len(self.motif)): for position in range(len(self.motif[0])): original_base = self.motif[site_number][position] for base in "ACTG": if original_base == base: continue #don't recompute original fitness' self.mutate_site(site_number,position,base) fit_prime = self.fitness() report_string = "Improvement" if fit_prime > fit else "" diff = (fit_prime - fit)/fit mutations.append((site_number,position,base,fit_prime)) print site_number,position,base,self.fitness(),diff,report_string self.reset_motif() return mutations def grad_descent(self): iteration = 0 mutations = None motif_dustbin = [] while (mutations is None or any(fit > 0 for (site,pos,base,fit) in mutations)): mutations = self.explore_site_mutations() site_idx, pos, base, fit = max(mutations, key=lambda tup:tup[3])#max by fitness self.motif[site_idx] = string_replace(self.motif[site_idx],pos,base) print "iteration:",iteration,site_idx,pos,base,fit motif_dustbin.append(self.motif[:]) iteration += 1 def serialize(self,population_size): """Print self out per ESTReMo's population serialization format""" print population_size for i in range(population_size): print i print len(self.motif) for site in self.motif: # TODO we should probably define the organism to 
# include the ncr. For now, randomize the rest of the # ncr print site + random_site(self.ncr_size - len(site)) print 1 if type(self.recognizer) is SLP else 2 # TODO needs to be generalized to MLPs! weights = concat(map(list,transpose(self.recognizer.input_layer))) print len(weights) for weight in weights: print weight
def buildParagraphs(tagSplit):
    """Assemble Paragraph objects from a list of docx XML tag strings.

    Pass 1: collect tags between <w:p ... w:rsidRDefault> and </w:p> into
    Paragraph objects.  Pass 2: strip <w:pPr> formatting runs, recording
    style/list/image metadata on each Paragraph.  Pass 3: convert run tags
    (<w:r>...</w:r>) into Element objects carrying text and bold/italic/
    underline/strike/sub/superscript flags, and replace each Paragraph's
    raw tag list with its Element list.  Returns the Paragraph list.

    NOTE(review): paraEnabled is never reset on </w:p>, so tags appearing
    between paragraphs are appended to the last-opened paragraph — confirm
    the input never has such tags.
    """
    paragraphList = []
    paraEnabled = False
    picIndex = 0  # running index into the document's extracted images
    for tags in tagSplit:
        if "<w:p" in tags and "w:rsidRDefault" in tags:
            # start of a new paragraph
            currentParagraph = Paragraph()
            currentParagraph.setType("Paragraph")
            paraEnabled = True
        elif "</w:p>" in tags:
            paragraphList.append(currentParagraph)
        elif paraEnabled:
            currentParagraph.addElement(tags)
    for formats in paragraphList:
        inFormat = False
        formatStartIndex = 0
        i = 0  # manual index alongside the element iteration
        for elements in formats.getElements():
            if "<w:pPr" in elements:
                inFormat = True
                formatStartIndex = i
            if "</w:pPr" in elements:
                inFormat = False
                # NOTE(review): this deletes from (and replaces) the very
                # list being iterated, and `i` is not adjusted after the
                # slice-delete — the code depends on this exact order of
                # operations; verify getElements() returns the live list.
                listHolder = formats.getElements()
                del listHolder[formatStartIndex:i + 1]
                formats.clearElements()
                formats.setElements(listHolder)
            if "<w:pStyle w:val" in elements and inFormat:
                # value between the first and last double quote
                formats.setType(elements[elements.index('"') + 1:elements.rfind('"')])
            if "<w:ilvl w:val" in elements and inFormat:
                formats.setListIndent(elements[elements.index('"') + 1:elements.rfind('"')])
            if "<w:numId w:val" in elements and inFormat:
                formats.setListId(elements[elements.index('"') + 1:elements.rfind('"')])
            if "<pic:nvPicPr" in elements:
                formats.setImageIndex(picIndex)
                formats.setType("Image")
                picIndex += 1
            i += 1
    for formats in paragraphList:
        currentElementList = []
        currentText = ""
        inText = False
        for elements in formats.getElements():
            if "<w:r" in elements:
                # start of a run: new Element accumulates its formatting
                currentElement = Element()
            elif "<w:b/>" in elements:
                currentElement.bold = True
            elif "<w:strike/>" in elements:
                currentElement.setStrikethrough(True)
            elif '<w:vertAlign w:val="subscript"/>' in elements:
                currentElement.setSubscript(True)
            elif '<w:vertAlign w:val="superscript"/>' in elements:
                currentElement.setSuperscript(True)
            elif "<w:i/>" in elements:
                currentElement.italics = True
            elif "<w:u" in elements:
                currentElement.underline = True
            elif "<w:t>" in elements or '<w:t xml:space=' in elements:
                inText = True
            elif "</w:r>" in elements:
                # NOTE(review): `inElement` is assigned but never read —
                # apparently a leftover from an earlier version.
                inElement = False
                currentElementList.append(currentElement)
            elif "</w:t>" in elements and inText:
                inText = False
                currentElement.setText(currentText)
                currentText = ""
            elif inText:
                currentText = currentText + elements
        # replace the raw tag list with the finished Element list
        formats.clearElements()
        formats.setElements(currentElementList)
    return paragraphList
def CalculateFeatures(self, filename, paragraphpos, headername, text, nofpars):
    """Build a Paragraph for *text* and append it to self.ParagraphList.

    NOTE(review): `featurerow` is assembled here but never used or
    returned — presumably a CSV feature-row prefix that a later (or
    removed) step consumed; confirm before deleting.
    """
    featurerow = filename + "," + paragraphpos + ',' + headername
    # Paragraph also receives the instance-level readme description and
    # input file handle/path captured at construction time.
    pg = Paragraph(text, paragraphpos, headername, filename, nofpars,
                   self.readmedesc, self.infile)
    self.ParagraphList.append(pg)