def __init__(self, parent, id, title):
    """Create the frame and show a GenomeView over hard-coded sample data.

    Args:
        parent: Forwarded to ``wx.Frame.__init__``.
        id: Forwarded to ``wx.Frame.__init__`` (the name shadows the
            builtin but is kept because it is part of the signature).
        title: Forwarded to ``wx.Frame.__init__``.
    """
    wx.Frame.__init__(self, parent, id, title, size=(800, 400))

    # The genome itself carries an empty sequence; only features are sampled.
    genome = Genome()
    genome.setSequence("")

    # (list name, [(Feature constructor arguments), ...])
    sample_lists = (
        ("test", (("protein1", "protein asdgqiuw", 0, 49),
                  ("protein2", "asdgqidu", 270, 300))),
        ("gff", (("protein1", "protein asdgqiuw", 60, 70),
                 ("protein1", "protein asdgqiuw", 78, 80))),
    )
    feature_lists = []
    for list_name, feature_specs in sample_lists:
        feature_list = FeatureList(list_name)
        features = [Feature(*spec) for spec in feature_specs]
        for feature in features:
            feature_list.addFeature(feature)
        feature_lists.append(feature_list)

    container = FeatureListContainer()
    for feature_list in feature_lists:
        container.addFeatureList(feature_list)

    # Wire model and view together, then display the frame.
    self.model = GenomeModel()
    self.model.setGenome(genome)
    self.model.setFeatureListContainer(container)
    self.view = GenomeView(self.model, self)
    self.Show()
def initial_generation(self):
    """Return a Generation of ``self.population`` randomly seeded genomes.

    Each genome's ``solution`` comes from
    ``random_solution(self.genome_length)``; the Generation is also given
    ``self.answer``.
    """
    def seeded_genome():
        genome = Genome()
        genome.solution = random_solution(self.genome_length)
        return genome

    founders = [seeded_genome() for _ in range(self.population)]
    return Generation(founders, self.answer)
def from_gen_file(file_name, old=False):
    """
    imports a (unaligned) gen_file and returns a GenomeCompare object

    Each CSV row is read as ``name, mutation_rate, locus, locus, ...``.

    @params
    file_name: the name of the file containing the genomes
    old / boolean / False
        if this file was created before June 23rd 2015, it is likely to be
        in the float format. use True in that case only.

    @returns
    GenomeCompare built from a dict mapping ``int(name)`` to the Genome
    produced by ``Genome.from_mutated_loci``.
    """
    import csv

    # Loci are parsed as floats for the old format, as ints otherwise.
    parse_locus = float if old else int

    genomes = {}
    with open(file_name, 'r') as f:
        for row in csv.reader(f):
            if not row:
                # csv.reader yields [] for blank lines; indexing row[0]
                # on those would raise IndexError.
                continue
            name = int(row[0])
            genomes[name] = Genome.from_mutated_loci(
                # A real list, so the callee gets a sized, re-iterable
                # sequence on Python 3 as well (map() is lazy there).
                [parse_locus(value) for value in row[2:]],
                mutation_rate=int(row[1]),
                name=name)
    return GenomeCompare(genomes=genomes)
class Player:
    """Base organism: a Genome "brain" plus fitness bookkeeping.

    ``update``, ``look``, ``think`` and ``calculateFitness`` are empty hooks
    meant to be filled in for a concrete problem.
    """

    def __init__(self):
        assert INPUTS != 0 and OUTPUTS != 0, "You must call the initialize method before creating players!"
        self.fitness = -1
        self.unadjustedFitness = -1
        self.brain = Genome(INPUTS, OUTPUTS, False)
        self.vision = []
        self.actions = []
        self.lifespan = 0
        self.dead = False
        self.replay = False
        self.gen = 0
        self.name = ""
        self.speciesName = "Not yet defined"

    def update(self):
        # Does something that will eventually end up in death or victory
        # for the organism
        pass

    def look(self):
        # Looks at the input - This is where you should populate your vision
        # array
        pass

    def think(self):
        # Makes actions based off of input
        # Fill in based off of what you want to happen
        pass

    def clone(self):
        """Return a non-replay copy carrying fitness, gen and a cloned brain."""
        out = Player()
        out.replay = False
        out.fitness = self.fitness
        out.gen = self.gen
        out.brain = self.brain.clone()
        return out

    def cloneForReplay(self):
        """Return a replay copy carrying fitness, species name and a cloned brain."""
        out = Player()
        out.replay = True
        out.fitness = self.fitness
        out.brain = self.brain.clone()
        out.speciesName = self.speciesName
        # The clone was previously built and then dropped (implicit None).
        return out

    def calculateFitness(self):
        # To return the calculated fitness at any given point in time
        pass

    def getFitness(self):
        """Return the stored fitness for replays, else a fresh calculation."""
        if not self.replay:
            return self.calculateFitness()
        return self.fitness

    def crossover(self, parent2):
        """Return a child whose brain is this brain crossed with ``parent2``'s."""
        child = Player()
        child.brain = self.brain.crossover(parent2.brain)
        child.brain.generateNetwork()
        return child
def regexsearch(self, regex):
    """search in a string with a re

    Prints the sequence of a freshly constructed Genome, then prints a
    one-element list holding every case-insensitive match of ``regex`` in
    that sequence. Nothing is returned.
    """
    genome = Genome()
    # Constructed but not used afterwards; kept so construction still happens.
    container = FeatureListContainer()

    # Single-argument print(...) behaves the same on Python 2 and 3.
    print(genome.getSequence())
    matches = [re.findall(regex, genome.getSequence(), re.I)]
    print(matches)
def tf_idf_training(comment_cnt_lower_bound, train_ratio):
    """Run ``Feature.cal_tf_idf`` on the training set, then ``Genome.cal_tf_idf``.

    Both parameters are currently unused: the partition/IDF step that
    consumed them is disabled and the full ``Train.get_train_set()`` result
    is used instead.
    """
    # Disabled alternative that used the parameters:
    # train_set, cv_set = Train.simple_partition(comment_cnt_lower_bound,
    #                                            train_ratio)
    # idf_dict = Feature.cal_idf(train_set, Config.train_idf_path)

    # Same numeric argument is handed to both steps; its meaning is defined
    # by the callees (TODO confirm).
    limit = 200

    train_set = Train.get_train_set()
    Feature.cal_tf_idf(
        train_set,
        Config.train_idf_path,
        Config.train_tf_idf_path,
        True,
        limit,
    )
    Genome.cal_tf_idf(
        Config.train_tf_idf_path,
        Config.train_genome_tf_idf_path,
        limit,
    )
def initializePool(env):
    """Create a Pool for ``env`` seeded with ``Population`` basic genomes.

    Every genome is built against the pool, turned into a basic genome and
    added to a species; the pool's run is initialised before it is returned.
    """
    pool = Pool(env)
    for _ in range(Population):
        founder = Genome(pool)
        founder.basicGenome()
        pool.addToSpecies(founder)

    pool.initializeRun()
    return pool
def get_child(self, parent1, parent2):
    """Build a child genome from two parents and return it.

    The parent with the strictly higher fitness drives the crossover
    (``genome1``); on a tie ``parent2`` takes that role. Every connection
    gene of ``genome1`` contributes exactly one connection to the child.
    The nodes referenced by each added connection are copied from
    ``genome1`` when the child does not have them yet.

    Consumes one genome id (``self.genome_id`` is incremented).

    NOTE(review): block nesting was reconstructed from a
    whitespace-collapsed source; confirm against the original layout.
    """
    new_genome = Genome(weight_mutation=self.weight_mutation,
                        input_nodes=self.inputs,
                        output_nodes=self.outputs,
                        genome_id=self.genome_id)
    self.genome_id += 1
    fitness1 = parent1.get_fitness()
    fitness2 = parent2.get_fitness()
    # genome1 is the fitter parent (parent2 on ties).
    if(fitness1 > fitness2):
        genome1 = parent1
        genome2 = parent2
    else:
        genome1 = parent2
        genome2 = parent1
    for conn1 in genome1.get_connection_genes():
        copy_con1 = copy.deepcopy(conn1)
        # Stays True when genome2 has no connection with this innovation
        # number (the flag is used for both excess and unmatched genes).
        excessGene = True
        disjointGene = False
        # Placeholder; only appended after being replaced by a real copy.
        newConn = 0
        for conn2 in genome2.get_connection_genes():
            copy_con2 = copy.deepcopy(conn2)
            # Both have connection with this innovation number
            if(conn1.get_innovation_number() == conn2.get_innovation_number()):
                excessGene = False
                # The expressed parameter is not matching
                if(conn1.expressed != conn2.expressed):
                    disjointGene = True
                # Get the deltaWeight, because everything seems to be in order
                else:
                    # Matching gene: pick either parent's copy at random.
                    newConn = copy_con1 if(randint(0,1) == 1) else copy_con2
        # Unmatched genes and expressed-mismatches inherit genome1's copy.
        if(excessGene == True):
            new_genome.connection_genes.append(copy_con1)
        elif(disjointGene == True):
            new_genome.connection_genes.append(copy_con1)
        else:
            new_genome.connection_genes.append(newConn)
        #Add node
        #nodeIn, nodeOut = copy.deepcopy(new_genome.connection_genes[-1].get_connected_nodes())
        nodeInID, nodeOutID = copy.copy(new_genome.connection_genes[-1].get_connected_nodes_id())
        #Add node if there wasnt any node like this before
        # (get_node_by_id is compared against False to detect absence)
        if(new_genome.get_node_by_id(nodeInID) == False):
            node = copy.deepcopy(genome1.get_node_by_id(nodeInID))
            new_genome.node_genes.append(node)
        if(new_genome.get_node_by_id(nodeOutID) == False):
            node = copy.deepcopy(genome1.get_node_by_id(nodeOutID))
            new_genome.node_genes.append(node)
    # Get the higher global node id
    copy_gnID1 = copy.copy(parent1.global_node_id)
    copy_gnID2 = copy.copy(parent2.global_node_id)
    new_genome.global_node_id = copy_gnID1 if(copy_gnID1>copy_gnID2) else copy_gnID2
    return new_genome
def tournament(self, prev_generation):
    """Return one new Genome bred from two picks out of ``prev_generation``.

    The two parents come from ``pick_parent``; their ``crossover`` result is
    passed through ``mutate`` with ``self.mutation_rate`` and stored as the
    child's ``solution``.
    """
    offspring = Genome()

    # pick two parents
    mother = pick_parent(prev_generation)
    father = pick_parent(prev_generation)

    offspring.solution = crossover(mother, father)
    mutated_solution = mutate(offspring.solution, self.mutation_rate)
    offspring.solution = mutated_solution
    return offspring
def values(self, genome: Genome) -> List[float]:
    """Evaluate the two ZDT1 objectives for ``genome``.

    With ``n = genome.nvariables`` and ``x_i = genome.variable(i)``::

        f1 = x_0
        g  = 1 + 9 / (n - 1) * sum(x_1 .. x_{n-1})
        f2 = g * (1 - sqrt(f1 / g))

    The ``g`` term deliberately excludes ``x_0``: ZDT1 sums only the tail
    variables (the previous code summed every variable). For ``n == 1``
    the tail is empty and ``g`` is 1 instead of dividing by zero.

    Returns:
        A list of ``genome.nobjectives`` floats; index 0 is f1, index 1 is
        f2, and any further entries stay 0.0.
    """
    objectives: List[float] = [.0 for _ in range(genome.nobjectives)]
    nvariables = genome.nvariables

    objectives[0] = genome.variable(0)

    tail_sum = sum(genome.variable(i) for i in range(1, nvariables))
    if nvariables > 1:
        g = 1 + (9.0 / (nvariables - 1)) * tail_sum
    else:
        g = 1.0

    objectives[1] = g * (1 - sqrt(objectives[0] / g))
    return objectives
def evolve_generation_speciated(self):
    """Produce the next generation using species-based fitness sharing.

    Steps:
      1. Score every genome of ``self.previous_generation`` and place it in
         the first species whose representative is within
         ``COMPATIBILITY_THRESHOLD``, creating a new species otherwise.
      2. Divide each genome's fitness by its species' ``count_genomes``.
      3. Keep only the top ``SELECTION_RATIO * POPULATION_SIZE`` genome ids
         as parents and drop the rest from their species.
      4. Give each species offspring in proportion to its total fitness.
      5. Top the population up to ``POPULATION_SIZE`` with single-parent
         offspring of random parents.

    Fixes relative to the previous version:
      * culling no longer removes from ``species.genomes`` while iterating
        it (which skipped entries);
      * species left empty by culling are actually dropped (the filtered
        list used to be assigned to an unused name);
      * the mate filter compares genome ids with ids (it compared ids with
        a Genome object, so it never excluded anything); a species with a
        single member falls back to single-parent offspring;
      * ``self.max_id`` advances for every child, so species offspring no
        longer share one id;
      * the top-up loop stops at ``POPULATION_SIZE`` (was one too many);
      * a non-positive total fitness no longer raises NameError.

    NOTE(review): assumes ``species.genomes`` is a list of genome ids and
    that the id passed to ``Genome.generate_offspring`` becomes
    ``child.id`` - confirm against Species/Genome.
    """
    config = self.config

    # Clean out our previous species array
    species_list = []
    self.current_generation = {}

    # Find the fitness of all individuals of the previous generation,
    # And also assign a species to all of them
    for genome in self.previous_generation.values():
        genome.fitness = self.fitness_function(genome)
        for species in species_list:
            cd = Genome.compatibility_distance(
                self.previous_generation[species.representor], genome,
                config.PARAM_C1, config.PARAM_C2, config.PARAM_C3)
            if cd <= config.COMPATIBILITY_THRESHOLD:
                species.add_genome(genome.id, genome.fitness)
                break
        else:
            # No compatible species found: this genome founds a new one.
            species_list.append(Species(genome.id, genome.fitness))
    species_list.sort(key=lambda species: species.total_fitness, reverse=True)

    # Recalculate the adjusted fitness based on species
    for species in species_list:
        for genome_id in species.genomes:
            self.previous_generation[genome_id].fitness /= species.count_genomes

    # Kill off low performing individuals
    fitness_sorted_genome_ids = [
        genome_id for genome_id, _ in sorted(
            self.previous_generation.items(),
            key=lambda item: item[1].fitness, reverse=True)]
    parents = fitness_sorted_genome_ids[
        :int(config.SELECTION_RATIO * config.POPULATION_SIZE)]
    surviving_ids = set(parents)
    for species in species_list:
        # Rebuild in place instead of remove()-ing during iteration.
        species.genomes[:] = [genome_id for genome_id in species.genomes
                              if genome_id in surviving_ids]
    species_list = [species for species in species_list if species.genomes]

    # Allocate child count per-species and create children
    total_fitness = sum(species.total_fitness for species in species_list)
    per_species_allocation = []
    if total_fitness > 0.0:
        allocation_ratio = config.POPULATION_SIZE / total_fitness
        per_species_allocation = [
            (species, int(species.total_fitness * allocation_ratio))
            for species in species_list]

    # Create offspring to fill up remaining space by random mating and
    # mutations based on species size
    for species, child_count in per_species_allocation:
        member_ids = species.genomes
        for _ in range(child_count):
            parent1_id = self.random.choice(member_ids)
            parent1 = self.previous_generation[parent1_id]
            mate_ids = [genome_id for genome_id in member_ids
                        if genome_id != parent1_id]
            if mate_ids:
                parent2 = self.previous_generation[self.random.choice(mate_ids)]
                child = Genome.generate_offspring(
                    parent1, self.max_id, self.random, self.node_classes,
                    self.innovator, config, parent2)
            else:
                # Lone survivor of its species: no distinct mate exists.
                child = Genome.generate_offspring(
                    parent1, self.max_id, self.random, self.node_classes,
                    self.innovator, config)
            self.current_generation[child.id] = child
            self.max_id += 1

    present_population_size = len(self.current_generation)
    while present_population_size < config.POPULATION_SIZE:
        parent = self.previous_generation[self.random.choice(parents)]
        child = Genome.generate_offspring(
            parent, self.max_id, self.random, self.node_classes,
            self.innovator, config)
        self.current_generation[child.id] = child
        present_population_size += 1
        self.max_id += 1

    # Update all lists and perform logging
    self.previous_generation = self.current_generation
def breedChild(self):
    """Return a mutated child bred from ``self.genomes``.

    With probability ``CrossoverChance`` the child is the crossover of two
    randomly chosen genomes (the same genome may be chosen twice);
    otherwise it is a clone of one randomly chosen genome. The child is
    mutated before being returned.

    Parents are drawn with ``random.choice``. The previous
    ``random.randint(1, len(self.genomes))`` used 1-based inclusive bounds,
    so it could index one past the end and never selected the first genome.
    The throwaway ``Genome()`` that was created and immediately overwritten
    is gone as well.

    Raises:
        IndexError: if ``self.genomes`` is empty.
    """
    if random.random() < CrossoverChance:
        g1 = random.choice(self.genomes)
        g2 = random.choice(self.genomes)
        child = g1.crossover(g2)
    else:
        child = random.choice(self.genomes).clone()

    child.mutate()
    return child
def __init__(self):
    """Initialise a player with a fresh Genome brain and default state.

    Raises:
        AssertionError: if ``INPUTS`` or ``OUTPUTS`` is still 0.
    """
    assert not (INPUTS == 0 or OUTPUTS == 0), "You must call the initialize method before creating players!"

    # Fitness bookkeeping; -1 is the initial value for both.
    self.fitness = self.unadjustedFitness = -1

    # The network driving this player.
    self.brain = Genome(INPUTS, OUTPUTS, False)

    # Per-step input and output buffers.
    self.vision = []
    self.actions = []

    # Lifecycle flags and counters.
    self.lifespan = 0
    self.dead = False
    self.replay = False
    self.gen = 0

    # Labels.
    self.name = ""
    self.speciesName = "Not yet defined"
def get_clone(self, genome):
    """Return a new Genome carrying copies of ``genome``'s genes.

    The clone gets the next id from ``self.genome_id`` (which is then
    incremented), deep copies of the connection and node genes, and a copy
    of ``global_node_id``. It is not mutated here.
    """
    clone = Genome(
        weight_mutation=self.weight_mutation,
        input_nodes=self.inputs,
        output_nodes=self.outputs,
        genome_id=self.genome_id,
    )
    self.genome_id += 1

    # Deep-copy the gene lists so clone and source never share gene objects.
    for gene_attr in ("connection_genes", "node_genes"):
        setattr(clone, gene_attr, copy.deepcopy(getattr(genome, gene_attr)))
    clone.global_node_id = copy.copy(genome.global_node_id)

    # No mutation here: the caller mutates the whole population together.
    return clone
def initialize_population(self, population=None, size=None):
    """Adopt an existing population or build a fully connected one.

    Args:
        population: Mapping of genome id to genome. When given it becomes
            ``self.previous_generation`` and ``self.max_id`` is set to its
            length; ``size`` is ignored.
        size: Indexable pair ``(input_size, output_size)``. When given
            (and ``population`` is None), ``config.POPULATION_SIZE``
            genomes are created, each with ``input_size`` input nodes,
            ``output_size`` output nodes and one enabled connection of
            weight 1.0 from every input to every output. They are stored
            into the existing ``self.previous_generation``.

    Raises:
        ValueError: if neither ``population`` nor ``size`` is given.

    The innovation number is now requested for the same
    ``(in_id, out_id)`` node pair that the ConnectionGene connects.
    Previously the key used the output's offset instead of its node id,
    so it named a different node pair than the gene.
    NOTE(review): assumes the innovator keys on (in_node, out_node) -
    confirm against Innovator.
    """
    if population is not None:
        self.previous_generation = population
        self.max_id = len(population)
        return
    if size is None:
        raise ValueError("Invalid Parameters")

    self.max_id = self.config.POPULATION_SIZE
    pop_size = self.config.POPULATION_SIZE
    input_size = size[0]
    output_size = size[1]

    for genome_id in range(pop_size):
        genome = Genome(genome_id)
        node_id = 0

        # Input nodes take ids 0 .. input_size - 1, outputs follow.
        for node_type, count in ((NodeType.INPUT, input_size),
                                 (NodeType.OUTPUT, output_size)):
            for _ in range(count):
                node_gene = self.random.choice(self.node_classes)(node_id, node_type)
                genome.add_node_gene(node_gene)
                node_id += 1

        # Connect each input to every output
        for in_id in range(input_size):
            for out_offset in range(output_size):
                out_id = out_offset + input_size
                connection = ConnectionGene(
                    in_id, out_id, 1.0, True,
                    self.innovator.next_innovation_number((in_id, out_id)))
                genome.add_connection_gene(connection)

        self.previous_generation[genome_id] = genome
def create_members(self):
    """Create ``self.members`` genomes and put them all in one new niche.

    A Nieche is registered under ``str(self.niecheID)``. Each genome is
    stored under ``str(genome_id)``, given inputs, outputs and initial
    connections, tagged with the niche id and added to the niche. After
    all members exist their innovations are grouped and ``self.niecheID``
    is incremented.
    """
    niche_key = str(self.niecheID)
    self.nieches[niche_key] = Nieche(self.niecheID)

    for _ in range(self.members):
        genome_key = str(self.genome_id)

        # Store first, then configure through the same object.
        member = self.genomes[genome_key] = Genome(
            weight_mutation=self.weight_mutation,
            input_nodes=self.inputs,
            output_nodes=self.outputs,
            genome_id=self.genome_id,
        )

        # Input/output nodes and the connections between them.
        member.create_inputs()
        member.create_outputs()
        member.create_innitial_connections()

        # At init every genome belongs to the same niche.
        member.set_nieche_id(self.niecheID)
        self.nieches[niche_key].add_member(self.genome_id)

        self.genome_id += 1

    # The new genomes all carry their initial input-output innovations,
    # which need to be grouped together.
    self.group_innovations()
    self.niecheID += 1
def run(self, ngeneration: int, populationsize: int, crossoverrate: float,
        mutationrate: float, problem: ZDTOne):
    """Evolve a population against ``problem`` and return it.

    A NonDominatingSortingPopulation is seeded with ``populationsize``
    genomes built as ``Genome(2, 2)``, scored and ranked. Then, for
    ``ngeneration`` generations, ``populationsize`` parent pairs are drawn
    by tournament selection; each pair is either crossed over or re-added
    unchanged, the two most recently added genomes are optionally mutated
    and then re-scored. Every generation ends with a rank and a truncate
    back to ``populationsize``.

    NOTE(review): the code relies on ``self.crossover`` leaving its two
    results at positions -1 and -2 of the population - confirm.
    """
    population = NonDominatingSortingPopulation()
    self._population = population

    # Initial, fully evaluated population.
    for _ in range(populationsize):
        founder = Genome(2, 2)
        founder.calculate_fitnesses(problem)
        population.add_genome(founder)
    population.rank()

    for _ in tqdm(range(ngeneration)):
        for _ in range(populationsize):
            parent1 = self.tournament_selection(populationsize)
            parent2 = self.tournament_selection(populationsize)

            if random() < crossoverrate:
                self.crossover(parent1, parent2)
            else:
                population.add_genome(parent1)
                population.add_genome(parent2)

            if random() < mutationrate:
                self.mutate(population.get_genome(-1))
                self.mutate(population.get_genome(-2))

            # Re-score the two newest members.
            population.get_genome(-1).calculate_fitnesses(problem)
            population.get_genome(-2).calculate_fitnesses(problem)

        population.rank()
        population.truncate(populationsize)

    return self._population
def create_asexual_genome(parent, mutation_tracker, newNodeProb=0.03,
                          newConnectionProb=0.05, alterConnectionProb=0.8,
                          newConnectionValueProb=0.1):
    """Return a mutated single-parent child of ``parent``.

    The parent's node and connection genes are deep-copied. Every
    connection whose ``disable`` flag is falsy is passed through
    ``alter_connection`` with probability ``alterConnectionProb``. The
    child Genome is built with ``parent.generation + 1`` and
    ``[parent.species_id]``, then may receive an add-node mutation
    (``newNodeProb``) and an add-connection mutation
    (``newConnectionProb``).
    """
    # clone the parent
    child_nodes = {key: copy.deepcopy(gene)
                   for key, gene in parent.n_genes.items()}
    child_connections = {key: copy.deepcopy(gene)
                         for key, gene in parent.c_genes.items()}

    # apply mutation to all (enabled) connections
    for key, connection in list(child_connections.items()):
        if connection.disable:
            continue
        if np.random.uniform(0, 1) < alterConnectionProb:
            child_connections[key] = alter_connection(
                connection, newConnectionValueProb)

    child_genome = Genome(
        parent.input_size,
        parent.output_size,
        child_nodes,
        child_connections,
        parent.generation + 1,
        [parent.species_id],
    )

    # structural mutations, each with its own independent draw
    if np.random.uniform(0, 1) < newNodeProb:
        add_node_mutation(child_genome, mutation_tracker)
    if np.random.uniform(0, 1) < newConnectionProb:
        add_connection_mutation(child_genome, mutation_tracker)

    return child_genome
def simple_trial():
    """Cross two pruned complete genomes and plot both parents and the child.

    Two complete 4-node genomes are generated, two random connection genes
    are deleted from each, and an offspring is produced with every mutation
    rate set to 0.0 and ``DISABLED_GENE_INHERITING_CHANCE`` set to 1.0.
    ``g1`` has fitness 10.0 and ``g2`` fitness 0.0.

    The keys are materialised with ``list(...)`` before sampling:
    ``random.Random.sample`` requires a sequence, and passing a dict keys
    view raises TypeError on Python 3.11+.
    """
    i = Innovator()
    r = Random()
    config = DottedDict({
        "MUTATION_RATE": 0.0,
        "CONNECTION_MUTATION_RATE": 0.0,
        "NODE_MUTATION_RATE": 0.0,
        "DISABLED_GENE_INHERITING_CHANCE": 1.0,
    })
    nodes = 4
    to_remove = 2

    g1 = generate_complete_genome(1, nodes, r, i)
    g2 = generate_complete_genome(2, nodes, r, i)
    g1.fitness = 10.0
    g2.fitness = 0.0

    # Prune g1 first, then g2, so the draws happen in the same order.
    for genome in (g1, g2):
        for key in r.sample(list(genome.connection_genes), to_remove):
            del genome.connection_genes[key]

    gc = Genome.generate_offspring(g1, 3, r, [TestNode], i, config, genomeB=g2)

    g1.vizualize_genome(1, "g1")
    g2.vizualize_genome(2, "g2")
    gc.vizualize_genome(3, "gc")
    plt.show()
def create_new_genome(input_size, output_size, fully_connected=False):
    """Return a generation-0 Genome with only input and output nodes.

    Node keys ``0 .. input_size - 1`` are input nodes (type ``'i'``,
    ``input_nodes=None``); the next ``output_size`` keys are output nodes
    (type ``'o'``). With ``fully_connected`` every input is linked to every
    output: a ConnectionGene is stored under the ``(input, output)`` key
    and both nodes record the link. Otherwise there are no connections.
    """
    first_output = input_size
    end_output = input_size + output_size

    nodes_genes = {
        key: NodeGene(input_nodes=None, output_nodes=[], neuron_type='i')
        for key in range(0, first_output)
    }
    nodes_genes.update(
        (key, NodeGene(input_nodes=[], output_nodes=[], neuron_type='o'))
        for key in range(first_output, end_output)
    )

    # Running number given to each ConnectionGene; starts after the node keys.
    counter = end_output
    connection_genes = dict()
    if fully_connected:
        for source in range(0, first_output):
            for target in range(first_output, end_output):
                connection_genes[source, target] = ConnectionGene(
                    counter, Mutation.get_new_weight(), False)
                nodes_genes[source].output_nodes.append(target)
                nodes_genes[target].input_nodes.append(source)
                counter += 1

    return Genome(
        input_size=input_size,
        output_size=output_size,
        nodes_genes=nodes_genes,
        connection_genes=connection_genes,
        generation=0,
        parents_species_id=[],
    )
def genome_halving():
    """Run guided genome halving on the first two parsed genomes and print results.

    The first genome from ``parse_genomes()`` is used as the tetrad and the
    second as the outgroup. After the GroupGraph result is computed, the
    BPG distances tetrad/ancestor_AA and outgroup/ancestor_A are printed,
    followed by the chromosomes of both ancestors.

    Raises:
        Exception: if the configured genome-to-replace is not an ``int`` or
            is outside 0..2.
        StopIteration: if fewer than two genomes were parsed.
    """
    def bpg_distance_to(genome_strings, ancestor):
        # Distance between a freshly parsed genome and a computed ancestor.
        measure: BPGDistance = BPGDistance(
            Genome.from_strings(genome_strings), ancestor)
        measure.calculate_distance()
        return measure.distance

    def print_chromosomes(ancestor):
        for chromosome in ancestor.chromosomes:
            print(str(chromosome))

    # First two genomes of the input file, in file order.
    genomes: Dict[str, List[str]] = parse_genomes()
    genome_iter: Iterator[List[str]] = iter(genomes.values())
    tetrad: List[str] = next(genome_iter)
    outgroup: List[str] = next(genome_iter)

    # GenomeHalving configuration options
    to_replace: int = config_get(CONFIG_GENOME_REPLACE)
    if type(to_replace) is not int:
        raise Exception(
            "Config attribute \"genome_to_replace\" needs to be a number.\n")
    if to_replace not in range(0, 3):
        raise Exception("Genome to replace must be 0, 1, or 2.\n")

    # Guided Genome Halving on the given tetrad and outgroup genomes
    ggh: GroupGraph = GroupGraph(Genome.from_strings(tetrad),
                                 Genome.from_strings(outgroup),
                                 to_replace)
    ggh.get_result()

    distance_1: int = bpg_distance_to(tetrad, ggh.ancestor_AA)
    distance_2: int = bpg_distance_to(outgroup, ggh.ancestor_A)
    total_distance: int = distance_1 + distance_2

    print(
        "\nd(AA, tetra) = " + str(distance_1) + " | d(A,outgroup) = " +
        str(distance_2), " | total = " + str(total_distance) + "\n")

    print("\n-\nGenome ancestor_AA:\n")
    print_chromosomes(ggh.ancestor_AA)

    print("\n-\nGenome ancestor_A:\n")
    print_chromosomes(ggh.ancestor_A)
def run(self):
    """Run the genetic algorithm and return the best score of each generation.

    Starts from ``population_total`` genomes built from
    ``generate(problem_grid)``. In each of ``simulations`` iterations every
    genome is evaluated, the genome with the highest ``fit`` attribute is
    recorded, and a new population is built from that genome plus
    tournament-selected (optionally crossed-over and mutated) genomes.

    Returns:
        list: ``round(1 / best_genome.getFitness())`` for every iteration.

    NOTE(review): block nesting was reconstructed from a
    whitespace-collapsed source. In particular, reaching ``limit`` only
    prints 'DONE' here and does not stop the loop - confirm that this is
    intended.
    """
    population = []
    best_genome = None
    data = []
    # Initial random population.
    for _ in range(population_total):
        genome_dna = generate(problem_grid)
        population.append(Genome(genome_dna))
    for i in range(simulations):
        # Summed below but not read anywhere else in this function.
        population_fitness = 0
        for genome in population:
            genome.fitness()
            population_fitness += genome.getFitness()
        # Shallow copy: selection still sees the old population after
        # `population` is cleared below (not actually sorted here).
        sorted_population = population.copy()
        best_genome = max(population, key=operator.attrgetter('fit'))
        # The recorded score is the rounded reciprocal of the fitness.
        best_fitness = round(1 / best_genome.getFitness())
        data.append(best_fitness)
        if i % 1000 == 0:
            self.show(i, best_genome)
        if best_fitness <= limit:
            print('DONE\n')
        # Next generation: the best genome is carried over first.
        population.clear()
        population.append(best_genome)
        while len(population) < population_total:
            new_genome = tournamentSelection(sorted_population)
            option_2 = tournamentSelection(sorted_population)
            if npR.uniform() < crossover_rate:
                option_3 = tournamentSelection(sorted_population)
                dna_1 = crossover(new_genome, option_2, option_3)
                new_genome = Genome(dna_1)
            # NOTE(review): without crossover this mutates the selected
            # genome object itself, not a copy - confirm this is intended.
            if npR.uniform() < mutation_rate:
                new_genome.mutateSell(problem_grid)
            population.append(new_genome)
    # `i` is only bound if the loop above ran at least once.
    self.show(i, best_genome)
    return data
def movie_genome_sim(douban_id, genome_id, movie_tf_idf_path,
                     genome_tf_idf_path):
    """Return ``Tagging.cos_sim`` of a movie's and a genome's TF-IDF data.

    The movie entries are read via ``Feature.get_tf_idf_from_file`` and the
    genome entries via ``Genome.get_tf_idf_from_file``; each result is
    turned into a dict before both are handed to ``Tagging.cos_sim``.
    """
    movie_entries = Feature.get_tf_idf_from_file(douban_id, movie_tf_idf_path)
    movie_vector = dict(movie_entries)

    genome_entries = Genome.get_tf_idf_from_file(genome_id, genome_tf_idf_path)
    genome_vector = dict(genome_entries)

    return Tagging.cos_sim([movie_vector, genome_vector])
def loadFile(self, filename, env):
    """Re-initialise this pool from a saved state file and resume the run.

    The file is read line by line in this order: generation, max fitness,
    species count; per species: top fitness, staleness, genome count; per
    genome: fitness, max neuron, ``name``/``value`` line pairs of mutation
    rates terminated by a ``done`` line, gene count; per gene one
    space-separated line ``into out weight innovation enabled``.

    After loading, genomes whose fitness is already measured are skipped,
    the run is initialised and ``currentFrame`` is advanced by one.

    The file is read inside a ``with`` block so the handle is closed even
    when a malformed line raises part-way through.

    Args:
        filename: Path of the saved pool file.
        env: Passed to ``self.__init__`` to reset the pool before loading.

    Raises:
        ValueError: if a line cannot be parsed as the expected number or a
            gene line does not have exactly five fields.
    """
    with open(filename, "r") as pool_file:

        def next_line():
            # One line without its newline character.
            return pool_file.readline().replace("\n", "")

        self.__init__(env)
        self.generation = int(next_line())
        self.maxFitness = int(next_line())
        numSpecies = int(next_line())
        for _ in range(numSpecies):
            species = Species()
            self.species.append(species)
            species.topFitness = float(next_line())
            species.staleness = int(next_line())
            numGenomes = int(next_line())
            for _ in range(numGenomes):
                genome = Genome(self)
                species.genomes.append(genome)
                genome.fitness = float(next_line())
                genome.maxneuron = int(next_line())

                # Mutation rates: alternating name / value lines up to "done".
                line = next_line()
                while line != "done":
                    genome.mutationRates[line] = float(next_line())
                    line = next_line()

                numGenes = int(next_line())
                for _ in range(numGenes):
                    gene = Gene()
                    genome.genes.append(gene)
                    data = []
                    for token in pool_file.readline().split(" "):
                        # Integers stay ints; anything else must be a float.
                        try:
                            data.append(int(token))
                        except ValueError:
                            data.append(float(token))
                    gene.into, gene.out, gene.weight, gene.innovation, enabled = data
                    gene.enabled = enabled == 1

    while self.fitnessAlreadyMeasured():
        self.nextGenome()
    self.initializeRun()
    self.currentFrame = self.currentFrame + 1
class UnitOfWork:
    """Holds the data set and the Genome definitions used by the search.

    Each ``Genome(name, a, b)`` entry pairs a name with two numbers
    (presumably the lower and upper bound of that hyper-parameter - confirm
    against Genome).

    NOTE: ``_genomes`` is a class attribute, so the same list object is
    shared by every instance.
    """

    _dataSet = []
    _genomes = [
        Genome('Num_units', 20, 50),
        # Was written as "0. 0020", which is a syntax error.
        Genome('learning_rate', 0.0020, 0.0030),
        Genome('lambda_loss_amount', 0.0010, 0.0020),
        Genome('Batch_size', 1000, 2000),
        Genome('Num_iterations', 100, 500),
        Genome('Segment_size', 100, 200),
    ]
    _popSize = 10
    _perMut = 0.5
    _iteration = 22
    # _test_user_ids=[2, 4, 9, 10, 12, 13, 18, 20, 24] in DataSet

    def __init__(self, pathDataset='datasets/uci_raw_data'):
        """Load the data set found at ``pathDataset``.

        The former ``_genomes=[]`` statement only bound an unused local
        (it never touched the class or the instance) and was removed.
        """
        self._dataSet = DataSet(pathDataset, 'l')
def genome_aliquoting():
    """
    See the 2010 paper, section 2.5

    Currently disabled: the function raises immediately, so everything
    after the ``raise`` is unreachable and kept only as work in progress.

    Raises:
        Exception: always.
    """
    raise Exception("Genome Aliquoting is not functional yet.")

    # ---- unreachable until the guard above is removed ----

    # First genome of the input file is the polyploid, the second the
    # reference.
    genomes: Dict[str, List[str]] = parse_genomes()
    genome_iter: Iterator[List[str]] = iter(genomes.values())
    polyd: Genome = Genome.from_strings(next(genome_iter))
    reference: List[str] = next(genome_iter)
    ploidy: int = count_ploidy(polyd)

    # Aliquoting configuration options
    to_replace: int = config_get(CONFIG_GENOME_REPLACE)
    if type(to_replace) is not int:
        raise Exception(
            "Config attribute \"genome_to_replace\" needs to be a number.\n")
    if to_replace not in range(0, ploidy + 1):
        raise Exception(
            "Genome to replace must be non-negative and less than the number of poly genome copies (n).\n"
        )

    # Genome aliquoting on the given polyd genome
    alq: Aliquoting = Aliquoting(
        polyd, Genome.from_strings(reference), to_replace, ploidy)
    alq.get_result()

    measure: BPGDistance = BPGDistance(polyd, alq.ideal_ancestor)
    measure.calculate_distance()
    distance: int = measure.distance

    print("\nd(Am, polyd) = " + str(distance) + "\n")
    print("\n-\nGenome ancestor_A(m):\n")
    for chromosome in alq.ideal_ancestor.chromosomes:
        print(str(chromosome))
def apply(self, mutant_vector, target_vector, Cr):
    """Binomial crossover: mix mutant and target genes into a trial Genome.

    For every position ``j`` the mutant's gene is taken when a uniform draw
    from [0, 1] is ``<= Cr`` or when ``j`` equals the forced index;
    otherwise the target's gene is taken.

    The forced index is now drawn once per call from the valid positions
    ``0 .. n - 1``, so the trial vector always inherits at least one mutant
    gene. Previously it was redrawn for every position with
    ``randint(0, n)``, which can return the out-of-range value ``n`` and
    gave no such guarantee.

    Args:
        mutant_vector: Object whose ``get_genes()`` returns the mutant genes.
        target_vector: Object whose ``get_genes()`` returns at least as
            many genes as the mutant.
        Cr: Crossover probability.

    Returns:
        A new Genome built from the selected genes.
    """
    mutant_genes = mutant_vector.get_genes()
    target_genes = target_vector.get_genes()
    gene_count = len(mutant_genes)

    # randrange(0) would raise; with no genes there is nothing to force.
    forced_index = random.randrange(gene_count) if gene_count else None

    genes = []
    for j in range(gene_count):
        if random.uniform(0, 1) <= Cr or j == forced_index:
            genes.append(mutant_genes[j])
        else:
            genes.append(target_genes[j])

    return Genome(genes)
def __init__(self, psize, bounds):
    """Fill ``self.population_list`` with ``psize`` random genomes.

    Args:
        psize: Number of genomes to create.
        bounds: Iterable of indexable pairs; gene ``k`` is drawn with
            ``random.uniform(bounds[k][0], bounds[k][1])``.
    """
    self.population_list = []
    # The list length doubles as the creation counter.
    while len(self.population_list) < psize:
        genes = [random.uniform(bound[0], bound[1]) for bound in bounds]
        self.population_list.append(Genome(genes))
def from_gen_file(file_name, old=False):
    """
    imports a (unaligned) gen_file and returns a GenomeCompare object

    Each CSV row is read as ``name, mutation_rate, locus, locus, ...``.

    @params
    file_name: the name of the file containing the genomes
    old / boolean / False
        if this file was created before June 23rd 2015, it is likely to be
        in the float format. use True in that case only.

    @returns
    GenomeCompare built from a dict mapping ``int(name)`` to the Genome
    produced by ``Genome.from_mutated_loci``.
    """
    import csv

    # Loci are parsed as floats for the old format, as ints otherwise.
    parse_locus = float if old else int

    genomes = {}
    with open(file_name, 'r') as f:
        for row in csv.reader(f):
            if not row:
                # csv.reader yields [] for blank lines; indexing row[0]
                # on those would raise IndexError.
                continue
            name = int(row[0])
            genomes[name] = Genome.from_mutated_loci(
                # A real list, so the callee gets a sized, re-iterable
                # sequence on Python 3 as well (map() is lazy there).
                [parse_locus(value) for value in row[2:]],
                mutation_rate=int(row[1]),
                name=name)
    return GenomeCompare(genomes=genomes)
def load(string) -> Specie:
    """Rebuild a Specie from its tagged string form.

    The tags ``representative``, ``age``, ``niche_fitness``,
    ``max_fitness`` and ``genomes`` are removed from ``string`` in that
    order; the ``genomes`` payload is then consumed one ``genome`` tag at a
    time until it is empty.
    """
    fields = {}
    for tag in ("representative", "age", "niche_fitness", "max_fitness",
                "genomes"):
        fields[tag], string = remove_tag(tag, string)

    representative = Genome.load(fields["representative"])
    age = int(fields["age"])
    niche_fitness = float(fields["niche_fitness"])
    max_fitness = float(fields["max_fitness"])

    members = []
    remaining = fields["genomes"]
    while remaining:
        genome_str, remaining = remove_tag("genome", remaining)
        members.append(Genome.load(genome_str))

    specie = Specie(representative, members, age, max_fitness)
    # niche_fitness is not a constructor argument; set it afterwards.
    specie.niche_fitness = niche_fitness
    return specie
def scanForRRNA(self):
    """Write non-overlapping rRNA hits to ``<sAdditionalName>.rrnas.fasta``.

    Hits are processed best first (highest score, then lowest e-value).
    Hits with an e-value above 1e-3 are skipped. A hit is also skipped
    when its range overlaps a range already written for the same sequence.
    Minus-strand hits are reverse-complemented.

    NOTE: work in progress (marked "!!!" by the author). The cmsearch step
    that fills ``lHits`` is disabled below, so the list is currently always
    empty and only an empty output file is produced. ``sSequenceData`` and
    ``sAdditionalName`` are not defined in this method and must come from
    an enclosing scope.

    The output file is written inside a ``with`` block so it is closed even
    if a hit raises (for example an unknown ``hitId``).
    """
    # !!!!!!!!!!!!!!
    lHits = []
    #for (sDomain, sRfam) in dDomain2Rfam.items():
    #    dRfam2File[sRfam].close()
    #    sRfamModel = dDomain2Rfam[sDomain]
    #    sFileName = '%s.%s'%(sSequenceData.split('/')[-1], sRfamModel)
    #    if sAdditionalName != None:
    #        sFileName = '%s.%s.%s'%(sAdditionalName, sSequenceData.split('/')[-1], sRfamModel)
    #    os.system('cmsearch --cpu 4 --tblout %s.tblout -o %s.o --notextw %s.rfam_14_3.cm %s'%(sFileName, sFileName, sRfamModel, sSequenceData))
    #    lHits += ParseCm(open('%s.tblout'%(sFileName))).next()
    #fileOut.close()
    #os.system('cmpress -F rfam.ssu_rrnas.cm')
    #os.system('cmscan --cpu 4 --notextw rfam.ssu_rrnas.cm %s'%(sSequenceData))
    #dSeqs = SeqIO(open(sSequenceData)).getDict()
    genome = Genome(sSequenceData)
    lHits = sorted(lHits, key=lambda x: (-x.score, x.eval))

    # hitId -> list of (start, end) ranges already written for that sequence
    dSeq2RangesUsed = {}
    with open('%s.rrnas.fasta' % (sAdditionalName), 'w') as fileOut:
        for hit in lHits:
            if hit.eval > 1e-3:
                continue

            sFullSeq = genome.seqs[genome.dSeqId2SeqIndex[hit.hitId]].seq
            if hit.strand == '-':
                # Minus-strand hits have hitStart > hitEnd: swap, then
                # reverse-complement the slice.
                iStart, iEnd = hit.hitEnd - 1, hit.hitStart
                sSeq = genome.reverseComplement(sFullSeq[iStart:iEnd])
            else:
                iStart, iEnd = hit.hitStart - 1, hit.hitEnd
                sSeq = sFullSeq[iStart:iEnd]

            bOverlaps = any(
                min(iEnd, iEnd2) - max(iStart, iStart2) + 1 >= 1
                for (iStart2, iEnd2) in dSeq2RangesUsed.get(hit.hitId, []))
            if bOverlaps:
                continue

            dSeq2RangesUsed.setdefault(hit.hitId, []).append((iStart, iEnd))
            fileOut.write('>%s.%s.%s.%i.%i %f %s %f\n%s\n' % (
                sAdditionalName, hit.queryId, hit.hitId, hit.hitStart,
                hit.hitEnd, hit.eval, hit.strand, hit.gc, sSeq))
def get_genome_sim_list(douban_id, movie_tf_idf_path, genome_tf_idf_path,
                        genome_num=10):
    """Return ``(genome_id, similarity)`` pairs, most similar first.

    Every id in ``Genome.load_genome_dict()`` is scored against the movie
    with ``Tagging.movie_genome_sim``.

    ``genome_num`` is currently unused: the step that named the genomes and
    cut the list to ``genome_num`` entries is disabled, so the full list is
    returned.
    """
    similarity_by_genome = {
        genome_id: Tagging.movie_genome_sim(
            douban_id, genome_id, movie_tf_idf_path, genome_tf_idf_path)
        for genome_id in Genome.load_genome_dict()
    }
    # Disabled follow-up step:
    # genome_name_list = map(lambda x: (x[0], genome_dict[x[0]]['name'],
    #                                   x[1]), genome_list)
    # return genome_name_list[:genome_num]
    return sorted(similarity_by_genome.items(), key=lambda pair: -pair[1])
def from_aligned_gen_file(file_name):
    """
    imports the new format of aligned gen_file and returns a GenomeCompare
    object

    The first non-blank CSV row is the header: its columns from index 3 on
    are mutation identifiers, one Mutation object per column. Every later
    row is ``<ignored>, name, mutation_rate, state, state, ...``; a genome
    receives the Mutation of each column whose state is 1.

    Changes: the notice is printed with ``print(...)``, which is valid on
    both Python 2 and 3; blank rows are skipped instead of raising
    IndexError; the states are converted with ``int`` once instead of twice.

    @returns
    GenomeCompare built from the list of Genome objects, in file order.
    """
    print('Please use GenomeCompare2 for better speeds with aligned genome files')
    import csv

    mapper = {}
    genomes = []
    with open(file_name, 'r') as f:
        header_seen = False
        for row in csv.reader(f):
            if not row:
                # csv.reader yields [] for blank lines.
                continue
            if not header_seen:
                # column position -> Mutation object
                mapper = {key: Mutation(int(mutation))
                          for key, mutation in enumerate(row[3:])}
                header_seen = True
                continue

            genome_name = row[1]
            mutation_rate = row[2]
            genome_mutations = [mapper[key]
                                for key, state in enumerate(row[3:])
                                if int(state) == 1]
            genome_to_create = Genome(mutation_rate=int(mutation_rate),
                                      name=str(genome_name))
            genome_to_create.mutated_loci = genome_mutations
            genomes.append(genome_to_create)
    return GenomeCompare(genomes=genomes)
def from_aligned_gen_file(file_name):
    """
    imports the new format of aligned gen_file and returns a GenomeCompare
    object

    The first non-blank CSV row is the header: its columns from index 3 on
    are mutation identifiers, one Mutation object per column. Every later
    row is ``<ignored>, name, mutation_rate, state, state, ...``; a genome
    receives the Mutation of each column whose state is 1.

    Changes: the notice is printed with ``print(...)``, which is valid on
    both Python 2 and 3; blank rows are skipped instead of raising
    IndexError; the states are converted with ``int`` once instead of twice.

    @returns
    GenomeCompare built from the list of Genome objects, in file order.
    """
    print('Please use GenomeCompare2 for better speeds with aligned genome files')
    import csv

    mapper = {}
    genomes = []
    with open(file_name, 'r') as f:
        header_seen = False
        for row in csv.reader(f):
            if not row:
                # csv.reader yields [] for blank lines.
                continue
            if not header_seen:
                # column position -> Mutation object
                mapper = {key: Mutation(int(mutation))
                          for key, mutation in enumerate(row[3:])}
                header_seen = True
                continue

            genome_name = row[1]
            mutation_rate = row[2]
            genome_mutations = [mapper[key]
                                for key, state in enumerate(row[3:])
                                if int(state) == 1]
            genome_to_create = Genome(mutation_rate=int(mutation_rate),
                                      name=str(genome_name))
            genome_to_create.mutated_loci = genome_mutations
            genomes.append(genome_to_create)
    return GenomeCompare(genomes=genomes)