def solve(par):
    """Return every N-bit binary string that contains exactly K ones.

    par is a pair ``(N, K)``.  The strings are joined with newlines and
    listed in ascending numeric (equivalently, lexicographic) order.  An
    empty string is returned when no such bit pattern exists (K < 0 or
    K > N) and also for ``N == 0, K == 0``, where the only pattern is the
    empty one.

    Raises ValueError for negative N.
    """
    from itertools import combinations

    N, K = par
    if N < 0:
        # same exception type that ``1 << N`` raises for a negative shift
        raise ValueError("negative shift count")
    if K < 0 or K > N:
        return ''

    # Choose where the zeros go instead of filtering all 2**N integers.
    # Enumerating the zero positions in lexicographic order yields the
    # bit strings in ascending order: the pattern whose next zero comes
    # earlier is the smaller number.
    results = []
    for zero_positions in combinations(range(N), N - K):
        digits = ['1'] * N
        for position in zero_positions:
            digits[position] = '0'
        results.append(''.join(digits))
    return '\n'.join(results)
def solve(par):
    """Return every N-bit binary string that contains exactly K ones.

    par is a pair ``(N, K)``.  The strings are joined with newlines and
    listed in ascending numeric (equivalently, lexicographic) order.  An
    empty string is returned when no such bit pattern exists (K < 0 or
    K > N) and also for ``N == 0, K == 0``, where the only pattern is the
    empty one.

    Raises ValueError for negative N.
    """
    from itertools import combinations

    N, K = par
    if N < 0:
        # same exception type that ``1 << N`` raises for a negative shift
        raise ValueError("negative shift count")
    if K < 0 or K > N:
        return ''

    # Choose where the zeros go instead of filtering all 2**N integers.
    # Enumerating the zero positions in lexicographic order yields the
    # bit strings in ascending order: the pattern whose next zero comes
    # earlier is the smaller number.
    results = []
    for zero_positions in combinations(range(N), N - K):
        digits = ['1'] * N
        for position in zero_positions:
            digits[position] = '0'
        results.append(''.join(digits))
    return '\n'.join(results)
def calc_hash(mu, num):
    """Count the set bits of ``mu`` followed by the bit encoding of ``num``.

    mu  -- a str (encoded with ``str.encode()`` and wrapped in ``Bits``)
           or an existing ``Bits`` instance, which is used as is.
    num -- value handed to ``bits_from_uint``; the result is appended
           to the bits of ``mu``.

    Returns the pair ``(bits_from_uint(ones), ones)`` where ``ones`` is
    the number of 1 bits in the concatenation.
    """
    bits = None
    if isinstance(mu, str):
        bits = Bits(bytes=mu.encode())
    elif isinstance(mu, Bits):
        bits = mu
    # NOTE(review): any other type leaves ``bits`` as None, so the result
    # of the concatenation below depends on the right operand's reflected
    # addition -- confirm whether such input should be rejected outright.

    # Build a new object instead of using ``+=``: when ``mu`` is a mutable
    # Bits subclass (e.g. bitstring's BitArray) in-place addition would
    # append to the caller's own object.
    bits = bits + bits_from_uint(num)
    ones = bits.count(1)
    return bits_from_uint(ones), ones
def make_exon_alignment(cursor, ensembl_db_name, human_exon_id, human_exon_known, mitochondrial,
                        min_similarity, flank_length, first_human_exon = True):
    """Build gap-stripped peptide and DNA alignments of the exons mapped to a human exon.

    cursor            -- database cursor handed to the lookup helpers
    ensembl_db_name   -- mapping indexed by species name; the looked-up value
                         is passed to get_exon_seqs / get_exon
    human_exon_id, human_exon_known -- identify the human exon for get_maps
    mitochondrial     -- if true, translate with the "Vertebrate Mitochondrial" table
    min_similarity    -- maps with a lower similarity are skipped
    flank_length      -- forwarded to expand_pepseq
    first_human_exon  -- if true, a very short peptide starting with 'M' is kept

    Returns [stripped_pep, stripped_dna] as produced by strip_gaps, or
    ['', ''] when either of the two comes back empty.
    """
    sequence_pep = {}
    sequence_dna = {}

    # find all other exons that map to the human exon
    maps = get_maps(cursor, ensembl_db_name, human_exon_id, human_exon_known)
    maps = [m for m in maps if m.exon_id_2 is not None]

    for exon_map in maps:
        if exon_map.similarity < min_similarity:
            continue
        species = exon_map.species_2

        # get the raw (unaligned) sequence for the exon that maps onto human
        exon_seqs = get_exon_seqs(cursor, exon_map.exon_id_2, exon_map.exon_known_2,
                                  ensembl_db_name[species])
        if not exon_seqs:
            continue
        [pepseq, pepseq_transl_start, pepseq_transl_end,
         left_flank, right_flank, dna_seq] = exon_seqs[1:]

        # rpl11 starts with an exon that translates into 2 aa's,
        # rpl10A has a single methionine (or so they say) followed by a split codon
        # *supposedly there is evidence at the protein level
        # but will this give me tons of junk elsewhere? ...
        pepseq_noX = pepseq.replace('X', '')
        if len(pepseq_noX) < 3:
            # if this is the first exon, and if it starts with M, we'll let it off the hook
            # and then if it's human, we'll also salvage it at any price.
            # startswith() rather than [0]: the peptide may be empty once
            # the X's are removed, and indexing would raise IndexError.
            salvage = ((first_human_exon and pepseq_noX.startswith('M'))
                       or species == 'homo_sapiens')
            if not salvage:
                continue

        # check: the stored peptide must match the translation of the stored DNA
        dnaseq = Seq(dna_seq[pepseq_transl_start:pepseq_transl_end], generic_dna)
        if mitochondrial:
            pepseq2 = dnaseq.translate(table="Vertebrate Mitochondrial").tostring()
        else:
            pepseq2 = dnaseq.translate().tostring()
        if pepseq != pepseq2:
            continue

        # inflate the compressed sequence
        if not exon_map.bitmap:
            continue
        bs = Bits(bytes=exon_map.bitmap)
        # check bitmap has correct number of 1s
        if bs.count(1) != len(pepseq):
            continue
        # every 0 bit becomes a gap, every 1 bit consumes the next residue
        residues = iter(pepseq)
        reconst_pepseq = ''.join('-' if bit == '0' else next(residues) for bit in bs.bin)

        # come up with a unique name for this sequence
        # let's also have the start in gene here - might make our lives easier later
        exon2 = get_exon(cursor, exon_map.exon_id_2, exon_map.exon_known_2,
                         ensembl_db_name[species])
        sequence_name = (species + "_" + str(exon_map.exon_id_2) + "_" +
                         str(exon_map.exon_known_2) + "_" + str(exon2.start_in_gene))

        if reconst_pepseq:
            sequence_pep[sequence_name] = reconst_pepseq
            reconst_ntseq = expand_pepseq(reconst_pepseq, exon_seqs[1:], flank_length)
            if reconst_ntseq:
                sequence_dna[sequence_name] = reconst_ntseq

    # strip common gaps
    sequence_stripped_pep = strip_gaps(sequence_pep)
    if not sequence_stripped_pep:
        return ['', '']

    # strip common gaps
    sequence_stripped_dna = strip_gaps(sequence_dna)
    if not sequence_stripped_dna:
        return ['', '']

    return [sequence_stripped_pep, sequence_stripped_dna]
def make_exon_alignment(cursor, ensembl_db_name, human_exon_id, human_exon_known, mitochondrial, min_similarity, flank_length): sequence_pep = {} sequence_dna = {} shortest_l = -1 # Uninitialized leading padding length shortest_r = -1 # Uninitialized trailing padding length pep_aln_length = 0 dna_aln_length = 0 # find all other exons that map to the human exon maps = get_maps(cursor, ensembl_db_name, human_exon_id, human_exon_known) maps = filter (lambda m: not m.exon_id_2 is None, maps) maps_sw = filter (lambda m: m.source=='sw_sharp' or m.source=='usearch', maps) for map in maps: if map.similarity < min_similarity: continue # get the raw (unaligned) sequence for the exon that maps onto human exon_seqs = get_exon_seqs(cursor, map.exon_id_2, map.exon_known_2, ensembl_db_name[map.species_2]) if (not exon_seqs): print " exon_seqs for" , map.source exit(1) continue [pepseq, pepseq_transl_start, pepseq_transl_end, left_flank, right_flank, dna_seq] = exon_seqs[1:] if len(pepseq)<3: continue pepseq_noX = pepseq.replace ('X','') if len(pepseq_noX)<3: continue # check dnaseq = Seq (dna_seq[pepseq_transl_start:pepseq_transl_end], generic_dna) if (mitochondrial): pepseq2 = dnaseq.translate(table="Vertebrate Mitochondrial").tostring() else: pepseq2 = dnaseq.translate().tostring() if (not pepseq == pepseq2): continue # inflate the compressed sequence if not map.bitmap: continue bs = Bits(bytes=map.bitmap) if (not bs.count(1) == len(pepseq)): continue # check bitmap has correct number of 1s usi = iter(pepseq) #reconst_pepseq = "".join(('-' if c=='0' else next(usi) for c in bs.bin)) reconst_pepseq = '' for c in bs.bin: if c == '0': reconst_pepseq += '-' else: reconst_pepseq += next(usi) # come up with a unique name for this sequence species = map.species_2 sequence_name = species + "_" + str(map.exon_id_2)+"_"+str(map.exon_known_2) if reconst_pepseq: sequence_pep[sequence_name] = reconst_pepseq pep_aln_length = len(reconst_pepseq) reconst_ntseq = expand_pepseq (reconst_pepseq, 
exon_seqs[1:], flank_length) if reconst_ntseq: sequence_dna[sequence_name] = reconst_ntseq dna_aln_length = len(reconst_ntseq) # strip common gaps sequence_stripped_pep = strip_gaps (sequence_pep) if not sequence_stripped_pep: c=inspect.currentframe() print " in %s:%d" % ( c.f_code.co_filename, c.f_lineno) exit(1) # strip common gaps sequence_stripped_dna = strip_gaps (sequence_dna) if not sequence_stripped_dna: c=inspect.currentframe() print " in %s:%d" % ( c.f_code.co_filename, c.f_lineno) exit(1) return [sequence_stripped_pep, sequence_stripped_dna]
def main(): no_threads = 1 special = None if len(sys.argv) > 1 and len(sys.argv) < 3: print "usage: %s <set name> <number of threads> " % sys.argv[0] exit(1) elif len(sys.argv) == 3: special = sys.argv[1] special = special.lower() if special == 'none': special = None no_threads = int(sys.argv[2]) db = connect_to_mysql() cfg = ConfigurationReader() cursor = db.cursor() # find db ids adn common names for each species db [all_species, ensembl_db_name] = get_species(cursor) species = 'homo_sapiens' switch_to_db(cursor, ensembl_db_name[species]) if special: print "using", special, "set" gene_list = get_theme_ids(cursor, ensembl_db_name, cfg, special) else: print "using all protein coding genes" switch_to_db(cursor, ensembl_db_name['homo_sapiens']) gene_list = get_gene_ids(cursor, biotype='protein_coding', is_known=1) incomplete = 0 genes_checked = 0 #for gene_id in gene_list: #for gene_id in [743609]: for sampling_count in range(1000): gene_id = choice(gene_list) genes_checked += 1 with_map = 0 tot = 0 switch_to_db(cursor, ensembl_db_name['homo_sapiens']) print gene2stable(cursor, gene_id), get_description(cursor, gene_id) # find all exons we are tracking in the database human_exons = gene2exon_list(cursor, gene_id) human_exons.sort(key=lambda exon: exon.start_in_gene) has_a_map = False for human_exon in human_exons: if (not human_exon.is_canonical or not human_exon.is_coding): continue if verbose: print print "\t human", human_exon.exon_id, human_exon.is_known print "\t ", get_exon_pepseq(cursor, human_exon, ensembl_db_name['homo_sapiens']) print "\t checking maps ..." 
maps = get_maps(cursor, ensembl_db_name, human_exon.exon_id, human_exon.is_known) tot += 1 if maps: has_a_map = True with_map += 1 #print "ok" else: print "no maps for exon", human_exon.exon_id continue if verbose: for map in maps: species = map.species_2 exon = map2exon(cursor, ensembl_db_name, map) unaligned_sequence = get_exon_pepseq( cursor, exon, ensembl_db_name[species]) if (map.similarity): print "\t", species, map.source, map.exon_id_2, map.exon_known_2 print "\tmaps to ", map.exon_id_1, map.exon_known_1 print "\tsim", map.similarity, print "\tsource", map.source print "\t", unaligned_sequence if not map.bitmap: print "\t bitmap not assigned" else: bs = Bits(bytes=map.bitmap) reconst_pepseq = '' if (not bs.count(1) == len(unaligned_sequence)): print "\talnd seq mismatch" else: usi = iter(unaligned_sequence) for c in bs.bin: if c == '0': reconst_pepseq += '-' else: reconst_pepseq += next(usi) print "\tbinary : ", bs.bin print "\talnd seq: ", reconst_pepseq print if not tot == with_map: print "#### gene id: %d total exons: %d with map: %d ( = %d%%) " % \ (gene_id, tot, with_map, int(float(with_map)/tot*100) ) incomplete += 1 print "genes checked: %d, incomplete: %d" % (genes_checked, incomplete) cursor.close() db.close() print tot, with_map
def main(): no_threads = 1 special = None if len(sys.argv) > 1 and len(sys.argv)<3: print "usage: %s <set name> <number of threads> " % sys.argv[0] exit(1) elif len(sys.argv)==3: special = sys.argv[1] special = special.lower() if special == 'none': special = None no_threads = int(sys.argv[2]) db = connect_to_mysql() cfg = ConfigurationReader() cursor = db.cursor() # find db ids adn common names for each species db [all_species, ensembl_db_name] = get_species (cursor) species = 'homo_sapiens' switch_to_db (cursor, ensembl_db_name[species]) if special: print "using", special, "set" gene_list = get_theme_ids (cursor, ensembl_db_name, cfg, special ) else: print "using all protein coding genes" switch_to_db (cursor, ensembl_db_name['homo_sapiens']) gene_list = get_gene_ids (cursor, biotype='protein_coding', is_known=1) incomplete = 0 genes_checked = 0 #for gene_id in gene_list: #for gene_id in [743609]: for sampling_count in range(1000): gene_id = choice(gene_list) genes_checked += 1 with_map = 0 tot = 0 switch_to_db (cursor, ensembl_db_name['homo_sapiens']) print gene2stable(cursor, gene_id), get_description (cursor, gene_id) # find all exons we are tracking in the database human_exons = gene2exon_list(cursor, gene_id) human_exons.sort(key=lambda exon: exon.start_in_gene) has_a_map = False for human_exon in human_exons: if (not human_exon.is_canonical or not human_exon.is_coding): continue if verbose: print print "\t human", human_exon.exon_id, human_exon.is_known print "\t ", get_exon_pepseq(cursor, human_exon, ensembl_db_name['homo_sapiens']) print "\t checking maps ..." 
maps = get_maps(cursor, ensembl_db_name, human_exon.exon_id, human_exon.is_known) tot += 1 if maps: has_a_map = True with_map += 1 #print "ok" else: print"no maps for exon", human_exon.exon_id continue if verbose: for map in maps: species = map.species_2 exon = map2exon(cursor, ensembl_db_name, map) unaligned_sequence = get_exon_pepseq(cursor, exon, ensembl_db_name[species]) if ( map.similarity): print "\t", species, map.source, map.exon_id_2, map.exon_known_2 print "\tmaps to ", map.exon_id_1, map.exon_known_1 print "\tsim", map.similarity, print "\tsource", map.source print "\t", unaligned_sequence if not map.bitmap: print "\t bitmap not assigned" else: bs = Bits(bytes=map.bitmap) reconst_pepseq = '' if (not bs.count(1) == len(unaligned_sequence)): print "\talnd seq mismatch" else: usi = iter(unaligned_sequence) for c in bs.bin: if c == '0': reconst_pepseq += '-' else: reconst_pepseq += next(usi) print "\tbinary : ", bs.bin print "\talnd seq: ", reconst_pepseq print if not tot== with_map: print "#### gene id: %d total exons: %d with map: %d ( = %d%%) " % \ (gene_id, tot, with_map, int(float(with_map)/tot*100) ) incomplete += 1 print "genes checked: %d, incomplete: %d" % (genes_checked, incomplete) cursor.close() db.close() print tot, with_map