Exemple #1
0
def solve(par):
    """List every N-bit pattern that has exactly K bits set.

    par is an (N, K) pair.  The integers 0 .. 2**N - 1 are visited in
    increasing order; each one whose bit pattern contains exactly K ones
    contributes one line to the output.  The lines are returned joined by
    newline characters.
    """
    N, K = par
    matching = []
    for value in range(1 << N):
        # The pattern is built one bit wider than N; that extra leading
        # character is sliced off again when the text form is stored.
        pattern = Bits(int=value, length=N + 1)
        if pattern.count(1) != K:
            continue
        matching.append(pattern.bin[1:])
    return '\n'.join(matching)
def solve(par):
    """Return, one per line, the N-bit strings containing exactly K ones.

    par unpacks to (N, K).  Candidates are generated for 0 .. 2**N - 1 in
    ascending order, so the output lines follow that order as well.
    """
    N, K = par
    # Each candidate is N + 1 bits long; the first character of its binary
    # text is dropped below, leaving the remaining N characters.
    candidates = (Bits(int=number, length=N + 1) for number in range(1 << N))
    return '\n'.join(bits.bin[1:] for bits in candidates if bits.count(1) == K)
Exemple #3
0
def calc_hash(mu, num):
    """Append num to mu and count the set bits of the combined bit string.

    mu  -- a str (its encoded bytes are used) or a Bits instance (used as
           is).  Other types are not handled explicitly: bits stays None
           and the concatenation below is left to deal with it.
    num -- value passed to bits_from_uint() and appended after mu
           (presumably an unsigned integer -- confirm against that helper).

    Returns a 2-tuple (bits_from_uint(ones), ones), where ones is the
    number of 1 bits in the combined bit string.
    """
    bits = None
    if isinstance(mu, str):
        bits = Bits(bytes=mu.encode())
    elif isinstance(mu, Bits):
        bits = mu
    # Concatenate with `+` instead of `+=`: when mu is a mutable subclass of
    # Bits (bitstring's BitArray), `+=` would append to the caller's object
    # in place, so repeated calls with the same mu would keep growing it.
    bits = bits + bits_from_uint(num)
    ones = bits.count(1)
    return bits_from_uint(ones), ones
def make_exon_alignment(cursor, ensembl_db_name, human_exon_id, human_exon_known, mitochondrial,
                        min_similarity,  flank_length, first_human_exon = True):
    """Build gapped peptide and DNA alignments of the exons mapped to a human exon.

    Every map returned by get_maps() for the given human exon is considered.
    A map is skipped when any of the following holds:
      * it has no exon_id_2, or its similarity is below min_similarity;
      * get_exon_seqs() returns nothing for the mapped exon;
      * the peptide, with 'X' removed, is shorter than 3 residues -- unless
        (first_human_exon is true and it starts with 'M') or the mapped
        species is 'homo_sapiens';
      * translating the coding slice of the DNA does not reproduce the
        stored peptide (the "Vertebrate Mitochondrial" table is used when
        mitochondrial is true);
      * the map has no bitmap, or the number of 1 bits in the bitmap differs
        from the peptide length.

    For the remaining maps the bitmap is inflated into a gapped peptide
    ('-' for every 0 bit, the next residue for every 1 bit), which is then
    handed to expand_pepseq() together with flank_length to obtain the
    matching nucleotide sequence.  Sequences are keyed by
    "<species>_<exon_id>_<exon_known>_<start_in_gene>".

    Returns [stripped_pep, stripped_dna], the results of strip_gaps() on the
    two collections, or ['', ''] if either of them comes back empty.
    """
    sequence_pep = {}
    sequence_dna = {}

    # find all other exons that map to the human exon
    maps = get_maps(cursor, ensembl_db_name, human_exon_id, human_exon_known)
    maps = [m for m in maps if m.exon_id_2 is not None]

    for exon_map in maps:

        if exon_map.similarity < min_similarity:
            continue
        species = exon_map.species_2

        # get the raw (unaligned) sequence for the exon that maps onto human
        exon_seqs = get_exon_seqs(cursor, exon_map.exon_id_2, exon_map.exon_known_2,
                                  ensembl_db_name[species])
        if not exon_seqs:
            continue
        # The flanks are not used here, but unpacking all six fields keeps
        # the check on the shape of exon_seqs.
        [pepseq, pepseq_transl_start,
         pepseq_transl_end, left_flank, right_flank, dna_seq] = exon_seqs[1:]

        # rpl11 starts with an exon that translates into 2 aa's,
        # rpl10A has a single methionine (or so they say) followed by a split codon
        # *supposedly there is evidence at the protein level
        # but will this give me tons of junk elsewhere? ...
        pepseq_noX = pepseq.replace('X', '')
        if len(pepseq_noX) < 3:
            # if this is the first exon, and if it starts with M, we'll let it off the hook
            # and then if it's human, we'll also salvage it at any price.
            # The slice (rather than [0]) keeps an empty peptide from raising IndexError.
            starts_with_met = pepseq_noX[:1] == 'M'
            is_human = species == 'homo_sapiens'
            if not ((first_human_exon and starts_with_met) or is_human):
                continue

        # check that the stored peptide really is the translation of the DNA
        dnaseq = Seq(dna_seq[pepseq_transl_start:pepseq_transl_end], generic_dna)
        if mitochondrial:
            pepseq2 = dnaseq.translate(table="Vertebrate Mitochondrial").tostring()
        else:
            pepseq2 = dnaseq.translate().tostring()
        if pepseq != pepseq2:
            continue

        # inflate the compressed sequence
        if not exon_map.bitmap:
            continue
        bs = Bits(bytes=exon_map.bitmap)
        # check bitmap has correct number of 1s; this also guarantees that
        # next() below never runs out of residues
        if bs.count(1) != len(pepseq):
            continue
        residues = iter(pepseq)
        reconst_pepseq = ''.join('-' if bit == '0' else next(residues) for bit in bs.bin)

        # come up with a unique name for this sequence
        # let's also have the start in gene here - might make our lives easier later
        exon2 = get_exon(cursor, exon_map.exon_id_2, exon_map.exon_known_2, ensembl_db_name[species])
        sequence_name = "_".join([species, str(exon_map.exon_id_2), str(exon_map.exon_known_2),
                                  str(exon2.start_in_gene)])

        if not reconst_pepseq:
            continue
        sequence_pep[sequence_name] = reconst_pepseq

        reconst_ntseq = expand_pepseq(reconst_pepseq, exon_seqs[1:], flank_length)
        if reconst_ntseq:
            sequence_dna[sequence_name] = reconst_ntseq

    # strip common gaps
    sequence_stripped_pep = strip_gaps(sequence_pep)
    if not sequence_stripped_pep:
        return ['', '']
    # strip common gaps
    sequence_stripped_dna = strip_gaps(sequence_dna)
    if not sequence_stripped_dna:
        return ['', '']

    return [sequence_stripped_pep, sequence_stripped_dna]
def make_exon_alignment(cursor, ensembl_db_name, human_exon_id, human_exon_known, mitochondrial, 
                        min_similarity,  flank_length):

    sequence_pep = {}
    sequence_dna = {}
    shortest_l = -1 # Uninitialized  leading padding length
    shortest_r = -1 # Uninitialized trailing padding length

    pep_aln_length = 0
    dna_aln_length = 0
    # find all other exons that map to the human exon
    maps    = get_maps(cursor, ensembl_db_name, human_exon_id, human_exon_known)
    maps    = filter (lambda m: not m.exon_id_2 is None, maps)
    maps_sw = filter (lambda m: m.source=='sw_sharp' or m.source=='usearch', maps)

    for map in maps:

        if map.similarity < min_similarity: continue
        # get the raw (unaligned) sequence for the exon that maps onto human
        exon_seqs = get_exon_seqs(cursor, map.exon_id_2, map.exon_known_2, ensembl_db_name[map.species_2])
        if (not exon_seqs):
            print " exon_seqs for" , map.source
            exit(1)
            continue
        [pepseq, pepseq_transl_start, 
         pepseq_transl_end, left_flank, right_flank, dna_seq] = exon_seqs[1:]

        if     len(pepseq)<3: continue
        pepseq_noX = pepseq.replace ('X','')
        if len(pepseq_noX)<3: continue
       

        # check
        dnaseq  = Seq (dna_seq[pepseq_transl_start:pepseq_transl_end], generic_dna)
        if (mitochondrial):
            pepseq2 = dnaseq.translate(table="Vertebrate Mitochondrial").tostring()
        else:
            pepseq2 = dnaseq.translate().tostring()
        

        if (not pepseq == pepseq2):
            continue
            
        # inflate the compressed sequence
        if not map.bitmap:
            continue

        bs = Bits(bytes=map.bitmap)
        if (not bs.count(1) == len(pepseq)): continue # check bitmap has correct number of 1s
        usi = iter(pepseq)
        #reconst_pepseq = "".join(('-' if c=='0' else next(usi) for c in bs.bin))
        reconst_pepseq = ''
        for c in bs.bin:
            if c == '0': reconst_pepseq += '-'
            else:        reconst_pepseq += next(usi)

        # come up with a unique name for this sequence
        species       = map.species_2
        sequence_name = species + "_" + str(map.exon_id_2)+"_"+str(map.exon_known_2)

        if reconst_pepseq: 
            sequence_pep[sequence_name] = reconst_pepseq
            pep_aln_length = len(reconst_pepseq)

            reconst_ntseq = expand_pepseq (reconst_pepseq, exon_seqs[1:], flank_length)
            if reconst_ntseq: 
                sequence_dna[sequence_name] = reconst_ntseq
                dna_aln_length = len(reconst_ntseq)

    # strip common gaps
    sequence_stripped_pep = strip_gaps (sequence_pep)
    if not sequence_stripped_pep:  
        c=inspect.currentframe()
        print " in %s:%d" % ( c.f_code.co_filename, c.f_lineno)
        exit(1)
    # strip common gaps
    sequence_stripped_dna = strip_gaps (sequence_dna)
    if not sequence_stripped_dna:  
        c=inspect.currentframe()
        print " in %s:%d" % ( c.f_code.co_filename, c.f_lineno)
        exit(1)

    return [sequence_stripped_pep, sequence_stripped_dna]
def main():

    no_threads = 1
    special = None

    if len(sys.argv) > 1 and len(sys.argv) < 3:
        print "usage: %s <set name> <number of threads> " % sys.argv[0]
        exit(1)
    elif len(sys.argv) == 3:

        special = sys.argv[1]
        special = special.lower()
        if special == 'none': special = None

        no_threads = int(sys.argv[2])

    db = connect_to_mysql()
    cfg = ConfigurationReader()
    cursor = db.cursor()

    # find db ids adn common names for each species db
    [all_species, ensembl_db_name] = get_species(cursor)
    species = 'homo_sapiens'
    switch_to_db(cursor, ensembl_db_name[species])

    if special:
        print "using", special, "set"
        gene_list = get_theme_ids(cursor, ensembl_db_name, cfg, special)
    else:
        print "using all protein coding genes"
        switch_to_db(cursor, ensembl_db_name['homo_sapiens'])
        gene_list = get_gene_ids(cursor, biotype='protein_coding', is_known=1)

    incomplete = 0
    genes_checked = 0
    #for gene_id in gene_list:
    #for gene_id in [743609]:
    for sampling_count in range(1000):

        gene_id = choice(gene_list)
        genes_checked += 1
        with_map = 0
        tot = 0
        switch_to_db(cursor, ensembl_db_name['homo_sapiens'])
        print gene2stable(cursor, gene_id), get_description(cursor, gene_id)

        # find all exons we are tracking in the database
        human_exons = gene2exon_list(cursor, gene_id)
        human_exons.sort(key=lambda exon: exon.start_in_gene)
        has_a_map = False
        for human_exon in human_exons:
            if (not human_exon.is_canonical or not human_exon.is_coding):
                continue
            if verbose:
                print
                print "\t human", human_exon.exon_id, human_exon.is_known
                print "\t ", get_exon_pepseq(cursor, human_exon,
                                             ensembl_db_name['homo_sapiens'])
                print "\t checking maps ..."
            maps = get_maps(cursor, ensembl_db_name, human_exon.exon_id,
                            human_exon.is_known)
            tot += 1
            if maps:
                has_a_map = True
                with_map += 1
                #print "ok"
            else:
                print "no maps for exon", human_exon.exon_id
                continue
            if verbose:
                for map in maps:
                    species = map.species_2
                    exon = map2exon(cursor, ensembl_db_name, map)
                    unaligned_sequence = get_exon_pepseq(
                        cursor, exon, ensembl_db_name[species])
                    if (map.similarity):
                        print "\t", species, map.source, map.exon_id_2, map.exon_known_2
                        print "\tmaps to ", map.exon_id_1, map.exon_known_1
                        print "\tsim", map.similarity,
                        print "\tsource", map.source
                        print "\t", unaligned_sequence
                        if not map.bitmap:
                            print "\t bitmap not assigned"
                        else:
                            bs = Bits(bytes=map.bitmap)
                            reconst_pepseq = ''
                            if (not bs.count(1) == len(unaligned_sequence)):
                                print "\talnd seq mismatch"

                            else:
                                usi = iter(unaligned_sequence)
                                for c in bs.bin:
                                    if c == '0': reconst_pepseq += '-'
                                    else: reconst_pepseq += next(usi)
                                print "\tbinary   : ", bs.bin
                                print "\talnd seq: ", reconst_pepseq
                        print
        if not tot == with_map:
            print "####  gene id: %d   total exons: %d     with map:  %d   ( = %d%%) " % \
                (gene_id,  tot,  with_map, int(float(with_map)/tot*100) )
            incomplete += 1

    print "genes checked: %d,  incomplete: %d" % (genes_checked, incomplete)
    cursor.close()
    db.close()

    print tot, with_map
def main():


    no_threads = 1
    special    = None

    if len(sys.argv) > 1 and  len(sys.argv)<3:
        print "usage: %s <set name> <number of threads> " % sys.argv[0]
        exit(1)
    elif len(sys.argv)==3:

        special = sys.argv[1]
        special = special.lower()
        if special == 'none': special = None

        no_threads = int(sys.argv[2])

    db  = connect_to_mysql()
    cfg = ConfigurationReader()
    cursor = db.cursor()

    # find db ids adn common names for each species db
    [all_species, ensembl_db_name] = get_species (cursor)
    species                        = 'homo_sapiens'
    switch_to_db (cursor,  ensembl_db_name[species])

    if special:
        print "using", special, "set"
        gene_list = get_theme_ids (cursor,  ensembl_db_name, cfg, special )
    else:
        print "using all protein coding genes"
        switch_to_db (cursor,  ensembl_db_name['homo_sapiens'])
        gene_list = get_gene_ids (cursor, biotype='protein_coding', is_known=1)
        
    incomplete = 0
    genes_checked = 0
    #for gene_id in gene_list: 
    #for gene_id in [743609]: 
    for sampling_count in range(1000):
 
        gene_id = choice(gene_list)
        genes_checked += 1
        with_map = 0
        tot      = 0
        switch_to_db (cursor, ensembl_db_name['homo_sapiens'])
        print  gene2stable(cursor, gene_id), get_description (cursor, gene_id)

        # find all exons we are tracking in the database
        human_exons = gene2exon_list(cursor, gene_id)
        human_exons.sort(key=lambda exon: exon.start_in_gene)
        has_a_map = False
        for human_exon in human_exons:
            if (not human_exon.is_canonical or  not human_exon.is_coding): continue
            if verbose:
                print  
                print "\t human",   human_exon.exon_id,  human_exon.is_known
                print "\t ", get_exon_pepseq(cursor, human_exon, ensembl_db_name['homo_sapiens'])
                print "\t checking maps ..."
            maps = get_maps(cursor, ensembl_db_name, human_exon.exon_id, human_exon.is_known)
            tot += 1
            if maps:
                has_a_map = True
                with_map += 1
                #print "ok"
            else:
                print"no maps for exon", human_exon.exon_id
                continue
            if verbose:
                for map in maps:
                    species            = map.species_2
                    exon               = map2exon(cursor, ensembl_db_name, map)
                    unaligned_sequence = get_exon_pepseq(cursor, exon, ensembl_db_name[species])
                    if ( map.similarity):
                        print "\t", species,  map.source, map.exon_id_2, map.exon_known_2
                        print "\tmaps to ",  map.exon_id_1, map.exon_known_1
                        print "\tsim",  map.similarity,
                        print "\tsource",  map.source
                        print "\t", unaligned_sequence
                        if not map.bitmap:
                            print "\t bitmap not assigned"
                        else:
                            bs = Bits(bytes=map.bitmap)
                            reconst_pepseq = ''
                            if (not bs.count(1) == len(unaligned_sequence)): 
                                print "\talnd seq mismatch"
                            
                            else:
                                usi = iter(unaligned_sequence)
                                for c in bs.bin:
                                    if c == '0': reconst_pepseq += '-'
                                    else:        reconst_pepseq += next(usi)
                                print "\tbinary   : ", bs.bin
                                print "\talnd seq: ", reconst_pepseq
                        print
        if not tot== with_map:
            print "####  gene id: %d   total exons: %d     with map:  %d   ( = %d%%) " % \
                (gene_id,  tot,  with_map, int(float(with_map)/tot*100) )
            incomplete += 1

    print "genes checked: %d,  incomplete: %d"  %  (genes_checked, incomplete)
    cursor.close()
    db.close()

    print tot, with_map