def main():
    """Pass every species to ``store_exon_seqs_species`` via ``parallelize``.

    Makes sure the ``raw_tables`` output directory exists, obtains the
    species list and ``ensembl_db_name`` from ``get_species``, closes the
    database handles, and then calls ``parallelize`` with 8 as its first
    argument.
    """
    worker_count = 8
    outdir = "raw_tables"
    os.makedirs(outdir, exist_ok=True)

    connection = connect_to_mysql(Config.mysql_conf_file)
    cursor = connection.cursor()
    all_species, ensembl_db_name = get_species(cursor)

    # Disabled debugging aids: report the unprocessed species and/or
    # restrict the run to a subset of species.
    # unprocessed_species = check_species_done(cursor, all_species, ensembl_db_name, outdir)
    # print("unprocessed_species:", len(unprocessed_species))
    # print("\n".join(unprocessed_species))
    # exit()
    # all_species = unprocessed_species
    # all_species = ['monodelphis_domestica']
    # all_species = sample(all_species, 1)

    # The handles are released before parallelize is called.
    cursor.close()
    connection.close()

    parallelize(
        worker_count,
        store_exon_seqs_species,
        all_species,
        [ensembl_db_name, outdir],
    )
def main(): special = 'test' no_threads = 10 method = 'usearch' if len(sys.argv) > 1 and len(sys.argv)<4: print "usage: %s <set name> <number of threads> <method>" % sys.argv[0] exit(1) # exit if bad number of cmdline params elif len(sys.argv)==4: special = sys.argv[1] special = special.lower() if special == 'none': special = None no_threads = int(sys.argv[2]) method = sys.argv[3] if not (method =='usearch' or method=='sw_sharp'): print "unrecognized method: ", method exit(1) # exit if unrecognized search method # sw_sharps chokes if there is only one graphics card if method=='sw_sharp': no_threads = 1 db = connect_to_mysql() cfg = ConfigurationReader() cursor = db.cursor() [all_species, ensembl_db_name] = get_species (cursor) print '=======================================' print sys.argv[0] if special: print "using", special, "set" if special == 'complement': gene_list = get_complement_ids(cursor, ensembl_db_name, cfg) else: gene_list = get_theme_ids (cursor, ensembl_db_name, cfg, special ) else: print "using all protein coding genes" switch_to_db (cursor, ensembl_db_name['homo_sapiens']) gene_list = get_gene_ids (cursor, biotype='protein_coding', is_known=1) cursor.close() db.close() parallelize (no_threads, find_missing_exons, gene_list, [local_db, ensembl_db_name, method]) return True
def main():
    """Run ``collect_paralogues`` over all species via ``parallelize``.

    The species list and ``ensembl_db_name`` come from ``get_species``;
    the database handles are closed before ``parallelize`` is called with
    10 as its first argument.

    Returns:
        True, once ``parallelize`` has returned.
    """
    worker_count = 10

    connection = connect_to_mysql()
    db_cursor = connection.cursor()
    all_species, ensembl_db_name = get_species(db_cursor)
    db_cursor.close()
    connection.close()

    # NOTE(review): local_db is not assigned in this function; presumably
    # a module-level name -- confirm.
    parallelize(
        worker_count,
        collect_paralogues,
        all_species,
        [local_db, ensembl_db_name],
    )

    return True
def main():
    """Run ``make_alignments`` over all species via ``parallelize``.

    The species list and ``ensembl_db_name`` come from ``get_species``;
    the database handles are closed before ``parallelize`` is called with
    1 as its first argument.

    Returns:
        True, once ``parallelize`` has returned.
    """
    worker_count = 1

    connection = connect_to_mysql()
    db_cursor = connection.cursor()
    all_species, ensembl_db_name = get_species(db_cursor)
    db_cursor.close()
    connection.close()

    # NOTE(review): local_db is not assigned in this function; presumably
    # a module-level name -- confirm.
    parallelize(
        worker_count,
        make_alignments,
        all_species,
        [local_db, ensembl_db_name],
    )

    return True
def main(): no_threads = 12 special = None if len(sys.argv) > 1 and len(sys.argv)<3: print "usage: %s <set name> <no of processes>" % sys.argv[0] exit(1) # after usage statement elif len(sys.argv)>=3: special = sys.argv[1] special = special.lower() if special == 'none': special = None no_processes = int(sys.argv[2]) db = connect_to_mysql() cfg = ConfigurationReader() cursor = db.cursor() [all_species, ensembl_db_name] = get_species (cursor) print "running ", sys.argv[0] if special: print "using", special, "set" if special == 'complement': gene_list = get_complement_ids(cursor, ensembl_db_name, cfg) else: gene_list = get_theme_ids (cursor, ensembl_db_name, cfg, special ) else: print "using all protein coding genes" switch_to_db (cursor, ensembl_db_name['homo_sapiens']) gene_list = get_gene_ids (cursor, biotype='protein_coding', is_known=1) cursor.close() db.close() parallelize (no_processes, multiple_exon_alnmt, gene_list, [local_db, ensembl_db_name]) return True
def main(): no_threads = 1 special = '' db = connect_to_mysql() cfg = ConfigurationReader() cursor = db.cursor() [all_species, ensembl_db_name] = get_species (cursor) species = '' if len(sys.argv) > 1 and len(sys.argv)<3 or len(sys.argv) >= 2 and sys.argv[1]=="-h": print "usage: %s <set name/species> <number of processes>" % sys.argv[0] exit(1) # after usage statement elif len(sys.argv)==3: special = sys.argv[1].lower() if special == 'none': special = None elif special in all_species: species = special no_threads = int(sys.argv[2]) print '=======================================' print sys.argv[0] if species: print species, "only" switch_to_db (cursor, ensembl_db_name[species]) if (species=='homo_sapiens'): gene_ids = get_gene_ids (cursor, biotype='protein_coding', is_known=1, ref_only=True) else: gene_ids = get_gene_ids (cursor, biotype='protein_coding') parallelize_args = [no_threads, one_species_all_genes_loop, gene_ids, [local_db, ensembl_db_name, species]] elif special: print "using", special, "set" gene_list = get_theme_ids (cursor, ensembl_db_name, cfg, special ) parallelize_args = [no_threads, ortologues_for_given_genes_loop, gene_list, [local_db, ensembl_db_name]] else: parallelize_args = [no_threads, all_species_all_genes_loop, all_species, [local_db, ensembl_db_name]] cursor.close() db .close() parallelize (*parallelize_args)
def main(): no_threads = 1 special = 'test' if len(sys.argv) > 1 and len(sys.argv) < 3: print "usage: %s <set name> <number of threads>" % sys.argv[0] exit(1) elif len(sys.argv) == 3: special = sys.argv[1] special = special.lower() if special == 'none': special = None no_threads = int(sys.argv[2]) db = connect_to_mysql() cfg = ConfigurationReader() cursor = db.cursor() [all_species, ensembl_db_name] = get_species(cursor) print '=======================================' print sys.argv[0] if special: print "using", special, "set" if special == 'complement': gene_list = get_complement_ids(cursor, ensembl_db_name, cfg) else: gene_list = get_theme_ids(cursor, ensembl_db_name, cfg, special) else: print "using all protein coding genes" switch_to_db(cursor, ensembl_db_name['homo_sapiens']) gene_list = get_gene_ids(cursor, biotype='protein_coding', is_known=1) cursor.close() db.close() parallelize(no_threads, exon_cleanup, gene_list, [local_db, ensembl_db_name]) return True
def main(): no_threads = 1 special = 'one' if len(sys.argv) > 1 and len(sys.argv)<3: print "usage: %s <set name> <number of threads> <method>" exit(1) elif len(sys.argv)==3: special = sys.argv[1] special = special.lower() if special == 'none': special = None no_threads = int(sys.argv[2]) db = connect_to_mysql() cfg = ConfigurationReader() cursor = db.cursor() [all_species, ensembl_db_name] = get_species (cursor) print '=======================================' print sys.argv[0] if special: print "using", special, "set" if special == 'complement': gene_list = get_complement_ids(cursor, ensembl_db_name, cfg) else: gene_list = get_theme_ids (cursor, ensembl_db_name, cfg, special ) else: print "using all protein coding genes that have an sw# or usearch patch" switch_to_db (cursor, ensembl_db_name['homo_sapiens']) gene_list = human_genes_w_novel_exon_orthologues (cursor, ensembl_db_name) cursor.close() db.close() parallelize (no_threads, maps_for_gene_list, gene_list[0:15000], [local_db, ensembl_db_name]) return True
def main(): no_threads = 10 special = None if len(sys.argv) > 1 and len(sys.argv)<3: print "usage: %s <set name> <number of threads> " % sys.argv[0] exit(1) elif len(sys.argv)==3: special = sys.argv[1] special = special.lower() if special == 'none': special = None no_threads = int(sys.argv[2]) db = connect_to_mysql() cfg = ConfigurationReader() cursor = db.cursor() [all_species, ensembl_db_name] = get_species (cursor) print '=======================================' print sys.argv[0] if special: print "using", special, "set" gene_list = get_theme_ids (cursor, ensembl_db_name, cfg, special ) else: print "using all protein coding genes" switch_to_db (cursor, ensembl_db_name['homo_sapiens']) gene_list = get_gene_ids (cursor, biotype='protein_coding', is_known=1) cursor.close() db.close() parallelize (no_threads, maps_for_gene_list, gene_list, [local_db, ensembl_db_name]) return True
def main():
    """Run ``multiple_alignment_genes`` on a hard-coded gene list.

    Calls ``get_species`` for ``ensembl_db_name``, closes the database
    handles, and passes the gene list together with the representative
    species ('monodelphis_domestica') to ``parallelize`` with 1 as its
    first argument.

    Returns:
        True, once ``parallelize`` has returned.
    """
    # TODO: split codons (the original note read "cplit codons")
    no_processes = 1
    rep_species = 'monodelphis_domestica'

    connection = connect_to_mysql(Config.mysql_conf_file)
    cursor = connection.cursor()
    all_species, ensembl_db_name = get_species(cursor)

    # Disabled alternative: sample 10 genes from those that have exon maps.
    # switch_to_db(cursor, ensembl_db_name[rep_species])
    # qry = "select distinct(g.gene_id) from exon_map e left join gene2exon g on e.exon_id = g.exon_id"
    # genes_with_maps = [ret[0] for ret in hard_landing_search(cursor, qry)]
    # print(f"genes with maps: {len(genes_with_maps)}")
    # gene_list = sample(genes_with_maps, 10)

    # NARS; only three exons out of 14, and not the same species as the
    # ones for which I have the full sequence
    gene_list = [8979]

    cursor.close()
    connection.close()

    parallelize(
        no_processes,
        multiple_alignment_genes,
        gene_list,
        [rep_species, ensembl_db_name],
    )

    return True
def main(): """ Main entry point, but in reality does nothing except taking care of the parallelization. The parallelization here is per-species. """ no_threads = 1 special = '' if len(sys.argv) > 1 and len(sys.argv)<3 or len(sys.argv) >= 2 and sys.argv[1]=="-h": print "usage: %s <set name> <number of threads>" % sys.argv[0] exit(1) # after usage statment elif len(sys.argv)==3: special = sys.argv[1].lower() if special == 'none': special = None no_threads = int(sys.argv[2]) db = connect_to_mysql() cfg = ConfigurationReader() cursor = db.cursor() [all_species, ensembl_db_name] = get_species (cursor) print '=======================================' print sys.argv[0] if special: print "using", special, "set" gene_list = get_theme_ids (cursor, ensembl_db_name, cfg, special ) cursor.close() db .close() # two version of the main loop: # 1) over all species, and all genes in each speceis if not special: parallelize (no_threads, store_exon_seqs, all_species, [local_db, ensembl_db_name]) else: parallelize (no_threads, store_exon_seqs_special, gene_list, [local_db, ensembl_db_name])