def __init__(self, top_level_directory, reference_fname, min_length, min_hitidentity, min_hitlength, max_evalue, batch_size, min_alignidentity, min_alignoverlap) :
    """Set up an alignment run rooted at top_level_directory.

    top_level_directory -- project root; alignments go in <root>/alignments
    reference_fname     -- filename of the reference GLT database
    min_length          -- minimum contig length accepted (glutton filter)
    min_hitidentity     -- minimum hit identity (BLAST filter)
    min_hitlength       -- minimum hit length (BLAST filter)
    max_evalue          -- maximum e-value (BLAST filter)
    batch_size          -- number of queries per all-vs-all search batch
    min_alignidentity   -- minimum alignment identity (PAGAN filter)
    min_alignoverlap    -- minimum alignment overlap (PAGAN filter)

    NOTE: statement order matters below - the reference must be set on the
    parameters object before able_to_resume() is queried, and the resulting
    resume flag is passed on to GluttonInformation.
    """
    self.directory = join(top_level_directory, 'alignments')
    self.min_length = min_length                # glutton
    self.min_hitidentity = min_hitidentity      # blast
    self.min_hitlength = min_hitlength          # blast
    self.max_evalue = max_evalue                # blast
    self.min_alignidentity = min_alignidentity  # pagan
    self.min_alignoverlap = min_alignoverlap    # pagan

    # create the alignments directory if it does not already exist
    check_dir(self.directory, create=True)

    self.search = All_vs_all_search(batch_size)
    self.cleanup_files = []     # files to delete when the run is torn down
    self.q = None               # work queue, created later (presumably in start/run - confirm)
    self.lock = threading.Lock()

    # progress counters shared across worker threads, guarded by self.lock
    self.complete_jobs = 0
    self.total_jobs = 0

    self.log = get_log()

    self.param = GluttonParameters(top_level_directory)
    self.db = GluttonDB(reference_fname)

    # record the reference database first, then ask whether a previous
    # run with the same parameters can be resumed
    self.param.set_reference(self.db)
    self.resume = self.param.able_to_resume()

    self.info = GluttonInformation(self.directory, self.param, self.db, resume=self.resume)

    self.param.set_full_checksum()
def copy(self, src) :
    """Copy the file at src into this project's 'data' subdirectory.

    The data directory is created on demand. Returns the copied file's
    path relative to the project directory (i.e. 'data/<filename>').
    """
    filename = basename(src)

    target_dir = join(self.directory, 'data')
    check_dir(target_dir, create=True)

    destination = join(target_dir, filename)
    self.log.info("cp %s %s" % (src, destination))
    shutil.copy(src, destination)

    return join('data', filename)
def __init__(self, project_dir, create=False) :
    """Load (or initialise) the parameters stored in project_dir.

    project_dir -- directory holding the parameter file
    create      -- if True, create project_dir when it is missing
    """
    self.directory = project_dir
    self.create = create

    check_dir(self.directory, create=self.create)

    self.log = get_log()

    # read previously-saved parameters; fall back to an empty template
    # when nothing was stored (or the stored value is empty/falsy)
    self.params = self.load(self.parameter_filename)

    if not self.params :
        template = dict.fromkeys(
            ['db_species',
             'db_release',
             'db_filename',
             'db_checksum',
             'full_checksum',
             'sample_checksum'])
        template['samples'] = {}
        self.params = template
def __init__(self, alignments_dir, parameters, db, resume=True) :
    """Track the state of alignment jobs in alignments_dir.

    alignments_dir -- existing directory containing alignment progress files
    parameters     -- GluttonParameters-style object for this run
    db             -- reference database object
    resume         -- if True, reload progress from files on disk
    """
    self.directory = alignments_dir
    self.params = parameters
    self.db = db

    # directory must already exist - it is created by the alignment stage
    check_dir(self.directory)

    self.log = get_log()
    self.lock = threading.RLock() # a single function requires this be an RLock over a Lock

    # the alignment procedure can take a long time, so everything needs to be
    # restartable, in addition - if we restart it then we need to be sure that
    # the parameters used are the same, i.e.: same reference database etc etc
    self.contig_query_map = {}          # file id -> contig id -> query id (file id is provided by the user, called a 'label')
    self.query_gene_map = {}            # query id -> (gene id, +/-) or None
    self.genefamily_filename_map = {}   # gene family id -> filename

    # repopulate the three maps above from the progress files of a prior run
    if resume :
        self.read_progress_files()
def __init__(self, top_level_directory, reference_fname, assembler_name, protein_identity, alignment_length, min_gene_coverage, do_not_trim=False, testmode='none') :
    """Set up the post-processing stage for a completed alignment run.

    top_level_directory -- project root; reads <root>/alignments, writes <root>/postprocessing
    reference_fname     -- filename of the reference GLT database (must match the
                           reference used for the alignments, else we exit)
    assembler_name      -- name of the assembler that produced the contigs
    protein_identity    -- minimum protein identity threshold
    alignment_length    -- minimum alignment length threshold
    min_gene_coverage   -- minimum gene coverage threshold
    do_not_trim         -- disable trimming when True
    testmode            -- test-mode selector (default 'none')
    """
    self.alignments_dir = join(top_level_directory, 'alignments')
    self.output_dir = join(top_level_directory, 'postprocessing')

    self.protein_identity = protein_identity
    self.alignment_length = alignment_length
    self.min_gene_coverage = min_gene_coverage
    self.trim = not do_not_trim
    self.testmode = testmode

    self.scaffold_dir = join(self.output_dir, 'scaffolds')
    self.genefamily_msa_dir = join(self.output_dir, 'genefamily_msa')
    self.gene_msa_dir = join(self.output_dir, 'gene_msa')

    # create all output directories up front
    for d in (self.output_dir, self.scaffold_dir, self.genefamily_msa_dir, self.gene_msa_dir) :
        check_dir(d, create=True)

    self.log = get_log()

    self.param = GluttonParameters(top_level_directory)
    self.db = GluttonDB(reference_fname)
    self.info = GluttonInformation(self.alignments_dir, self.param, self.db)

    # check reference was the same
    if not self.param.same_reference(self.db) :
        self.log.error("current reference is %s, alignments were performed using %s" % (reference_fname, self.db.filename))
        exit(1) # perhaps slightly overambitious to exit, just stick to a warning

    pending, failures = self.info.num_alignments_not_done()
    if pending != 0 :
        self.log.warn("%d alignments were not run!" % pending)

    self.assembler = AssemblerOutput(assembler_name)

    # matches ORF names like "query39806_orf1"
    # raw string: the old "\_" was an invalid escape sequence (rejected by
    # modern Python); a literal underscore needs no escaping in a regex
    self.orfname_regex = re.compile(r"^(query\d+)_orf(\d)$")
def generic_options(args) :
    """Validate and apply command-line options shared by all subcommands.

    Every check is guarded by hasattr() because different subcommands
    populate different subsets of args. Invalid combinations print an
    error to stderr and exit(1). Logging is configured last, after all
    options have been applied.

    NOTE(review): reconstructed from whitespace-mangled source - the
    statement grouping (e.g. args.output assignment after the gltfile
    existence check) follows the most natural reading; confirm against
    the original formatting if available.
    """
    if hasattr(args, 'database_host') and args.database_host :
        custom_database(args.database_host, args.database_port, args.database_user, args.database_password)

    # threads
    if hasattr(args, 'threads') :
        set_threads(args.threads)

    # tmpdir
    if hasattr(args, 'tmpdir') :
        try :
            check_dir(args.tmpdir, create=True)
            set_tmpdir(args.tmpdir)
        except OSError :
            print >> stderr, "ERROR: %s does not exist..." % args.tmpdir
            exit(1)

    # gltfile
    if hasattr(args, 'gltfile') and args.gltfile :
        if not os.path.isfile(args.gltfile) :
            print >> stderr, "ERROR: %s does not exist..." % args.gltfile
            exit(1)
        # an existing GLT file doubles as the output target
        args.output = args.gltfile

    # either a species or an existing GLT file must be given
    if hasattr(args, 'gltfile') and hasattr(args, 'species') :
        if not args.gltfile and not args.species :
            print >> stderr, "ERROR: you must specify either the species or an existing GLT file..."
            exit(1)

    if hasattr(args, 'reference') and args.reference :
        if not os.path.isfile(args.reference) :
            print >> stderr, "ERROR: %s does not exist..." % args.reference
            exit(1)

    # verbosity
    if hasattr(args, 'verbose') :
        set_verbosity(args.verbose)

    # ensembl download method
    if hasattr(args, 'method') :
        set_ensembl_download_method(args.method)

    # build does not work for ensembl-genomes if the method is "sql"
    if hasattr(args, 'method') and args.method == "sql" :
        if args.database != 'ensembl' :
            print >> stderr, "ERROR: 'sql' does not work for Ensembl-genomes, use 'biomart' instead..."
            exit(1)

    # contigs, labels, species, bam files
    # contigs, labels amd bam files must be unique
    # apart from 'FAKE' bam file
    # length of contigs, label, species must be the same
    # length of bam files is either zero or same as contigs
    if hasattr(args, 'contigs') :
        if args.contigs :
            if not args.sample or not args.species :
                print >> stderr, "ERROR: you must specify one --sample and one --species argument per --contigs file!"
                exit(1)

    # setup subcommand: 'add' needs sample + contigs + species, 'remove' needs sample
    if hasattr(args, 'setupcmd') :
        if args.setupcmd == 'add' and None in (args.sample, args.contigs, args.species) :
            print >> stderr, "ERROR: to add a sample, you must specify sample identifier, contig FASTA file and species name (see glutton setup -h)"
            exit(1)
        elif args.setupcmd == 'remove' and not args.sample :
            print >> stderr, "ERROR: no sample identifier found"
            exit(1)

    setup_logging()