Ejemplo n.º 1
0
    def __init__(self, top_level_directory, reference_fname, min_length, min_hitidentity, min_hitlength, max_evalue, batch_size, min_alignidentity, min_alignoverlap) :
        """
        Set up an alignment run underneath top_level_directory.

        top_level_directory -- project directory; the working directory is
                               join(top_level_directory, 'alignments') and is
                               passed to check_dir with create=True
        reference_fname     -- handed to GluttonDB to open the reference
        min_length          -- stored as self.min_length
        min_hitidentity, min_hitlength, max_evalue
                            -- stored on the instance (tagged 'blast' below)
        batch_size          -- handed to All_vs_all_search
        min_alignidentity, min_alignoverlap
                            -- stored on the instance (tagged 'pagan' below)
        """
        # thresholds, each tagged with the tool it is associated with
        self.min_length        = min_length         # glutton
        self.min_hitidentity   = min_hitidentity    # blast
        self.min_hitlength     = min_hitlength      # blast
        self.max_evalue        = max_evalue         # blast
        self.min_alignidentity = min_alignidentity  # pagan
        self.min_alignoverlap  = min_alignoverlap   # pagan

        # bookkeeping that starts out empty
        self.cleanup_files = []
        self.q             = None
        self.complete_jobs = 0
        self.total_jobs    = 0

        # working directory for this run
        self.directory = join(top_level_directory, 'alignments')
        check_dir(self.directory, create=True)

        self.search = All_vs_all_search(batch_size)
        self.lock   = threading.Lock()
        self.log    = get_log()

        # project parameters are tied to the reference database
        self.param = GluttonParameters(top_level_directory)
        self.db    = GluttonDB(reference_fname)
        self.param.set_reference(self.db)

        # the parameters decide whether an earlier run can be resumed
        can_resume  = self.param.able_to_resume()
        self.resume = can_resume

        self.info = GluttonInformation(self.directory, self.param, self.db, resume=can_resume)
        self.param.set_full_checksum()
Ejemplo n.º 2
0
Archivo: info.py Proyecto: ajm/glutton
 def copy(self, src) :
     """
     Copy the file src into the 'data' sub-directory of self.directory.

     check_dir is called on the 'data' directory with create=True before
     the copy, and the copy is logged as a "cp" line.

     Returns join('data', basename(src)), i.e. the location of the copy
     expressed relative to self.directory.
     """
     fname = basename(src)

     target_dir = join(self.directory, 'data')
     check_dir(target_dir, create=True)

     target = join(target_dir, fname)
     self.log.info("cp %s %s" % (src, target))
     shutil.copy(src, target)

     return join('data', fname)
Ejemplo n.º 3
0
Archivo: info.py Proyecto: ajm/glutton
    def __init__(self, project_dir, create=False) :
        """
        Bind this object to project_dir and load its parameters.

        project_dir -- project directory, stored as self.directory
        create      -- forwarded to check_dir as its create argument

        self.params is whatever self.load(self.parameter_filename) returns;
        if that result is falsy, a fresh parameter dict is used instead in
        which every field is None and 'samples' is an empty dict.
        """
        self.directory = project_dir
        self.create = create
        check_dir(project_dir, create=create)

        self.log = get_log()

        stored = self.load(self.parameter_filename)

        if stored :
            self.params = stored
        else :
            # nothing usable was loaded: start from a blank parameter set
            blank = dict.fromkeys(('db_species',
                                   'db_release',
                                   'db_filename',
                                   'db_checksum',
                                   'full_checksum',
                                   'sample_checksum'))
            blank['samples'] = {}
            self.params = blank
Ejemplo n.º 4
0
Archivo: info.py Proyecto: ajm/glutton
    def __init__(self, alignments_dir, parameters, db, resume=True) :
        """
        Track the state of the alignments kept in alignments_dir.

        alignments_dir -- stored as self.directory and passed to check_dir
        parameters     -- stored as self.params
        db             -- stored as self.db
        resume         -- when true, self.read_progress_files() is called
                          once the (empty) lookup tables have been created
        """
        self.directory = alignments_dir
        check_dir(self.directory)

        self.params = parameters
        self.db = db

        self.log = get_log()

        # a single function requires this be an RLock over a Lock
        self.lock = threading.RLock()

        # Alignments can run for a long time, so the whole procedure has to be
        # restartable. On a restart the parameters must be the same as before
        # (same reference database and so on).

        # file id -> contig id -> query id
        # (the file id is provided by the user, called a 'label')
        self.contig_query_map = {}

        # query id -> (gene id, +/-) or None
        self.query_gene_map = {}

        # gene family id -> filename
        self.genefamily_filename_map = {}

        if resume :
            self.read_progress_files()
Ejemplo n.º 5
0
    def __init__(self, top_level_directory, reference_fname, assembler_name, protein_identity, alignment_length, min_gene_coverage, do_not_trim=False, testmode='none') :
        """
        Set up post-processing of the alignments under top_level_directory.

        top_level_directory -- project directory; alignments are read from
                               <dir>/alignments, output goes to
                               <dir>/postprocessing
        reference_fname     -- handed to GluttonDB to open the reference
        assembler_name      -- handed to AssemblerOutput
        protein_identity    -- stored as self.protein_identity
        alignment_length    -- stored as self.alignment_length
        min_gene_coverage   -- stored as self.min_gene_coverage
        do_not_trim         -- stored inverted, as self.trim
        testmode            -- stored as self.testmode

        The output directory and its 'scaffolds', 'genefamily_msa' and
        'gene_msa' sub-directories are passed to check_dir with create=True.

        Logs an error and calls exit(1) if self.param.same_reference(self.db)
        is false. Logs a warning (and carries on) if the alignment
        information reports pending alignments.
        """
        self.alignments_dir     = join(top_level_directory, 'alignments')
        self.output_dir         = join(top_level_directory, 'postprocessing')
        self.protein_identity   = protein_identity
        self.alignment_length   = alignment_length
        self.min_gene_coverage  = min_gene_coverage
        self.trim               = not do_not_trim
        self.testmode           = testmode

        self.scaffold_dir       = join(self.output_dir, 'scaffolds')
        self.genefamily_msa_dir = join(self.output_dir, 'genefamily_msa')
        self.gene_msa_dir       = join(self.output_dir, 'gene_msa')

        check_dir(self.output_dir, create=True)
        check_dir(self.scaffold_dir, create=True)
        check_dir(self.genefamily_msa_dir, create=True)
        check_dir(self.gene_msa_dir, create=True)

        self.log = get_log()

        self.param = GluttonParameters(top_level_directory)
        self.db = GluttonDB(reference_fname)
        self.info = GluttonInformation(self.alignments_dir, self.param, self.db)

        # check reference was the same
        if not self.param.same_reference(self.db) :
            # NOTE(review): self.db was opened from reference_fname, so both
            # values printed here appear to come from the current reference;
            # confirm whether the second one should instead be the reference
            # recorded in self.param.
            self.log.error("current reference is %s, alignments were performed using %s" % (reference_fname, self.db.filename))
            exit(1)

        # perhaps slightly overambitious to exit, just stick to a warning
        # (only the pending count is used; the second value is ignored)
        pending, _ = self.info.num_alignments_not_done()
        if pending != 0 :
            self.log.warn("%d alignments were not run!" % pending)

        self.assembler = AssemblerOutput(assembler_name)

        # e.g. query39806_orf1
        # raw string: the pattern value is unchanged, but \d and \_ are no
        # longer treated as (invalid) string-literal escape sequences
        self.orfname_regex = re.compile(r"^(query\d+)\_orf(\d)$")
Ejemplo n.º 6
0
Archivo: main.py Proyecto: ajm/glutton
def generic_options(args) :
    """
    Validate and apply the options carried by the parsed arguments, args.

    An option is only examined when args actually has the corresponding
    attribute. Every failed check prints an "ERROR: ..." line to stderr and
    calls exit(1). setup_logging() is called last, once all checks passed.
    """
    def bail(message) :
        # report a fatal problem on stderr and stop the program
        print >> stderr, message
        exit(1)

    def given(name) :
        # the option exists on args and holds a truthy value
        return hasattr(args, name) and getattr(args, name)

    # custom database connection
    if given('database_host') :
        custom_database(args.database_host,
                        args.database_port,
                        args.database_user,
                        args.database_password)

    # threads
    if hasattr(args, 'threads') :
        set_threads(args.threads)

    # tmpdir
    if hasattr(args, 'tmpdir') :
        try :
            check_dir(args.tmpdir, create=True)
            set_tmpdir(args.tmpdir)

        except OSError :
            bail("ERROR: %s does not exist..." % args.tmpdir)

    # gltfile: when provided it must exist, and it doubles as the output
    if given('gltfile') :
        if not os.path.isfile(args.gltfile) :
            bail("ERROR: %s does not exist..." % args.gltfile)

        args.output = args.gltfile

    # at least one of gltfile / species is required when both are options
    if hasattr(args, 'gltfile') and hasattr(args, 'species') and not (args.gltfile or args.species) :
        bail("ERROR: you must specify either the species or an existing GLT file...")

    # reference
    if given('reference') and not os.path.isfile(args.reference) :
        bail("ERROR: %s does not exist..." % args.reference)

    # verbosity
    if hasattr(args, 'verbose') :
        set_verbosity(args.verbose)

    # ensembl download method
    if hasattr(args, 'method') :
        set_ensembl_download_method(args.method)

        # build does not work for ensembl-genomes if the method is "sql"
        if args.method == "sql" and args.database != 'ensembl' :
            bail("ERROR: 'sql' does not work for Ensembl-genomes, use 'biomart' instead...")

    # contigs, labels, species, bam files
    # contigs, labels and bam files must be unique
    #   apart from 'FAKE' bam file
    # length of contigs, label, species must be the same
    # length of bam files is either zero or same as contigs
    if hasattr(args, 'contigs') and args.contigs and not (args.sample and args.species) :
        bail("ERROR: you must specify one --sample and one --species argument per --contigs file!")

    # setup sub-commands
    if hasattr(args, 'setupcmd') :
        command = args.setupcmd

        if command == 'add' and None in (args.sample, args.contigs, args.species) :
            bail("ERROR: to add a sample, you must specify sample identifier, contig FASTA file and species name (see glutton setup -h)")

        elif command == 'remove' and not args.sample :
            bail("ERROR: no sample identifier found")

    setup_logging()