def main():
    """
    Print the indices of the sequences in one partition of a data file.

    Command-line arguments: C{<in-file> <part>/<parts>}. The sequence file
    is loaded, the list of sequence indices C{0..len-1} is divided with
    C{partition} (so the division agrees with every other place that
    partitions sequences) and the indices of partition C{<part>} (counted
    from 0) are printed, space-separated, on stdout.

    Exits with status 1, after a message on stderr, if the input file or
    the partition specifier is missing, or if the specifier is not of the
    form C{<int>/<int>} with C{0 <= part < parts}.

    """
    usage = "%prog <in-file> <part>/<parts>"
    description = (
        "Takes a sequence data file, partitions it into the "
        "number of partitions given and prints out the indices of the "
        "sequences the appear in the requested partition. Specify the "
        "partition number (from 0) and total number of partitions in the "
        "form <partition-num>/<total-parts>."
    )
    parser = OptionParser(usage=usage, description=description)
    options, arguments = parser.parse_args()

    if len(arguments) == 0:
        sys.stderr.write("You must specify an input data file\n")
        sys.exit(1)
    elif len(arguments) == 1:
        sys.stderr.write(
            "You must give a partition specifier: <part>/<parts>\n")
        # Without this exit the script went on to index arguments[1]
        # and died with an IndexError
        sys.exit(1)
    filename = os.path.abspath(arguments[0])

    # Parse the specifier. ValueError covers both a wrong number of
    # "/"-separated fields and fields that are not integers
    try:
        part, parts = arguments[1].split("/")
        part, parts = int(part), int(parts)
    except ValueError:
        sys.stderr.write(
            "Invalid partition specifier '%s': expected <part>/<parts>\n"
            % arguments[1])
        sys.exit(1)
    # A negative part would silently select a partition counted from the
    # end of the list, so reject anything outside 0..parts-1
    if parts < 1 or not 0 <= part < parts:
        sys.stderr.write(
            "Partition number must be between 0 and %d (got %d)\n"
            % (parts - 1, part))
        sys.exit(1)

    # Read in the data file
    seqs = SequenceIndex.from_file(filename)

    # Partition the sequences
    indices = range(len(seqs))
    # Use the partition function to ensure this partitioning is consistent
    # with all other places the sequences get partitioned
    all_parts = partition(indices, parts)
    print(" ".join(["%d" % i for i in all_parts[part]]))
def main():
    """
    Split a sequence data file into partitions and their complements.

    Command-line arguments: C{[options] <in-file>}. The sequences in the
    file are divided with C{partition} and C{holdout_partition} into
    C{--partitions} parts (default C{DEFAULT_PARTITIONS}). For partition
    C{i} the sequences are saved to C{<in-file>.partI} and the held-out
    complement to C{<in-file>.heldout_partI}, and a line is reported on
    stderr for each pair.

    With C{--ids} no files are written: one line per partition is printed
    on stdout with the space-separated ids of its sequences.

    Exits with status 1 if no input file is given.

    """
    usage = "%prog [options] <in-file>"
    parser = OptionParser(usage=usage)
    parser.add_option(
        "-p", "--partitions", dest="partitions", action="store", type="int",
        default=DEFAULT_PARTITIONS,
        help="the number of partitions to use (default: %d)"
             % DEFAULT_PARTITIONS)
    parser.add_option(
        "--ids", dest="ids", action="store_true",
        help="don't output any files - just print out a list of the ids "
             "of the sequences in each partition")
    options, arguments = parser.parse_args()

    if len(arguments) == 0:
        sys.stderr.write("You must specify an input data file\n")
        sys.exit(1)
    filename = os.path.abspath(arguments[0])

    # Read in the data file
    seqs = SequenceIndex.from_file(filename)

    # Divide the data up into partitions, with their complements
    parts = zip(partition(seqs.sequences, options.partitions),
                holdout_partition(seqs.sequences, options.partitions))
    # Save each partition and its complement
    for i, (part, heldout) in enumerate(parts):
        if options.ids:
            # Just print out a list of the ids in the partition
            print(" ".join(["%d" % s.id for s in part]))
        else:
            # Build each name in a single formatting step. The filename
            # must never be used as a format string itself, or a "%" in
            # the input path would corrupt the name or raise an error
            part_file = "%s.part%d" % (filename, i)
            heldout_file = "%s.heldout_part%d" % (filename, i)
            save_sequences(part_file, part)
            save_sequences(heldout_file, heldout)
            sys.stderr.write("Wrote partition %d to %s and %s\n"
                             % (i, part_file, heldout_file))
def main():
    """
    Write every partition of a sequence data file, plus its complement.

    Usage: C{[options] <in-file>}.

    The file is read with C{SequenceIndex.from_file} and its sequences are
    split C{--partitions} ways (default C{DEFAULT_PARTITIONS}) by
    C{partition}; C{holdout_partition} supplies the matching held-out
    set for each part. Part C{i} goes to C{<in-file>.partI} and its
    held-out set to C{<in-file>.heldout_partI}; each pair written is
    reported on stderr.

    If C{--ids} is given nothing is saved; instead the ids of the
    sequences of each partition are printed, one partition per line.

    The script exits with status 1 when the input file argument is
    missing.

    """
    parser = OptionParser(usage="%prog [options] <in-file>")
    parser.add_option(
        "-p",
        "--partitions",
        dest="partitions",
        action="store",
        type="int",
        default=DEFAULT_PARTITIONS,
        help="the number of partitions to use (default: %d)" %
        DEFAULT_PARTITIONS)
    parser.add_option(
        "--ids",
        dest="ids",
        action="store_true",
        help="don't output any files - just print out a list of the ids of "
        "the sequences in each partition")
    options, arguments = parser.parse_args()

    if len(arguments) == 0:
        sys.stderr.write("You must specify an input data file\n")
        sys.exit(1)
    filename = os.path.abspath(arguments[0])

    # Read in the data file
    seqs = SequenceIndex.from_file(filename)

    # Divide the data up into partitions, with their complements
    kept_parts = partition(seqs.sequences, options.partitions)
    heldout_parts = holdout_partition(seqs.sequences, options.partitions)

    for i, (part, heldout) in enumerate(zip(kept_parts, heldout_parts)):
        if options.ids:
            # Just print out a list of the ids in the partition
            print(" ".join(["%d" % s.id for s in part]))
            continue
        # The output names are formatted in one step from (filename, i).
        # Building a "%d" pattern from the filename first and formatting
        # it again later breaks on any input path that contains "%"
        part_name = "%s.part%d" % (filename, i)
        heldout_name = "%s.heldout_part%d" % (filename, i)
        save_sequences(part_name, part)
        save_sequences(heldout_name, heldout)
        sys.stderr.write("Wrote partition %d to %s and %s\n" %
                         (i, part_name, heldout_name))
# save_pickled_data is called below for every partition, so it must be
# imported here
from apps.sequences.datautils import save_pickled_data
from apps.sequences.models import ChordSequence
from django.db.models import Q
from jazzparser.utils.data import holdout_partition, partition
import os.path
import sys

# Number of partitions to divide the annotated sequences into
NUM_PARTITIONS = 10
# Base name of the output files: <FILENAME>-<i> and <FILENAME>-<i>-heldout
FILENAME = "partition"

# Build a list of the sequences to put in each partition
# Only include fully annotated sequences
sys.stderr.write("Building list of fully annotated sequences\n")
seqs = [
    seq.id for seq in ChordSequence.objects.filter(analysis_omitted=False)
    if seq.fully_annotated
]
# Pair each partition with its complement (the held-out rest of the data)
partitions = zip(partition(seqs, NUM_PARTITIONS),
                 holdout_partition(seqs, NUM_PARTITIONS))
for i, parts in enumerate(partitions):
    part, rest = parts
    # Output two files for each partition
    part_file = "%s-%d" % (FILENAME, i)
    held_file = "%s-%d-heldout" % (FILENAME, i)
    sys.stderr.write("Outputing partition %d to %s and %s\n"
                     % (i, part_file, held_file))
    # Output the partition's file
    query = Q(id__in=part)
    save_pickled_data(part_file, query)
    # Output the rest of the data
    query = Q(id__in=rest)
    save_pickled_data(held_file, query)
from apps.sequences.datautils import save_pickled_data
from apps.sequences.models import ChordSequence
from django.db.models import Q
from jazzparser.utils.data import holdout_partition, partition
import os.path, sys

# How many ways the annotated sequences are split
NUM_PARTITIONS = 10
# Stem shared by all the output file names
FILENAME = "partition"

# Gather the ids of the sequences that will be partitioned: those not
# marked as analysis-omitted whose annotation is complete
sys.stderr.write("Building list of fully annotated sequences" + "\n")
seqs = []
for seq in ChordSequence.objects.filter(analysis_omitted=False):
    if seq.fully_annotated:
        seqs.append(seq.id)

# Each partition is paired with the complementary, held-out ids
kept_ids = partition(seqs, NUM_PARTITIONS)
heldout_ids = holdout_partition(seqs, NUM_PARTITIONS)

for i, (part, rest) in enumerate(zip(kept_ids, heldout_ids)):
    # Two output files per partition: the part itself and everything else
    part_file = "%s-%d" % (FILENAME, i)
    held_file = "%s-%d-heldout" % (FILENAME, i)
    sys.stderr.write(
        "Outputing partition %d to %s and %s" % (i, part_file, held_file)
        + "\n")
    # Sequences are selected for each file by an id-membership query
    save_pickled_data(part_file, Q(id__in=part))
    save_pickled_data(held_file, Q(id__in=rest))
def prepare_evaluation_options(usage=None, description=None,
        optparse_options=[], check_args=None, optparse_groups=[]):
    """
    Various tasks common to the initial part of the evaluation routine
    scripts (C{models/eval.py}).

    Builds the option parser (standard input/partitioning options plus
    any extra options and groups supplied), parses the command line,
    initialises logging, loads the input sequences and divides them up
    according to C{--partitions}, C{--partition} or C{--sequence}.

    @todo: This is not used any more. Remove it, after checking it's
        definitely not used.

    @param usage: the optparse usage string
    @param description: the optparse description string
    @type optparse_options: list of tuples
    @param optparse_options: (args,kwargs) pairs to add additional
        options to the optparse parser. (The list default is only ever
        iterated, never modified.)
    @type check_args: function
    @param check_args: function to take the command-line arguments and
        check them. This will be called early in the script. Must
        return a tuple of (1) the model name (or model basename) that
        will be used in the partition model names and (2) the input
        filename to get sequences from.
    @type optparse_groups: list of pairs
    @param optparse_groups: specifications for option groups to add to the
        optparse option parser. The first of each pair is a tuple of
        args to C{OptionGroup}'s init (excluding the first).
        The second is a list of options each formatted as
        C{optparse_options}.

    @rtype: tuple
    @return: (1) list of (sequences,model_name,partition_index) tuples
        for each partition; (2) the sequence ids: a list of lists (one
        per partition) with C{--partitions}, the id list of the selected
        partition with C{--partition}, C{[seq.id]} with C{--sequence},
        otherwise C{[None]}; (3) optparse options; (4) optparse
        arguments.

    @raise ValueError: if C{check_args} is C{None}.

    Exits with status 1, after a message on stderr, if the C{--sequence}
    index does not exist or the C{--partition} specifier is not a valid
    C{i/n} with C{0 <= i < n}.

    """
    # logging is needed for logging.DEBUG below: import it here with the
    # rest of this function's dependencies
    import logging
    import sys
    from optparse import OptionParser, OptionGroup
    from jazzparser.utils.config import parse_args_with_config
    from jazzparser.utils.loggers import init_logging
    from jazzparser.data.db_mirrors import SequenceIndex
    from jazzparser.utils.data import partition

    parser = OptionParser(usage=usage, description=description)
    group = OptionGroup(parser, "Input",
                        "Input data and partitioning for evaluation")
    group.add_option("-s", "--sequence", dest="sequence", action="store",
        help="limit the evaluation to just one sequence, with the given index in the input file")
    group.add_option("--partition", dest="partition", action="store",
        help="restrict to only one partition of the data. Specify as i/n, where i is the partition number and n the total number of partitions.")
    group.add_option("-p", "--partitions", dest="partitions", type="int",
        action="store",
        help="test on all n partitions of the data, using a different model for each. Will look for a model <NAME>i, where <NAME> is the given model name and i the partition number.")
    parser.add_option_group(group)

    parser.add_option("--debug", dest="debug", action="store_true",
        help="show debugging output")

    # Add the options according to their specs
    for args, kwargs in optparse_options:
        parser.add_option(*args, **kwargs)

    # Add groups and their options
    for group_args, group_options in optparse_groups:
        # Check whether the group already exists
        same_titles = [g for g in parser.option_groups
                       if g.title == group_args[0]]
        if same_titles:
            group = same_titles[0]
        else:
            group = OptionGroup(parser, *group_args)
            parser.add_option_group(group)
        # Add options to this group
        for args, kwargs in group_options:
            group.add_option(*args, **kwargs)
    options, arguments = parse_args_with_config(parser)

    if check_args is None:
        raise ValueError("could not check arguments and get model "
                         "name. check_args must not be None")
    model_name, input_filename = check_args(arguments)

    if options.debug:
        # Set the log level to debug and do the standard logging init
        init_logging(logging.DEBUG)
    else:
        init_logging()

    # Load up sequences
    seqs = SequenceIndex.from_file(input_filename)

    def _get_seq_by_index(index):
        seq = seqs.sequence_by_index(index)
        if seq is None:
            sys.stderr.write("There are only %d sequences\n" % len(seqs))
            sys.exit(1)
        return seq

    ################ Data partitioning ####################
    if options.partitions is not None:
        # Divide the data up into n partitions and use a different model
        # name for each
        total_parts = options.partitions
        sys.stderr.write("Cross validation: dividing test data into %d "
                         "partitions\n" % total_parts)
        partitions = [
            (part, "%s%d" % (model_name, i), i)
            for i, part in enumerate(partition(seqs.sequences, total_parts))
        ]
        part_ids = partition(seqs.ids, total_parts)
    elif options.partition is not None:
        # Just select one partition
        # Split up the argument to get two integers. ValueError covers
        # both a wrong number of fields and non-integer fields
        try:
            parti, total_parts = options.partition.split("/")
            parti, total_parts = int(parti), int(total_parts)
        except ValueError:
            sys.stderr.write("Invalid partition specifier '%s': expected "
                             "i/n\n" % options.partition)
            sys.exit(1)
        # A negative index would silently pick a partition counted from
        # the end, so only accept 0..n-1
        if total_parts < 1 or not 0 <= parti < total_parts:
            sys.stderr.write("Partition number must be between 0 and %d "
                             "(got %d)\n" % (total_parts - 1, parti))
            sys.exit(1)
        sys.stderr.write("Restricting sequences to %d-way partition %d\n"
                         % (total_parts, parti))
        # Get a list of sequence indices to restrict our set to
        part_ids = partition(seqs.ids, total_parts)[parti]
        # Only the selected partition is kept, named after its own index
        selected = partition(seqs.sequences, total_parts)[parti]
        partitions = [(selected, "%s%d" % (model_name, parti), parti)]
    elif options.sequence is not None:
        # Just select one sequence
        seq = _get_seq_by_index(int(options.sequence))
        partitions = [([seq], model_name, 0)]
        part_ids = [seq.id]
    else:
        # Don't partition the sequences
        partitions = [(seqs.sequences, model_name, 0)]
        part_ids = [None]

    return partitions, part_ids, options, arguments
num_inputs = len(input_data) # Fill the progress record with names and mark as incomplete completed_parses = dict([(name,False) \ for name in input_data.get_identifiers()]) if partitions > 1: if options.sequence_partitions is not None: # Split the inputs up into partitions on the basis of # an even partitioning of chord sequences # This can only be done with if not isinstance(input_data, SegmentedMidiBulkInput): logger.error("option --sequence-partitions is only "\ "valid with bulk midi input data") return 1 chord_seqs = DbBulkInput.from_file(options.sequence_partitions) # Partition the chord sequences: we only need indices seq_indices = enumerate(partition( [i for i in range(len(chord_seqs))], partitions)) seq_partitions = dict( sum([[(index,part_num) for index in part] for (part_num,part) in seq_indices], []) ) # Associate a partition num with each midi input partition_numbers = [ seq_partitions[midi.sequence_index] for midi in input_data] else: # Prepare a list of partition numbers to append to model names partition_numbers = sum([ [partnum for i in part] for (partnum,part) in \ enumerate(partition(range(num_inputs), partitions))], []) else: # Otherwise, there's just one input input_list = [input_data] num_inputs = 1