Code example #1
import logging
import sys
from argparse import ArgumentParser
from itertools import chain

import numpy as np

log = logging.getLogger(__name__)

# load_evidence_file, l2_normalize, CorpusWriter, and the labelers module
# are project-local to py-sam and are assumed to be importable here.


def main(argv=None):
    if argv is None:
        argv = sys.argv

    parser = ArgumentParser()
    parser.add_argument('input_file',
                        type=str,
                        help='Input file in evidence format')
    parser.add_argument('output_file',
                        type=str,
                        help='Path to destination corpus file')
    parser.add_argument('--labeler', type=str, help='Labeler to apply')
    options = parser.parse_args(argv[1:])

    labeler = None
    if options.labeler is None:
        log.warning('no labeler provided')
    elif options.labeler not in labelers.registry:
        labeler_names = ', '.join(sorted(labelers.registry.keys()))
        parser.error('Invalid labeler "%s"; available options are %s' %
                     (options.labeler, labeler_names))
    else:
        labeler = labelers.registry[options.labeler]

    instance_dict = load_evidence_file(options.input_file)
    num_docs = len(instance_dict)
    feature_ids = sorted(
        set(chain(*[each.keys() for each in instance_dict.values()])))
    vocab_size = len(feature_ids)
    log.info('Read %d docs (vocabulary size %d) from %s' %
             (num_docs, vocab_size, options.input_file))

    log.info('Writing L2-normalized corpus to %s' % options.output_file)
    writer = CorpusWriter(options.output_file,
                          data_series='sam',
                          dim=vocab_size)

    # Create a map of feature_id => dense feature index
    feature_index = {k: i for i, k in enumerate(feature_ids)}

    # For each document, convert sparse features to dense L2-normalized feature vector and write it to the corpus
    for name, sparse_features in instance_dict.items():
        doc_data = np.zeros((vocab_size, 1))
        for feature_id, count in sparse_features.items():
            doc_data[feature_index[feature_id]] = count
        doc_data = l2_normalize(doc_data)
        doc_label = labeler(name) if labeler else None

        writer.write_doc(doc_data, name=name, label=doc_label)
    writer.close()

    wordlist_path = options.output_file + '.wordlist'
    log.info('Writing wordlist to %s' % wordlist_path)
    with open(wordlist_path, 'w') as f:
        f.writelines([s + '\n' for s in feature_ids])
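
The l2_normalize helper used above is not shown on this page; here is a minimal sketch of the behavior the surrounding code implies (an assumption, not the actual py-sam implementation):

import numpy as np

def l2_normalize(vec):
    # Scale the vector to unit Euclidean length, leaving all-zero
    # vectors untouched to avoid dividing by zero.
    norm = np.linalg.norm(vec)
    return vec / norm if norm > 0 else vec
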
Code example #2
File: cli.py  Project: austinwaters/py-sam
import logging
import os

log = logging.getLogger(__name__)

# Condorizable and VEMTask are project-local to py-sam and assumed importable.


def run_sam_batch(vem_configs):
    """
    Runs SAM on every experimental configuration in 'vem_configs'.  Jobs that have already been run or are
    currently running (i.e. for which the model file already exists, or for which a lock file exists) are skipped.
    """
    for job_settings in vem_configs:
        model_file = job_settings['model']
        if os.path.exists(model_file):
            log.warning('Model %s already exists; skipping' % os.path.basename(model_file))
            continue
        if Condorizable.is_locked(model_file):
            # Note the double quotes: the original 'isn''t' was two adjacent
            # string literals that silently dropped the apostrophe.
            log.warning("Model %s is locked; check that another job isn't writing to this path" %
                        os.path.basename(model_file))
            continue

        VEMTask(kw=job_settings)
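
For reference, a hypothetical call site: the only key the loop actually reads is 'model', so each configuration dict must at least carry the model output path (the 'T' entries below are a made-up setting):

vem_configs = [
    {'model': 'runs/sam_10.model', 'T': 10},
    {'model': 'runs/sam_20.model', 'T': 20},
]
run_sam_batch(vem_configs)  # skips any job whose model file or lock file exists
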
Code example #3
    def check_args(self, argv):
        parser = ArgumentParser()
        parser.add_argument('file_list', type=str, help='File containing list of images to process')
        parser.add_argument('dest_corpus', type=str, help='Path to write GIST corpus')
        parser.add_argument('--labeler', type=str, help='Labeler to apply')
        parser.add_argument('--color', action='store_true', help='Color GIST?')
        options = parser.parse_args(argv[1:])

        if options.labeler is None:
            log.warning('no labeler provided')
        elif options.labeler not in labelers.registry:
            labeler_names = ', '.join(sorted(labelers.registry.keys()))
            parser.error('Invalid labeler "%s"; available options are %s' % (options.labeler, labeler_names))

        if not os.path.exists(options.file_list):
            parser.error('Input file %s does not exist!' % options.file_list)

        self.add_output_file(options.dest_corpus)
        return options
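
The labelers.registry that both check_args and the corpus writer in example #1 validate against is, judging by how it is used, a mapping from labeler names to callables that take a document name and return its label. A self-contained sketch of a registry in that shape (the entries are made up, not py-sam's actual labelers):

import os

# name -> callable, as the 'options.labeler not in labelers.registry'
# check and the later 'labeler(name)' call imply.
registry = {
    'dirname': lambda name: os.path.basename(os.path.dirname(name)),
    'extension': lambda name: os.path.splitext(name)[1].lstrip('.'),
}

labeler = registry['dirname']
print(labeler('/data/cats/img001.jpg'))  # -> 'cats'
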