def generate(output, dim):
    """Write reproducible random dense vectors for all document features.

    Seeds numpy's RNG with 0 so repeated runs produce identical output,
    builds a (n_phrases x dim) random matrix indexed by every document
    feature (unigrams included), and dumps it to *output* in dense HDF5
    TSV format.
    """
    np.random.seed(0)  # fixed seed -> deterministic vectors
    feature_names = ['rand%d' % col for col in range(dim)]
    entries = list(get_all_document_features(include_unigrams=True))
    matrix = np.random.random((len(entries), dim))
    frame = pd.DataFrame(matrix, index=entries, columns=feature_names)
    DenseVectors(frame).to_tsv(output, dense_hd5=True)
# Esempio n. 2
def compose_and_write_vectors(unigram_vectors, short_vector_dataset_name, composer_classes, remove_pos=False,
                              pretrained_Baroni_composer_file=None, pretrained_Guevara_composer_file=None,
                              pretrained_Gref_composer_file=None, categorical_vector_matrix_file=None,
                              output_dir='.', gzipped=True, dense_hd5=False,
                              row_filter=default_row_filter):
    """
    Extracts all composable features from a labelled classification corpus and dumps a composed vector for each of them
    to disk. The output file will also contain all unigram vectors that were passed in, and only unigrams!

    :param unigram_vectors: a file in Byblo events format that contains vectors for all unigrams OR
    a Vectors object. This will be used in the composition process.
    :type unigram_vectors: str or Vectors
    :param short_vector_dataset_name: short name used in the output file name for each composer
    :param composer_classes: what composers to use
    :type composer_classes: list
    :param remove_pos: whether to strip PoS tags from the document features before composing
    :param pretrained_Baroni_composer_file: path to pre-trained Baroni AN/NN composer file
    :param pretrained_Guevara_composer_file: path to pre-trained Guevara composer file
    :param pretrained_Gref_composer_file: path to pre-trained Grefenstette multistep composer file
    :param categorical_vector_matrix_file: matrix file for the categorical composers
        (CopyObject, FrobeniusAdd, FrobeniusMult)
    :param output_dir: directory the composed event files are written to
    :param gzipped: gzip the sparse output (ignored when dense_hd5 is True)
    :param dense_hd5: write dense HDF5 output instead of sparse text
    :param row_filter: predicate applied to rows when loading vectors from a file
    :raises ValueError: if a composer requiring a pre-trained model file is requested without one
    """

    phrases_to_compose = get_all_document_features(remove_pos=remove_pos)
    # if this isn't a Vectors object assume it's the name of a file containing vectors and load them
    if not isinstance(unigram_vectors, Vectors):
        # ensure there's only unigrams in the set of unigram vectors
        # composers do not need any ngram vectors contained in this file, they may well be
        # observed ones
        unigram_vectors = Vectors.from_tsv(unigram_vectors,
                                           row_filter=row_filter)
    # log in both paths (previously only logged when loading from file)
    logging.info('Starting composition with %d unigram vectors', len(unigram_vectors))

    # doing this loop in parallel isn't worth it as pickling or shelving `vectors` is so slow
    # it negates any gains from using multiple cores
    for composer_class in composer_classes:
        # explicit raises instead of `assert`: asserts are stripped under `python -O`
        if composer_class == BaroniComposer:
            if pretrained_Baroni_composer_file is None:
                raise ValueError('BaroniComposer requires pretrained_Baroni_composer_file')
            composer = BaroniComposer(unigram_vectors, pretrained_Baroni_composer_file)
        elif composer_class == GuevaraComposer:
            if pretrained_Guevara_composer_file is None:
                raise ValueError('GuevaraComposer requires pretrained_Guevara_composer_file')
            composer = GuevaraComposer(unigram_vectors, pretrained_Guevara_composer_file)
        elif composer_class == GrefenstetteMultistepComposer:
            if pretrained_Gref_composer_file is None:
                raise ValueError('GrefenstetteMultistepComposer requires pretrained_Gref_composer_file')
            composer = GrefenstetteMultistepComposer(unigram_vectors, pretrained_Gref_composer_file)
        elif composer_class in [CopyObject, FrobeniusAdd, FrobeniusMult]:
            composer = composer_class(categorical_vector_matrix_file, unigram_vectors)
        else:
            composer = composer_class(unigram_vectors)

        try:
            # compose_all returns all unigrams and composed phrases
            mat, cols, rows = composer.compose_all(phrases_to_compose)

            events_path = os.path.join(output_dir,
                                       'composed_%s_%s.events.filtered.strings' % (short_vector_dataset_name,
                                                                                   composer.name))
            if dense_hd5:
                write_vectors_to_hdf(mat, rows, cols, events_path)
            else:
                # invert the row mapping so the writer can look up features by index
                rows2idx = {i: DocumentFeature.from_string(x) for (x, i) in rows.items()}
                write_vectors_to_disk(mat.tocoo(), rows2idx, cols, events_path,
                                      entry_filter=lambda x: x.type in {'AN', 'NN', 'VO', 'SVO', '1-GRAM'},
                                      gzipped=gzipped)
        except ValueError as e:
            # best-effort: a single failing composer must not abort the rest
            logging.error('RED ALERT, RED ALERT')
            logging.error(e)
            continue