def test_loading_from_h5():
    t1 = Vectors.from_tsv('discoutils/tests/resources/exp0-0a.strings')
    t2 = Vectors.from_tsv('discoutils/tests/resources/exp0-0a.strings.h5')
    for k in t1.keys():
        assert k in t2
        v1 = t1.get_vector(k)
        v2 = t2.get_vector(k)
        np.testing.assert_array_equal(v1.A, v2.A)

def train_verb_tensors(svos_file, noun_vectors_file, output_filename):
    """
    Trains Verb-bar matrices, as described in Milajevs et al. (EMNLP-14, §3)

    :param svos_file: file containing a list of all SVOs in unlabelled data, one per line. May contain
     other document features too. Such a file is output by `find_all_NPs.py`, which is called from
     `observed_vectors.py`
    :param noun_vectors_file: a vector store containing noun vectors
    :param output_filename: name of output file. Must identify the noun vectors and the unlabelled corpus
    """
    mkdirs_if_not_exists(os.path.dirname(output_filename))

    v = Vectors.from_tsv(noun_vectors_file)

    with open(svos_file) as infile:
        phrases = set()
        for line in infile:
            if DocumentFeature.from_string(line.strip()).type == 'SVO':
                phrases.add(tuple(line.strip().split('_')))
    phrases = [(subj, verb, obj) for subj, verb, obj in phrases if subj in v and obj in v]
    phrases = sorted(phrases, key=itemgetter(1))
    logging.info('Found %d SVOs in list', len(phrases))

    verb_tensors = dict()
    for verb, svos in groupby(phrases, itemgetter(1)):
        svos = list(svos)
        if len(svos) < MIN_SVO_PER_VERB:
            continue
        logging.info('Training matrix for %s from %d SVOs', verb, len(svos))
        # sum of outer products; use the builtin sum, as np.sum over a generator is deprecated
        vt = sum(np.outer(v.get_vector(subj).A, v.get_vector(obj).A) for subj, _, obj in svos)
        verb_tensors[verb] = vt

    logging.info('Trained %d verb matrices, saving...', len(verb_tensors))
    for verb, tensor in verb_tensors.items():
        df = pd.DataFrame(tensor)
        df.to_hdf(output_filename, verb.split('/')[0], complevel=9, complib='zlib')

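# Illustrative sketch (not part of the pipeline): how a single verb matrix is built.
# For d-dimensional noun vectors, the sum of outer products over a verb's
# (subject, object) pairs yields a d x d matrix. The toy vectors below are made up;
# in train_verb_tensors they come from Vectors.get_vector.
def _example_verb_matrix():
    import numpy as np
    svo_noun_pairs = [(np.array([1.0, 0.0, 2.0]), np.array([0.5, 1.0, 0.0])),
                      (np.array([0.0, 1.0, 1.0]), np.array([1.0, 0.0, 0.5]))]
    verb_matrix = sum(np.outer(subj, obj) for subj, obj in svo_noun_pairs)
    assert verb_matrix.shape == (3, 3)
    return verb_matrix
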
def _do_feature_selection(must_be_in_thesaurus, k, handler='Base', vector_source='default',
                          max_feature_len=1, delete_kid=False):
    """
    Loads a data set, vectorizes it by extracting n-grams (default n=1) using a feature handler
    (default BaseFeatureHandler), then performs feature selection based on either a vector source
    or on chi2 scores. Returns the encode/decode matrices and the stripped vocabulary of the
    Vectorizer after feature selection.

    The default vector source covers all unigrams in the training set (its feature vectors are
    made up) and does not know about n-grams. Optionally, another vector source can be passed in.
    """
    handler_pattern = 'eval.pipeline.feature_handlers.{}FeatureHandler'
    raw_data, data_ids = load_text_data_into_memory(
        training_path='tests/resources/test-tr',
        test_path='tests/resources/test-ev',
    )

    tokenizer = XmlTokenizer()
    x_train, y_train, x_test, y_test = tokenize_data(raw_data, tokenizer, data_ids)

    if vector_source == 'default':
        unigrams_vect = Vectors.from_tsv('tests/resources/thesauri/exp0-0a.txt.events-unfiltered.strings')
        vector_source = unigrams_vect

    if delete_kid:
        # the set of vectors we load from disk covers all unigrams in the training set,
        # which makes it boring; let's remove one entry
        del unigrams_vect['kid/N']
        unigrams_vect.matrix = unigrams_vect.matrix[:, :-1]

    if max_feature_len == 1:
        # extract only unigram features
        feat_extr_opts = {'extract_unigram_features': ['J', 'N', 'V'],
                          'extract_phrase_features': []}
        standard_ngram_features = 0
    else:
        feat_extr_opts = {'extract_unigram_features': ['J', 'N', 'V'],
                          'extract_phrase_features': ['AN', 'NN', 'VO', 'SVO']}
        standard_ngram_features = max_feature_len

    feature_extractor = FeatureExtractor(standard_ngram_features=standard_ngram_features).update(**feat_extr_opts)

    pipeline_list = [
        ('vect', ThesaurusVectorizer(min_df=1, use_tfidf=False,
                                     decode_token_handler=handler_pattern.format(handler))),
        ('fs', VectorBackedSelectKBest(must_be_in_thesaurus=must_be_in_thesaurus, k=k)),
        ('dumper', FeatureVectorsCsvDumper('fs-test'))
    ]
    p = Pipeline(pipeline_list)
    fit_params = {'vect__vector_source': vector_source,
                  'vect__train_time_extractor': feature_extractor,
                  'vect__decode_time_extractor': feature_extractor,
                  'fs__vector_source': vector_source}

    tr_matrix, tr_voc = p.fit_transform(x_train, y_train, **fit_params)
    if 'fs' in p.named_steps:
        p.named_steps['vect'].vocabulary_ = p.named_steps['fs'].vocabulary_
    ev_matrix, ev_voc = p.transform(x_test)
    return tr_matrix.A, strip(tr_voc), ev_matrix.A, strip(ev_voc)

def cluster_vectors(path_to_vectors, output_path, n_clusters=100, noise=0, n_jobs=4):
    """
    Clusters all entries in a vectors file with KMeans and writes the entry-to-cluster
    assignment to HDF, indexed by entry name
    """
    vectors = Vectors.from_tsv(path_to_vectors, noise=noise)
    km = KMeans(n_clusters=n_clusters, n_jobs=n_jobs, random_state=0, verbose=1)
    clusters = km.fit_predict(vectors.matrix)
    num2word = np.array(vectors.row_names)
    idx = np.argsort(num2word)
    df = pd.DataFrame(dict(clusters=clusters[idx]), index=num2word[idx])
    df.to_hdf(output_path, key='clusters', complevel=9, complib='zlib')

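# Illustrative usage sketch (the paths and the entry name are hypothetical): the clusters
# written above can be read back with pandas and looked up by entry, e.g.
#
#   cluster_vectors('vectors.txt', 'clusters.h5', n_clusters=50)
#   df = pd.read_hdf('clusters.h5', key='clusters')
#   df.loc['dog/N', 'clusters']  # cluster id assigned to the entry 'dog/N'
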
def merge_vectors(composed_dir, unigrams, output, workers=4, chunk_size=10000):
    # this particular dataset uses spaces instead of underscores. State this to avoid parsing issues
    DocumentFeature.ngram_separator = " "
    DIMS = 100  # SVD dimensionality

    files = glob(os.path.join(composed_dir, "*apt.vec.gz"))
    logging.info("Found %d composed phrase files", len(files))

    # ignore stuff that isn't unigrams, it will cause problems later
    unigrams = Vectors.from_tsv(unigrams, row_filter=lambda x, y: y.type == "1-GRAM")
    logging.info("Found %d unigram vectors", len(unigrams))
    mat, cols, rows = unigrams.to_sparse_matrix()
    unigrams.v.vocabulary_ = {x: i for i, x in enumerate(list(cols))}
    cols = set(cols)

    svd = TruncatedSVD(DIMS, random_state=0)
    logging.info("Reducing dimensionality of matrix of shape %r...", mat.shape)
    start = time.time()
    reduced_mat = svd.fit_transform(mat)
    logging.info("Reduced using {} from shape {} to shape {} in {} seconds".format(
        svd, mat.shape, reduced_mat.shape, time.time() - start))
    write_vectors_to_hdf(reduced_mat, rows,
                         ["SVD:feat{0:03d}".format(i) for i in range(reduced_mat.shape[1])],
                         "%s-unigrams-SVD%d" % (output, DIMS))
    del mat

    for i, chunk in enumerate(grouper(chunk_size, files)):
        d = {}
        logging.info("Reading composed vectors, chunk %d...", i)
        for phrase, features in Parallel(n_jobs=workers)(delayed(_read_vector)(f) for f in chunk if f):
            if features:
                d[phrase] = features
        logging.info("Found %d non-empty composed vectors in this chunk, running SVD now...", len(d))
        if not d:
            continue

        composed_vec = Vectors(d, column_filter=lambda foo: foo in cols)
        # vectorize the second matrix with the vocabulary (columns) of the first thesaurus
        # to ensure shapes match: "project" the composed matrix into the space of the unigram thesaurus
        extra_matrix = unigrams.v.transform([dict(fv) for fv in composed_vec.values()])
        assert extra_matrix.shape == (len(composed_vec), len(cols))
        logging.info("Composed matrix is of shape %r before SVD", extra_matrix.shape)
        extra_matrix = svd.transform(extra_matrix)

        write_vectors_to_hdf(extra_matrix, list(composed_vec.keys()),
                             ["SVD:feat{0:03d}".format(i) for i in range(extra_matrix.shape[1])],
                             "%s-phrases-chunk%d-SVD%d" % (output, i, DIMS))
        del composed_vec

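# Standalone sketch of the column-alignment ("projection") trick used above, assuming
# unigrams.v behaves like sklearn's DictVectorizer (toy data, hypothetical feature names):
def _example_project_into_fixed_columns():
    from sklearn.feature_extraction import DictVectorizer
    v = DictVectorizer(sparse=True)
    v.fit([{'f1': 1, 'f2': 2}])  # fixes the column vocabulary to ['f1', 'f2']
    # transforming new data reuses those columns; unseen features are silently dropped,
    # so the new matrix is guaranteed to have the same width as the fitted one
    mat = v.transform([{'f2': 5, 'unseen': 9}])
    assert mat.shape == (1, 2)
    return mat
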
def _generate_hdf_gzip_repr(kind, tmpdir, v):
    if kind == 'txt':
        # just read the plaintext file
        return v
    else:
        outfile = str(tmpdir.join('events.txt'))
        if kind == 'gz':
            v.to_tsv(outfile, gzipped=True)
        if kind == 'hdf':
            v.to_tsv(outfile, dense_hd5=True)
        return Vectors.from_tsv(outfile)

def _translate_byblo_to_dissect(events_file, row_transform=lambda x: x):
    """
    Translates a Byblo-made vectors file to DISSECT format in the absence of features/entries files.
    Writes DISSECT-compatible data files next to the input file.

    :param events_file: path to Byblo-made vectors
    :type events_file: str
    """
    # remove duplicate head noun vectors, converting to a DISSECT sparse matrix format
    logging.info('Converting %s to DISSECT format', events_file)
    t = Vectors.from_tsv(events_file)
    t.to_dissect_sparse_files(events_file, row_transform=row_transform)

def test_all_neighbours_overlap(call_init):
    FEATURE = 'daily/J_pais/N'
    v = Vectors.from_tsv('tests/resources/only_overlapping.txt',
                         allow_lexical_overlap=False)
    mv = MultiVectors([v] * 3)
    if call_init:
        mv.init_sims()

    assert FEATURE in v
    assert FEATURE in mv

    # the feature is contained in the vector set, but when we look up its neighbours
    # they all overlap with it, so nothing is left
    assert mv.get_nearest_neighbours(FEATURE) == []
    assert mv.get_nearest_neighbours('asdf') == []
    assert mv.get_nearest_neighbours('pais/N') is not None

def test_loading_unordered_feature_lists(tmpdir):
    d = {
        'a/N': [('f1', 1), ('f2', 2), ('f3', 3)],
        'b/N': [('f3', 3), ('f1', 1), ('f2', 2)],
        'c/N': [('f3', 3), ('f2', 2), ('f1', 1)],
    }  # three identical vectors
    v = Vectors(d)
    filename = str(tmpdir.join('outfile.txt'))
    v.to_tsv(filename)
    v1 = v.from_tsv(filename)

    # rows can be in any order, but columns need to be sorted
    assert v.columns == v1.columns
    for word in d.keys():
        assert_array_equal(v.get_vector(word).A, v1.get_vector(word).A)

def test_application_after_learning_with_selective_write(tmpdir):
    """
    Tests that when SVD is trained on matrix A and applied to matrix B, and only the reduced
    version of A or of B is requested as output, the shape of the output is right
    """
    tmpfile = tmpdir.join('tmp.thesaurus')
    for w, exp_row_len in zip([1, 2, 3], [4, 5, 7]):
        do_svd('discoutils/tests/resources/exp0-0b.strings', tmpfile,
               reduce_to=[2],  # some small number, not what we are testing for here
               apply_to='discoutils/tests/resources/exp0-0c.strings',
               write=w)

        t = Vectors.from_tsv(str(tmpfile) + '-SVD2.events.filtered.strings', lowercasing=False)
        mat, _, _ = t.to_sparse_matrix()
        assert mat.shape == (exp_row_len, 2)

def test_application_after_learning(tmpdir, first, second, exp_row_len):
    """
    Tests that applying a learnt SVD to another matrix works. We are mostly interested in whether
    the matrix dimensions match: no exception should be raised. Other than that, this is a weak test
    """
    tmpfile = tmpdir.join('tmp.thesaurus')
    do_svd('discoutils/tests/resources/exp0-0%s.strings' % first, tmpfile,
           reduce_to=[2],  # some small number, not what we are testing for here
           apply_to='discoutils/tests/resources/exp0-0%s.strings' % second)

    # when made into a thesaurus, the reduced matrix will have some duplicates;
    # these will be summed out, leaving us with a matrix of a specific size
    t = Vectors.from_tsv(str(tmpfile) + '-SVD2.events.filtered.strings', lowercasing=False)
    mat, cols, rows = t.to_sparse_matrix()
    assert mat.shape == (exp_row_len, 2)

def test_vectors_to_tsv(vectors_c, tmpdir):
    """
    :type vectors_c: Vectors
    :type tmpdir: py.path.local
    """
    # these are feature vectors, so columns (features) can be reordered
    filename = str(tmpdir.join('outfile.txt'))
    vectors_c.to_tsv(filename, gzipped=True)
    from_disk = Vectors.from_tsv(filename)

    if hasattr(vectors_c, 'df'):
        # this is in dense format
        np.testing.assert_array_equal(vectors_c.matrix, from_disk.matrix)
    else:
        # sparse format: can't just assert from_disk == vectors_c, because to_tsv may reorder the columns,
        # so compare the feature lists as sets instead
        for k, v in vectors_c.items():
            assert k in from_disk.keys()
            assert set(v) == set(from_disk[k])

def build_thesaurus_out_of_vectors(vectors_path, out_dir, threads=4, num_neighbours=100, sim_function='Cosine'):
    """
    Builds a Byblo thesaurus out of the provided vectors, however these were constructed.
    This function will make an uncompressed copy of the provided vectors file, which might
    be slow and use up a lot of extra space.

    :param vectors_path: input vectors in byblo format, compressed or not
    :param out_dir: where to put the thesaurus and all temp files
    :param threads: number of byblo threads
    :param num_neighbours: number of nearest neighbours per entry to output
    :param sim_function: similarity measure between vectors to use. See byblo docs
    """
    from discoutils.thesaurus_loader import Vectors

    BYBLO_BASE_DIR = '/lustre/scratch/inf/mmb28/FeatureExtractionToolkit/Byblo-2.2.0'
    vectors_path = os.path.abspath(vectors_path)
    out_dir = os.path.abspath(out_dir)
    mkdirs_if_not_exists(out_dir)

    v = Vectors.from_tsv(vectors_path)

    # prepare the files that byblo expects; outf_basename is already absolute
    outf_basename = os.path.join(out_dir, 'input')
    events_file = outf_basename + '.events.filtered.strings'
    entries_file = outf_basename + '.entries.filtered.strings'
    features_file = outf_basename + '.features.filtered.strings'

    v.to_plain_txt(events_file, entries_file, features_file)

    # write the byblo conf file, one token per line
    conf = '--input {} --output {} --threads {} --similarity-min 0.01 -k {} ' \
           '--measure {} --stages allpairs,knn,unenumerate'.format(outf_basename, out_dir, threads,
                                                                   num_neighbours, sim_function)
    conf_path = os.path.join(out_dir, 'conf.txt')
    with open(conf_path, 'w') as outf:
        for line in conf.split():
            outf.write(line)
            outf.write('\n')

    # go baby go
    with temp_chdir(BYBLO_BASE_DIR):
        reindex_all_byblo_vectors(outf_basename)
        run_byblo(conf_path, touch_input_file=True)
        unindex_all_byblo_vectors(outf_basename)

def do_svd(input_path, output_prefix,
           desired_counts_per_feature_type=[('N', 8), ('V', 4), ('J', 4), ('RB', 2), ('AN', 2)],
           reduce_to=[3, 10, 15], apply_to=None, write=3, use_hdf=True):
    """
    Performs truncated SVD. A copy of the trained sklearn SVD estimator will also be saved

    :param input_path: list of files containing vectors in TSV format. All vectors will be reduced together.
    :type input_path: list of file names or a Vectors object
    :param output_prefix: where to output the reduced files. An extension will be added.
    :param desired_counts_per_feature_type: how many entries to keep of each DocumentFeature type, by frequency.
     This is the PoS tag for unigram features and the feature type otherwise. For instance, pass in
     [('N', 2), ('AN', 0)] to select 2 unigrams of PoS N and 0 bigrams of type adjective-noun. Types that
     are not explicitly given a positive desired count are treated as if the desired count is 0. If this
     is None, no filtering is performed.
    :param reduce_to: list of integers: what dimensionalities to reduce to
    :param apply_to: a file path. After SVD has been trained on input_path, it can be applied to
     apply_to. Output will be written to the same file
    :param write: once SVD is trained on A and applied to B, output either A, B or vstack(A, B).
     Use values 1, 2 and 3 respectively. Default is 3.
    :param use_hdf: if true, store results as a pandas DF in HDF. This will enforce some constraints,
     like not having duplicate entries in the index, which I deliberately break with some of the
     unit tests. This switch is the easiest way to avoid modifying the unit tests
    :type write: int
    :raise ValueError: if the loaded thesaurus is empty
    """
    if not 1 <= write <= 3:
        raise ValueError('value of parameter write must be 1, 2 or 3')

    if not isinstance(input_path, Vectors):
        thesaurus = Vectors.from_tsv(input_path, lowercasing=False)
    else:
        thesaurus = input_path
    if not thesaurus:
        raise ValueError('Empty thesaurus %r' % input_path)
    mat, _, rows, cols = filter_out_infrequent_entries(desired_counts_per_feature_type, thesaurus)

    if apply_to:
        cols = set(cols)
        if not isinstance(apply_to, Vectors):
            thes_to_apply_to = Vectors.from_tsv(apply_to, lowercasing=False,
                                                column_filter=lambda foo: foo in cols)
        else:
            thes_to_apply_to = apply_to
        # get the names of each thesaurus entry
        extra_rows = [x for x in thes_to_apply_to.keys()]
        # vectorize the second matrix with the vocabulary (columns) of the first thesaurus
        # to ensure shapes match: "project" the second thesaurus into the space of the first
        thesaurus.v.vocabulary_ = {x: i for i, x in enumerate(list(cols))}
        extra_matrix = thesaurus.v.transform([dict(fv) for fv in thes_to_apply_to.values()])
        # make sure the shape is right
        assert extra_matrix.shape[1] == mat.shape[1]

        if write == 3:
            # extend the list of names
            rows = list(rows) + [DocumentFeature.from_string(x) for x in extra_rows]
        elif write == 2:
            rows = [DocumentFeature.from_string(x) for x in extra_rows]
        # no need to do anything if write == 1

    for n_components in reduce_to:
        method, reduced_mat = _do_svd_single(mat, n_components)
        if not method:
            continue
        if apply_to:
            logging.info('Applying learned SVD transform to matrix of shape %r', extra_matrix.shape)
            # apply the learned transform to the new data
            if write == 3:
                # append to the old data
                reduced_mat = np.vstack((reduced_mat, method.transform(extra_matrix)))
            elif write == 2:
                reduced_mat = method.transform(extra_matrix)

        path = '{}-SVD{}'.format(output_prefix, n_components)
        _write_to_disk(scipy.sparse.coo_matrix(reduced_mat), path, rows, use_hdf=use_hdf)

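# Illustrative usage sketch (file names hypothetical): train SVD on one vectors file,
# apply the learned transform to a second file, and write only the reduced version
# of the second file (write=2):
#
#   do_svd('unigram-vectors.txt', 'out/reduced',
#          desired_counts_per_feature_type=None,  # keep all entries
#          reduce_to=[100],
#          apply_to='phrase-vectors.txt',
#          write=2)
#   # the reduced vectors land at the prefix 'out/reduced-SVD100',
#   # plus whatever extension _write_to_disk adds
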
def train_grefenstette_multistep_composer(all_vectors_file, root_dir):
    """
    Train Grefenstette et al.'s multistep regression VO/SVO model
    Adapted from dissect's ex19.py
    :param all_vectors_file: file containing N, V, VO and SVO vectors
    :param root_dir: where to write temp files and output
    """
    mkdirs_if_not_exists(root_dir)
    vo_composer_output_file = join(root_dir, 'vo_comp.pkl')
    svo_composer_output_file = join(root_dir, 'svo_comp.pkl')

    filename = basename(all_vectors_file)
    noun_events_file = join(root_dir, '%s-onlyN.tmp' % filename)
    # verb_events_file = join(root_dir, '%s-onlyV.tmp' % filename)
    # vo_events_file = join(root_dir, '%s-onlyVO.tmp' % filename)
    svo_events_file = join(root_dir, '%s-onlySVO.tmp' % filename)

    # this has unigrams and observed phrases
    thes = Vectors.from_tsv(all_vectors_file)
    thes.to_tsv(noun_events_file,
                entry_filter=lambda x: x.type == '1-GRAM' and x.tokens[0].pos == 'N')
    _translate_byblo_to_dissect(noun_events_file)
    # thes.to_tsv(verb_events_file,
    #             entry_filter=lambda x: x.type == '1-GRAM' and x.tokens[0].pos == 'V')
    # _translate_byblo_to_dissect(verb_events_file)
    # thes.to_tsv(vo_events_file, entry_filter=lambda x: x.type == 'VO')
    # _translate_byblo_to_dissect(vo_events_file)
    thes.to_tsv(svo_events_file, entry_filter=lambda x: x.type == 'SVO')
    _translate_byblo_to_dissect(svo_events_file)

    train_vo_data, train_v_data = [], []
    for phrase in thes.keys():
        df = DocumentFeature.from_string(phrase)
        if df.type == 'SVO':
            train_vo_data.append((str(df[1:]), str(df[0]), str(df)))
        if df.type == 'VO':
            train_v_data.append((str(df[0]), str(df[1]), str(df)))
    # logging.info('train_vo_data %r', len(train_vo_data))
    # logging.info('train_v_data %r', len(train_v_data))

    # load N and SVO spaces
    n_space = Space.build(data=noun_events_file + '.sm',
                          cols=noun_events_file + '.cols',
                          format="sm")
    svo_space = Space.build(data=svo_events_file + '.sm',
                            cols=svo_events_file + '.cols',
                            format="sm")

    logging.info("Input SVO training space:")
    logging.info(svo_space.id2row)
    # logging.info(svo_space.cooccurrence_matrix)

    # 1. train a model to learn VO functions on train data: VO N -> SVO
    logging.info("Step 1 training")
    vo_model = LexicalFunction(learner=RidgeRegressionLearner(), min_samples=2)  # Gref et al 2013, §5 says 3
    vo_model.train(train_vo_data, n_space, svo_space)
    io_utils.save(vo_model, vo_composer_output_file)

    # 2. train a model to learn V functions on train data: V N -> VO,
    # where the VO space is the function space learned in step 1
    logging.info("Step 2 training")
    vo_space = vo_model.function_space
    v_model = LexicalFunction(learner=RidgeRegressionLearner(), min_samples=2)
    v_model.train(train_v_data, n_space, vo_space)
    io_utils.save(v_model, svo_composer_output_file)

def train_baroni_guevara_composers(all_vectors, ROOT_DIR,
                                   baroni_output_path, guevara_output_path,
                                   baroni_threshold=10):
    """
    :type all_vectors: str; path to vectors file containing both N and observed AN vectors
    :type ROOT_DIR: str; where to write temp files
    :type baroni_output_path: str; where to write pickled baroni composer
    :type guevara_output_path: str
    :type baroni_threshold: int
    """
    SVD_DIMS = 100
    baroni_training_phrase_types = {'AN', 'NN'}  # what kinds of NPs to train the Baroni composer for

    # prepare the input files to be fed into Dissect
    mkdirs_if_not_exists(ROOT_DIR)

    filename = basename(all_vectors)
    noun_events_file = join(ROOT_DIR, '%s-onlyN-SVD%d.tmp' % (filename, SVD_DIMS))
    NPs_events_file = join(ROOT_DIR, '%s-onlyPhrases-SVD%d.tmp' % (filename, SVD_DIMS))

    thes = Vectors.from_tsv(all_vectors, lowercasing=False)
    thes.to_tsv(noun_events_file,
                entry_filter=lambda x: x.type == '1-GRAM' and x.tokens[0].pos == 'N')
    _translate_byblo_to_dissect(noun_events_file)

    thes.to_tsv(NPs_events_file,
                entry_filter=lambda x: x.type in baroni_training_phrase_types,
                row_transform=lambda x: str(x).replace(' ', '_'))
    _translate_byblo_to_dissect(NPs_events_file)

    my_space = Space.build(data="{}.sm".format(noun_events_file),
                           rows="{}.rows".format(noun_events_file),
                           cols="{}.cols".format(noun_events_file),
                           format="sm")
    logging.info('Each unigram vector has dimensionality %r', my_space.element_shape)

    # create a peripheral space
    my_per_space = PeripheralSpace.build(my_space,
                                         data="{}.sm".format(NPs_events_file),
                                         rows="{}.rows".format(NPs_events_file),
                                         # The columns of the peripheral space have to be identical to those
                                         # in the core space (including their order)!
                                         cols="{}.cols".format(NPs_events_file),
                                         format="sm")
    logging.info('Each phrase vector has dimensionality %r', my_per_space.element_shape)

    # use the model to compose words in my_space
    all_data = []
    for phrase in my_per_space._row2id:
        # make sure there are only NPs here
        if DocumentFeature.from_string(phrase.replace(' ', '_')).type in baroni_training_phrase_types:
            adj, noun = phrase.split('_')
            all_data.append((adj, noun, '%s_%s' % (adj, noun)))

    # train a composition model on the data and save it
    baroni = LexicalFunction(min_samples=baroni_threshold, learner=RidgeRegressionLearner())
    guevara = FullAdditive(learner=RidgeRegressionLearner())
    for composer, out_path in zip([baroni, guevara],
                                  [baroni_output_path, guevara_output_path]):
        composer.train(all_data, my_space, my_per_space)
        io_utils.save(composer, out_path)
        logging.info('Saved trained composer to %s', out_path)

def small_vectors():
    return Vectors.from_tsv('tests/resources/thesauri/small.txt.events.strings')

def compose_and_write_vectors(unigram_vectors, short_vector_dataset_name, composer_classes,
                              remove_pos=False,
                              pretrained_Baroni_composer_file=None,
                              pretrained_Guevara_composer_file=None,
                              pretrained_Gref_composer_file=None,
                              categorical_vector_matrix_file=None,
                              output_dir='.', gzipped=True, dense_hd5=False,
                              row_filter=default_row_filter):
    """
    Extracts all composable features from a labelled classification corpus and dumps a composed vector
    for each of them to disk. The output file will also contain all unigram vectors that were passed in,
    and only unigrams!

    :param unigram_vectors: a file in Byblo events format that contains vectors for all unigrams, OR
     a Vectors object. This will be used in the composition process.
    :type unigram_vectors: str or Vectors
    :param short_vector_dataset_name: short name used to identify the output file
    :param pretrained_Baroni_composer_file: path to a pre-trained Baroni AN/NN composer file
    :param output_dir: where to write the output file
    :param composer_classes: what composers to use
    :type composer_classes: list
    """
    phrases_to_compose = get_all_document_features(remove_pos=remove_pos)

    # if this isn't a Vectors object, assume it's the name of a file containing vectors and load them
    if not isinstance(unigram_vectors, Vectors):
        # ensure there are only unigrams in the set of unigram vectors;
        # composers do not need any ngram vectors contained in this file, which may well be
        # observed ones
        unigram_vectors = Vectors.from_tsv(unigram_vectors, row_filter=row_filter)
        logging.info('Starting composition with %d unigram vectors', len(unigram_vectors))

    # doing this loop in parallel isn't worth it as pickling or shelving `vectors` is so slow
    # it negates any gains from using multiple cores
    for composer_class in composer_classes:
        if composer_class == BaroniComposer:
            assert pretrained_Baroni_composer_file is not None
            composer = BaroniComposer(unigram_vectors, pretrained_Baroni_composer_file)
        elif composer_class == GuevaraComposer:
            assert pretrained_Guevara_composer_file is not None
            composer = GuevaraComposer(unigram_vectors, pretrained_Guevara_composer_file)
        elif composer_class == GrefenstetteMultistepComposer:
            assert pretrained_Gref_composer_file is not None
            composer = GrefenstetteMultistepComposer(unigram_vectors, pretrained_Gref_composer_file)
        elif composer_class in [CopyObject, FrobeniusAdd, FrobeniusMult]:
            composer = composer_class(categorical_vector_matrix_file, unigram_vectors)
        else:
            composer = composer_class(unigram_vectors)

        try:
            # compose_all returns all unigrams and composed phrases
            mat, cols, rows = composer.compose_all(phrases_to_compose)

            events_path = os.path.join(output_dir,
                                       'composed_%s_%s.events.filtered.strings' % (
                                           short_vector_dataset_name, composer.name))
            if dense_hd5:
                write_vectors_to_hdf(mat, rows, cols, events_path)
            else:
                rows2idx = {i: DocumentFeature.from_string(x) for (x, i) in rows.items()}
                write_vectors_to_disk(mat.tocoo(), rows2idx, cols, events_path,
                                      entry_filter=lambda x: x.type in {'AN', 'NN', 'VO', 'SVO', '1-GRAM'},
                                      gzipped=gzipped)
        except ValueError as e:
            logging.error('RED ALERT, RED ALERT')
            logging.error(e)
            continue

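# Illustrative usage sketch. AdditiveComposer and MultiplicativeComposer are assumed to be
# simple composers available alongside the pre-trained ones special-cased above, and the
# paths are hypothetical:
#
#   compose_and_write_vectors('unigram-vectors.txt.gz', 'mydata',
#                             [AdditiveComposer, MultiplicativeComposer],
#                             output_dir='composed')
#   # writes e.g. composed/composed_mydata_<composer name>.events.filtered.strings.gz
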
def ones_vectors_no_pos():
    return Vectors.from_tsv('tests/resources/ones.vectors.nopos.txt',
                            enforce_word_entry_pos_format=False)

def get_pipeline_fit_args(conf):
    """
    Builds a dict of resources that document vectorizers require at fit time. These currently include
    various kinds of distributional information, e.g. word vectors or cluster IDs for words and phrases.
    Example: {'vector_source': <DenseVectors object>} or {'clusters': <pd.DataFrame of word clusters>}
    :param conf: configuration dict
    :raise ValueError: if the conf is wrong in any way
    """
    result = dict()
    train_time_extractor = FeatureExtractor().update(**conf['feature_extraction']). \
        update(**conf['feature_extraction']['train_time_opts'])
    result['train_time_extractor'] = train_time_extractor
    decode_time_extractor = FeatureExtractor().update(**conf['feature_extraction']). \
        update(**conf['feature_extraction']['decode_time_opts'])
    result['decode_time_extractor'] = decode_time_extractor

    vectors_exist = conf['feature_selection']['must_be_in_thesaurus']
    handler_ = conf['vectorizer']['decode_token_handler']
    random_thes = conf['vectorizer']['random_neighbour_thesaurus']
    dummy_thes = conf['vector_sources']['dummy_thesaurus']
    vs_params = conf['vector_sources']
    vectors_path = vs_params['neighbours_file']
    clusters_path = vs_params['clusters_file']

    if 'Base' in handler_:
        # don't need vectors, this is a non-distributional experiment
        return result
    if vectors_path and clusters_path:
        raise ValueError('Cannot use both word vectors and word clusters')

    if random_thes and dummy_thes:
        raise ValueError("Can't use both random and dummy thesauri")
    elif random_thes:
        result['vector_source'] = RandomThesaurus(k=conf['vectorizer']['k'])
    elif dummy_thes:
        result['vector_source'] = DummyThesaurus()
    else:
        if 'signified' in handler_.lower() or vectors_exist:
            # vectors are needed either at decode time (signified handler) or during feature selection
            if not (vectors_path or clusters_path):
                raise ValueError('You must provide at least one source of distributional information '
                                 'because you requested %s and must_be_in_thesaurus=%s' % (
                                     handler_, vectors_exist))

            if len(vectors_path) == 1:
                # set up a row filter, if needed
                entries = vs_params['entries_of']
                if entries:
                    entries = get_thesaurus_entries(entries)
                    vs_params['row_filter'] = lambda x, y: x in entries
                if conf['vector_sources']['is_thesaurus']:
                    result['vector_source'] = Thesaurus.from_tsv(vectors_path[0], **vs_params)
                else:
                    result['vector_source'] = Vectors.from_tsv(vectors_path[0], **vs_params)
            if len(vectors_path) > 1:
                all_vect = [Vectors.from_tsv(p, **vs_params) for p in vectors_path]
                result['vector_source'] = MultiVectors(all_vect)

        if clusters_path:
            result['clusters'] = pd.read_hdf(clusters_path, key='clusters')

    return result

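# Illustrative sketch of a minimal conf dict that reaches the vector-loading branch above.
# The keys mirror the accesses in get_pipeline_fit_args; the values are hypothetical and
# assume Vectors.from_tsv tolerates the extra keyword arguments in 'vector_sources':
#
#   conf = {
#       'feature_extraction': {'train_time_opts': {}, 'decode_time_opts': {}},
#       'feature_selection': {'must_be_in_thesaurus': True},
#       'vectorizer': {'decode_token_handler': 'eval.pipeline.feature_handlers.SignifiedFeatureHandler',
#                      'random_neighbour_thesaurus': False,
#                      'k': 3},
#       'vector_sources': {'dummy_thesaurus': False,
#                          'is_thesaurus': False,
#                          'neighbours_file': ['vectors.txt'],
#                          'clusters_file': '',
#                          'entries_of': ''},
#   }
#   fit_args = get_pipeline_fit_args(conf)  # -> {'vector_source': <Vectors>, ...}
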
def _overlapping_vectors(request):
    return Vectors.from_tsv('discoutils/tests/resources/lexical-overlap-vectors.txt',
                            allow_lexical_overlap=request.param)

def vectors_a():
    return Vectors.from_tsv('tests/resources/exp0-0a.strings')

def ones_vectors():
    return Vectors.from_tsv('tests/resources/ones.vectors.txt')

def vectors_c(request, tmpdir):
    kind = request.param  # txt, gz or hdf
    v = Vectors.from_tsv('discoutils/tests/resources/exp0-0c.strings',
                         sim_threshold=0, ngram_separator='_')
    assert DocumentFeature.from_string('oversized/J') not in v
    assert len(v) == 5
    return _generate_hdf_gzip_repr(kind, tmpdir, v)

def test_random_vectors(tmpdir):
    output = str(tmpdir.join('vectors.h5'))
    generate(output, 10)
    v = Vectors.from_tsv(output)
    assert v.matrix.shape[1] == 10

def get_thesaurus_entries(tsv_file):
    """
    Returns the set of entries contained in a thesaurus
    :param tsv_file: path to vectors file
    """
    return set(Vectors.from_tsv(tsv_file).keys())

def _do_ppmi(vectors_path, output_dir):
    v = Vectors.from_tsv(vectors_path)
    ppmi_sparse_matrix(v.matrix)
    v.to_tsv(join(output_dir, basename(vectors_path)), gzipped=True)