Example #1
def test_smart_lower():
    # test that the word itself is lowercased but the PoS tag is not
    assert DocumentFeature.smart_lower('Cat/N') == 'cat/N'
    assert DocumentFeature.smart_lower('Cat/n') == 'cat/n'
    assert DocumentFeature.smart_lower('Red/J_CaT/N') == 'red/J_cat/N'
    assert DocumentFeature.smart_lower('Red/J CaT/N', separator=' ') == 'red/J cat/N'
    # test that features are not touched
    assert DocumentFeature.smart_lower('amod-DEP:former', lowercasing=False) == 'amod-DEP:former'
Example #2
    def remove_overlapping_neighbours(cls, entry, to_insert):
        """

        :type entry: DocumentFeature or str
        :type to_insert: list of (str, float) tuples
        """
        if isinstance(entry, (six.string_types, six.text_type)):
            entry = DocumentFeature.from_string(entry)
        features = [(DocumentFeature.from_string(x[0]), x[1]) for x in to_insert]
        to_insert = [(f[0].tokens_as_str(), f[1]) for f in features
                     if not any(t in entry.tokens for t in f[0].tokens)]
        return to_insert
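
A hedged illustration of the filtering above (the owning class is not shown in this snippet, so SomeThesaurus is purely a placeholder name): neighbours sharing any token with the entry are dropped.

neighbours = [('black/J_cat/N', 0.9), ('dog/N_toy/N', 0.7)]
kept = SomeThesaurus.remove_overlapping_neighbours('big/J_cat/N', neighbours)
# expected: [('dog/N_toy/N', 0.7)], since 'cat/N' appears in both the entry and the first neighbour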
Example #3
def test_document_feature_slicing():
    DocumentFeature.recompile_pattern()
    x = DocumentFeature.from_string('big/J_cat/N')
    assert x[0] == DocumentFeature.from_string('big/J')
    assert x[1] == DocumentFeature.from_string('cat/N')
    assert x[1] == DocumentFeature('1-GRAM', (Token('cat', 'N', 1), ))
    assert x[0:] == DocumentFeature.from_string('big/J_cat/N')

    x = DocumentFeature.from_string('cat/N')
    assert x[0] == DocumentFeature.from_string('cat/N')
    assert x[0:] == DocumentFeature.from_string('cat/N')
    assert x[:] == DocumentFeature.from_string('cat/N')
Example #4
 def contains_impl(self, feature):
     if isinstance(feature, six.string_types):
         feature = DocumentFeature.from_string(feature)
     if feature.type not in self.entry_types:
         # no point in composing single-word document features
         return False
     return str(feature[self.hardcoded_index]) in self.unigram_source
Example #5
    def to_tsv(self, events_path, entries_path='', features_path='',
               entry_filter=lambda x: True, row_transform=lambda x: x,
               gzipped=False, enforce_word_entry_pos_format=True, dense_hd5=False):
        """
        Writes this thesaurus to a Byblo-compatible file like the one it was most likely read from. In the
        process converts all entries to a DocumentFeature, so all entries must be parsable into one. May reorder the
        features of each entry.

        :param events_path: file to write to
        :param entry_filter: called for every DocumentFeature that is an entry in this thesaurus. The vector will
         only be written if this callable returns True
        :param row_transform: callable, any transformation that might need to be done to each entry before converting
         it to a DocumentFeature. This is needed because some entries (e.g. african/J:amod-HEAD:leader) are not
         directly convertible (it needs to be african/J_leader/N). Use this if the entries cannot be converted to
         a DocumentFeature, e.g. if the data isn't PoS tagged.
        :param dense_hd5: if True, convert to a pandas `DataFrame` and write to a compressed HDF file. This is 30%
         faster and produces 30% smaller files than using `gzipped`, but is only suitable for matrices with a small
         number of columns: this method enforces a hard limit of 1000. Requires PyTables and HDF5.
        :return: the file name
        """
        if enforce_word_entry_pos_format:
            rows = {i: DocumentFeature.from_string(row_transform(feat)) for (feat, i) in self.name2row.items()}
        else:
            rows = {i: feat for (feat, i) in self.name2row.items()}

        if dense_hd5 and len(self.columns) <= 1000:
            write_vectors_to_hdf(self.matrix, self.row_names, self.columns, events_path)
        else:
            write_vectors_to_disk(coo_matrix(self.matrix), rows, self.columns, events_path,
                                  features_path=features_path, entries_path=entries_path,
                                  entry_filter=entry_filter, gzipped=gzipped)
        return events_path
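
A hedged usage sketch of the writer above (file names are placeholders; the entry_filter mirrors how to_tsv is called later in this collection to keep only noun unigrams):

v = Vectors.from_tsv('unigram_vectors.txt')
v.to_tsv('onlyN_events.txt', entries_path='onlyN_entries.txt', features_path='onlyN_features.txt',
         entry_filter=lambda x: x.type == '1-GRAM' and x.tokens[0].pos == 'N',
         gzipped=True)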
Example #6
def reformat_socher_vectors():
    """
    Formats the files output by Socher (2011)'s matlab code into byblo-compatible files.

    Before running this a list of all phrases needs to be extracted from the labelled data, and these need to
    be composed with Socher's matlab code. See note "Socher vectors" in Evernote.

    """
    logging.info('Reformatting events file %s ---> %s',
                 socher_output_vectors_file, socher_composed_vectors_file)

    # socher's code removes all PoS tags, so we can't translate his output
    # back to a DocumentFeature. Let's read the input to his code instead and
    # get the corresponding output vectors
    # get a list of all phrases that we attempted to compose
    with open(plaintext_socher_input_file) as infile:
        composed_phrases = [DocumentFeature.from_string(line.strip()) for line in infile]

    # get a list of all phrases where composition worked (no unknown words)
    with open(socher_output_phrases_file) as infile:
        success = [i for i, line in enumerate(infile) if '*UNKNOWN*' not in line]
    # pick out just the phrases that composed successfully
    composed_phrases = itemgetter(*success)(composed_phrases)

    # load all vectors, remove those containing unknown words
    mat = np.loadtxt(socher_output_vectors_file, delimiter=',')
    mat = mat[success, :]
    assert len(composed_phrases) == mat.shape[0]  # same number of rows

    # do the actual writing
    write_vectors_to_hdf(sp.coo_matrix(mat),
                         composed_phrases,
                         ['RAE-feat%d' % i for i in range(100)],  # Socher provides 100-dimensional vectors
                         socher_composed_vectors_file)
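
A note on the itemgetter idiom used above: it is equivalent to indexing with a list comprehension, with the caveat that operator.itemgetter called with a single index returns a bare item rather than a tuple.

from operator import itemgetter

success = [0, 2]
phrases = ['cat/N_game/N', 'bad phrase', 'dog/N_toy/N']
assert list(itemgetter(*success)(phrases)) == [phrases[i] for i in success]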
Example #7
def train_verb_tensors(svos_file, noun_vectors_file, output_filename):
    """
    Trains Verb-bar matrices, as described in Milajevs et al. (EMNLP-14, §3)
    :param svos_file: file containing a list of all SVOs in unlabelled data, one per line. May contain other document
     features too. Such a file is output by `find_all_NPs.py`, which is called from `observed_vectors.py`
    :param noun_vectors_file: a vector store containing noun vectors
    :param output_filename: name of output file; must identify the noun vectors and the unlabelled corpus
    """
    mkdirs_if_not_exists(os.path.dirname(output_filename))

    v = Vectors.from_tsv(noun_vectors_file)

    with open(svos_file) as infile:
        phrases = set()
        for line in infile:
            if DocumentFeature.from_string(line.strip()).type == 'SVO':
                phrases.add(tuple(line.strip().split('_')))
    phrases = [(subj, verb, obj) for subj, verb, obj in phrases if subj in v and obj in v]
    phrases = sorted(phrases, key=itemgetter(1))
    logging.info('Found %d SVOs in list', len(phrases))

    verb_tensors = dict()
    for verb, svos in groupby(phrases, itemgetter(1)):
        svos = list(svos)
        if len(svos) < MIN_SVO_PER_VERB:
            continue
        logging.info('Training matrix for %s from %d SVOs', verb, len(svos))
        # sum of outer products; use the builtin sum, as np.sum over a generator is deprecated
        vt = sum(np.outer(v.get_vector(subj).A, v.get_vector(obj).A) for subj, _, obj in svos)
        verb_tensors[verb] = vt

    logging.info('Trained %d verb matrices, saving...', len(verb_tensors))
    for verb, tensor in verb_tensors.items():
        df = pd.DataFrame(tensor)
        df.to_hdf(output_filename, verb.split('/')[0], complevel=9, complib='zlib')
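
For intuition, the per-verb matrix accumulated above is a sum of rank-1 outer products of subject and object vectors; a toy sketch of a single contribution:

import numpy as np

subj = np.array([1.0, 0.0])         # toy subject vector
obj = np.array([0.0, 2.0])          # toy object vector
contribution = np.outer(subj, obj)  # one rank-1 term; the loop above sums these over all SVOs for a verb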
Example #8
def get_all_document_features(include_unigrams=False, remove_pos=False):
    """
    Finds all noun-noun and adj-noun compounds (and optionally adjs and nouns) in all labelled corpora
    mentioned in the conf files.
    :param include_unigrams: if False, only phrases (AN, NN, VO, SVO) will be returned
    :param remove_pos: whether to remove PoS tags if present; the result will be either "cat/N" or "cat"
    :rtype: set of DocumentFeature, or set of str if remove_pos is True
    """
    result = set()
    accepted_df_types = {'AN', 'NN', 'VO', 'SVO', '1-GRAM'} if include_unigrams else {'AN', 'NN', 'VO', 'SVO'}
    for corpus_name, _ in get_all_corpora():
        path = os.path.abspath(os.path.join(__file__, '..', '..', '..', ROOT, '%s_all_features.txt' % corpus_name))
        with open(path) as infile:
            for line in infile:
                df = DocumentFeature.from_string(line.strip())
                if df.type in accepted_df_types:
                    if remove_pos:
                        # todo these are of type str, in the other branch it's DocumentFeature. things will likely break
                        result.add(df.ngram_separator.join(t.text for t in df.tokens))
                    else:
                        result.add(df)

    logging.info('Found a total of %d features in all corpora', len(result))
    if not remove_pos:
        logging.info('Their types are %r', Counter(df.type for df in result))
    if include_unigrams:
        logging.info('PoS tags of unigrams are %r',
                     Counter(df.tokens[0].pos for df in result if df.type == '1-GRAM'))
    else:
        logging.info('Unigram features not included!')
    return result
Example #9
def test_write_vectors_to_disk(resources, tmpdir):
    """
    Checks the entries/features files, the events file is checked by
    thesisgenerator.tests.test_thesaurus.test_to_file

    :type th: Thesaurus
    """
    th, expected_entries, expected_features, filter_callable = resources
    events_file = str(tmpdir.join('events.txt'))
    entries_file = str(tmpdir.join('entries.txt'))
    features_file = str(tmpdir.join('features.txt'))

    if not th: # empty thesaurus should raise an error
        with pytest.raises(ValueError):
            matrix, cols, rows = th.to_sparse_matrix()
    else:
        matrix, cols, rows = th.to_sparse_matrix()
        rows = [DocumentFeature.from_string(x) for x in rows]
        write_vectors_to_disk(sp.coo_matrix(matrix), rows, cols,
                              events_file, features_file, entries_file,
                              entry_filter=filter_callable)

        if expected_entries:
            # the file will not be written at all if there's nothing to put in it
            entries = [x.split('\t')[0] for x in _read_and_strip_lines(entries_file)]
            assert set(entries) == set(expected_entries)
        else:
            assert not os.path.exists(entries_file)

        if expected_features:
            features = [x.split('\t')[0] for x in _read_and_strip_lines(features_file)]
            assert features == expected_features
        else:
            assert not os.path.exists(features_file)
Example #10
 def __iter__(self):
     for fname in self.files:
         filename = join(self.dirname, fname)
         infile = gzip.open(filename) if is_gzipped(filename) else open(filename)
         with contextlib.closing(infile):
             for line in infile:
                 # yield gensim.utils.tokenize(line, lower=True)
                 if isinstance(line, bytes):
                     line = line.decode()
                 res = [DocumentFeature.smart_lower(w) for w in line.split() if
                        DocumentFeature.from_string(w).type != 'EMPTY']
                 if len(res) > 8:
                     # ignore short sentences, they are probably noise
                     if self.remove_pos:
                         yield [x.split('/')[0] for x in res]
                     else:
                         yield res
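
The iterator above yields one tokenised sentence at a time (a list of word/PoS strings), which is the input shape gensim's Word2Vec accepts. A hedged sketch, where CorpusIterator and the directory name are assumptions standing in for the class this method belongs to:

from gensim.models import Word2Vec

sentences = CorpusIterator('path/to/corpus_dir', remove_pos=True)  # hypothetical class and path
model = Word2Vec(sentences, workers=4)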
Example #11
 def get_vector(self, feature):
     """
     :type feature: DocumentFeature
     :rtype: scipy.sparse.csr_matrix
     """
     if isinstance(feature, six.string_types):
         feature = DocumentFeature.from_string(feature)
     return sp.csr_matrix(reduce(self.function,
                                 [self.unigram_source.get_vector(str(t)).A for t in feature[:]]))
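
If self.function is an elementwise operation such as np.add, the reduce above amounts to summing the unigram vectors; a toy equivalent (the 1xD dense shape is assumed, as .A suggests):

from functools import reduce
import numpy as np

vectors = [np.array([[1., 2.]]), np.array([[3., 4.]])]  # e.g. vectors for 'big/J' and 'cat/N'
composed = reduce(np.add, vectors)                      # array([[4., 6.]])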
Example #12
    def __contains__(self, feature):
        if isinstance(feature, six.string_types):
            feature = DocumentFeature.from_string(feature)

        # an SVO is contained iff we have a tensor for the verb and vectors for both of its arguments
        return feature.type in self.entry_types and \
               str(feature[1]) in self.verb_tensors and \
               str(feature[0]) in self.unigram_source and \
               str(feature[2]) in self.unigram_source
Example #13
def test_with_different_separators():
    DocumentFeature.recompile_pattern(pos_separator='_', ngram_separator='!')
    assert DocumentFeature('2-GRAM', (Token('very', 'RB'), Token('big', 'J'))) == \
           DocumentFeature.from_string('very_RB!big_J')

    DocumentFeature.recompile_pattern(pos_separator='-', ngram_separator=' ')
    assert DocumentFeature('1-GRAM', (Token('very', 'RB'),)) == DocumentFeature.from_string('very-RB')
    assert DocumentFeature('2-GRAM', (Token('very', 'RB'), Token('big', 'J'))) == \
           DocumentFeature.from_string('very-RB big-J')
Example #14
    def __contains__(self, feature):
        # both head and modifier need to have unigram vectors.
        # I don't see why the modifier needs a vector, given that we're using
        # its matrix representation instead, but that is what dissect does
        if isinstance(feature, six.string_types):
            feature = DocumentFeature.from_string(feature)

        if feature.type not in self.entry_types:
            # no point in trying
            return False
        return all(str(f) in self.unigram_source for f in feature[:])
Example #15
    def _paraphrase(self, feature, vocabulary, j_indices, values, stats, **kwargs):
        """
        Replaces term with its k nearest neighbours from the thesaurus

        Parameters
        ----------
        neighbour_source : callable, returns a thesaurus-like object (a list of
          (neighbour, sim) tuples, sorted by highest sim first,
          acts as a defaultdict(list) ). The callable takes one parameter for
          compatibility purposes: one of the possible callables I want to
          use here requires access to the vocabulary.
           The default behaviour is to return a callable pointing to the
           currently loaded thesaurus.
        """

        # logging.debug('Paraphrasing %r in doc %d', feature, doc_id)
        neighbours = self.thesaurus.get_nearest_neighbours(feature)
        if self.thesaurus.__class__.__name__ == 'Thesaurus':
            # todo this will also activate for DenseVectors, because they are also instances of thesaurus
            # the check needs to be self.thesaurus.__class__.__name__ == 'Thesaurus', but then
            # we need to make sure init_sims is called with the correct vocabulary so that all neighbours are IV

            # precomputed thesauri do not guarantee that the returned neighbours will be in vocabulary
            # these should by now only be used in testing though
            neighbours = [(neighbour, sim) for (neighbour, sim) in neighbours
                          if DocumentFeature.from_string(neighbour) in vocabulary]
        event = [str(feature), len(neighbours)]
        for neighbour, sim in neighbours[:self.k]:
            # the document may already contain the feature we
            # are about to insert into it,
            # a merging strategy is required,
            # e.g. what do we do if the document has the word X
            # in it and we encounter X again. By default,
            # scipy uses addition
            df = DocumentFeature.from_string(neighbour)
            j_indices.append(vocabulary.get(df))
            values.append(self.sim_transformer(sim))
            # track the event
            event.extend([neighbour, sim])
        stats.register_paraphrase(tuple(event))
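
j_indices and values look like the usual column-index/data accumulators of a sparse document-term matrix built elsewhere (an assumption about the surrounding vectorizer, which is not shown here); the pattern in isolation:

from scipy.sparse import csr_matrix

vocabulary = {'cat/N': 0, 'dog/N': 1}
j_indices, values, indptr = [], [], [0]
for doc in [['cat/N', 'dog/N'], ['cat/N']]:
    for feat in doc:
        j_indices.append(vocabulary[feat])
        values.append(1.0)
    indptr.append(len(j_indices))
X = csr_matrix((values, j_indices, indptr), shape=(len(indptr) - 1, len(vocabulary)))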
Example #16
def filter_out_infrequent_entries(desired_counts_per_feature_type, vectors):
    logging.info('Converting thesaurus to sparse matrix')
    mat, cols, rows = vectors.to_sparse_matrix()
    logging.info('Got a data matrix of shape %r', mat.shape)
    # convert to document feature for access to PoS tag
    document_features = [DocumentFeature.from_string(r) for r in rows]
    # don't want to do dimensionality reduction on composed vectors
    feature_types = [df.type for df in document_features]
    assert all(x == '1-GRAM' or x == 'AN' or x == 'NN' for x in feature_types), Counter(feature_types)
    # get the PoS tags of each row in the matrix
    pos_tags = np.array([df.tokens[0].pos if df.type == '1-GRAM' else df.type for df in document_features])
    # find the rows of the matrix that correspond to the most frequent nouns, verbs, ...,
    # as measured by sum of feature counts. This is Byblo's definition of frequency (which is in fact a marginal),
    # but it is strongly correlated with what one normally thinks of as entry frequency
    desired_rows = []
    if desired_counts_per_feature_type is not None:
        for desired_pos, desired_count in desired_counts_per_feature_type:
            row_of_current_pos = pos_tags == desired_pos  # boolean mask of rows whose PoS tag matches
            # indices of the array sorted by row sum, and where the pos == desired_pos
            if desired_count > 0:
                sorted_idx_by_sum = np.ravel(mat.sum(1)).argsort()
                row_of_current_pos = row_of_current_pos[sorted_idx_by_sum]
                sorted_idx_and_pos_matching = sorted_idx_by_sum[row_of_current_pos]
                # slice off the top desired_count and store them
                desired_rows.extend(list(sorted_idx_and_pos_matching[-desired_count:]))
            else:
                # do not include
                pass

            logging.info('Frequency filter keeping %d/%d %s entries ', desired_count,
                         sum(row_of_current_pos), desired_pos)
    else:
        logging.info('Not filtering any of the entries')
        desired_rows = range(len(vectors))

    # remove the vectors for infrequent entries, update list of pos tags too
    if desired_counts_per_feature_type is not None:
        # if some rows have been removed update respective data structures
        mat = mat[desired_rows, :]
        rows = itemgetter(*desired_rows)(document_features)
        pos_tags = pos_tags[desired_rows]

        # removing rows may empty some columns, remove these as well. This is probably not very likely to occur as we have
        # already filtered out infrequent features, so the column count will stay roughly the same
        desired_cols = np.ravel(mat.sum(0)) > 0
        mat = mat[:, desired_cols]
        col_indices = list(np.where(desired_cols)[0])
        cols = itemgetter(*col_indices)(cols)

    logging.info('Selected only the most frequent entries, matrix size is now %r', mat.shape)
    assert mat.shape == (len(rows), len(cols))
    return mat, pos_tags, rows, cols
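
The row-selection idiom used above, in isolation: sort rows by row sum, reorder the PoS mask the same way, then keep the top desired_count matching rows.

import numpy as np

sums = np.array([5, 1, 7, 3])          # row sums (Byblo-style frequency)
tags = np.array(['N', 'J', 'N', 'N'])  # PoS/feature type of each row
order = sums.argsort()                 # row indices, least frequent first
mask = (tags == 'N')[order]            # reorder the PoS mask to match
top_two_nouns = order[mask][-2:]       # array([0, 2]), the two most frequent nouns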
Example #17
    def contains_impl(self, feature):
        """
        Contains all sequences of words where we have a distributional vector for each unigram
        they contain. Rejects unigrams.
        """
        # if isinstance(feature, six.string_types):
        #     feature = DocumentFeature.from_string(feature)

        feat_str = str(feature) if isinstance(feature, DocumentFeature) else feature
        feat_df = feature if isinstance(feature, DocumentFeature) else DocumentFeature.from_string(feature)

        if feat_df.type not in self.entry_types:
            # no point in trying
            return False
        return all(f in self.unigram_source for f in feat_str.split(DocumentFeature.ngram_separator))
Example #18
def _read_vector(vector_file):
    bn = os.path.basename(vector_file)
    sent_file = os.path.join(os.path.dirname(vector_file), "%s.sent" % bn.split(".")[0])
    if not os.path.exists(sent_file):
        return "__MISSING__", {}

    with open(sent_file) as infile:
        phrase = " ".join(line.strip().split("\t")[1] for line in infile if line.strip())

    with gzip.open(vector_file) as infile:
        file_content = infile.readline().decode("utf8").strip().split("\t")
    features = [
        (DocumentFeature.smart_lower(word, lowercasing=True), float(count))
        for (word, count) in walk_nonoverlapping_pairs(file_content, beg=0)
    ]
    return phrase, features
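
walk_nonoverlapping_pairs (presumably a discoutils helper) appears to yield consecutive, non-overlapping (item, value) pairs starting at position beg; a hedged equivalent for reference only:

def walk_pairs_sketch(seq, beg=0):
    # [w1, c1, w2, c2, ...] -> (w1, c1), (w2, c2), ...
    it = iter(seq[beg:])
    return zip(it, it)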
Example #19
    def __delitem__(self, key):
        """
        Deletes key from the list of entries in the thesaurus and the matrix
        :param key:
        :type key:
        :return:
        :rtype:
        """
        if isinstance(key, DocumentFeature):
            # normalise DocumentFeature keys to the string form used by the underlying dict and name2row
            key = DocumentFeature.tokens_as_str(key)

        del self._obj[key]
        if hasattr(self, 'matrix'):
            mask = np.ones(self.matrix.shape[0], dtype=bool)
            mask[self.name2row[key]] = False
            self.matrix = self.matrix[mask, :]
Example #20
def run_glove():
    logging.info('Starting training')
    with temp_chdir(args.glove_dir):
        run_and_log_output('sh {} {}'.format(glove_script, unlabelled_data))

    # convert their format to ours
    df = pd.read_csv(raw_vectors_file, sep=' ', index_col=0, header=None)
    logging.info('Done training, filtering junk and converting %d vectors to Byblo-compatible format', len(df))
    # remove any shit-looking tokens, they'll get in the way later
    mask = [DocumentFeature.from_string(x).type != 'EMPTY' and 3 < len(x) < 20 for x in df.index]
    logging.info('Keeping %d entries', sum(mask))
    logging.info('Shape of vectors before filtering %r', df.shape)
    df = df[mask]
    logging.info('Shape of vectors after filtering %r', df.shape)
    cols = ['f%d' % i for i in range(df.shape[1])]
    mkdirs_if_not_exists(output_dir)
    write_vectors_to_hdf(df.values, df.index, cols, formatted_vectors_file)
Example #21
    def from_tsv(cls, tsv_file, sim_threshold=-1e20,
                 lowercasing=False, ngram_separator='_',
                 row_filter=lambda x, y: True,
                 column_filter=lambda x: True,
                 max_len=50, max_neighbours=1e8,
                 merge_duplicates=True,
                 immutable=True, **kwargs):
        """
        Changes the default value of the sim_threshold parameter of super. Features can have any value, including
        negative (especially when working with neural embeddings).
        :rtype: Vectors
        """
        # For vectors, disallowing lexical overlap does not make sense at construction time, but should be
        # implemented in get_nearest_neighbours. A Thesaurus can afford to do the filtering when reading the
        # ready-made thesaurus from disk.
        allow_lexical_overlap = kwargs.pop('allow_lexical_overlap', True)
        if is_hdf(tsv_file):
            import pandas as pd

            df = pd.read_hdf(tsv_file, 'matrix')
            logging.info('Found a DF of shape %r in HDF file %s', df.shape, tsv_file)
            # pytables doesn't like unicode values and replaces them with an empty string.
            # pandas doesn't like duplicate values in index
            # remove these, we don't want to work with them anyway
            df = df[df.index != '']
            row_filter_mask = [row_filter(f, DocumentFeature.from_string(f)) for f in df.index]
            df = df[row_filter_mask]
            logging.info('Dropped non-ascii rows and applied row filter. Shape is now %r', df.shape)
            return DenseVectors(df, immutable=immutable,
                                allow_lexical_overlap=allow_lexical_overlap,
                                **kwargs)

        th = Thesaurus.from_tsv(tsv_file, sim_threshold=sim_threshold,
                                ngram_separator=ngram_separator,
                                allow_lexical_overlap=True,
                                row_filter=row_filter, column_filter=column_filter,
                                max_len=max_len, max_neighbours=max_neighbours,
                                merge_duplicates=merge_duplicates,
                                **kwargs)

        # get underlying dict from thesaurus
        if not th._obj:
            raise ValueError('No entries left over after filtering')
        return Vectors(th._obj, immutable=immutable,
                       allow_lexical_overlap=allow_lexical_overlap, **kwargs)
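
A hedged usage sketch of the loader above (the file name is a placeholder); the row_filter signature (raw string plus parsed DocumentFeature) and allow_lexical_overlap come straight from the docstring:

v = Vectors.from_tsv('wiki_vectors.h5',
                     row_filter=lambda s, df: df.type in {'1-GRAM', 'AN', 'NN'},
                     allow_lexical_overlap=False)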
Example #22
    def extract_features_from_tree_list(self, doc_sentences):
        """
        Turn a document (a list of sentences, each stored as a parse tree) into a sequence of features. Can extract
        features from the dependency trees (e.g. noun phrases) or traditional n-gram features.
        """
        features = []

        # extract sentence-internal token n-grams
        for parse_tree in doc_sentences:
            if not parse_tree:  # the sentence segmenter sometimes returns empty sentences
                continue

            features.extend(self.extract_features_from_single_dependency_tree(parse_tree))

            # extract sentence-internal n-grams of the right PoS tag
            if self.extract_unigram_features:
                # just unigrams, can get away without sorting the tokens
                for token in parse_tree.nodes_iter():
                    if token.pos not in self.extract_unigram_features:
                        continue
                    features.append(DocumentFeature('1-GRAM', (token,)))

            # some tests use standard bigrams, extract them too
            if self.standard_ngram_features > 1:
                # the tokens are stored as nodes in the parse tree in ANY order, sort them
                sentence = sorted(parse_tree.nodes(), key=attrgetter('index'))
                n_tokens = len(sentence)
                for n in range(2, min(self.standard_ngram_features + 1, n_tokens + 1)):
                    for i in range(n_tokens - n + 1):
                        feature = DocumentFeature('%d-GRAM' % n, tuple(sentence[i: i + n]))
                        features.append(feature)

        # it doesn't matter where in the sentence/document these features were found
        # erase their index
        for feature in features:
            for token in feature.tokens:
                token.index = 'any'

        # remove all features that aren't right: they are there because the code above doesn't
        # put the features through the validation code in DocumentFeature.from_string
        # e.g. the verb phrase "123/V_$$$/N" is not put through validation, so it would be returned as a feature
        return [f for f in features if DocumentFeature.from_string(str(f)).type != 'EMPTY']
Example #23
    def __contains__(self, feature):
        """
        Accept all adjective-noun or noun-noun phrases where we have a corpus-observed vector for the head and
        a learnt matrix (through PLSR) for the modifier
        """
        # todo expand unit tests now that we have a real composer
        if feature.type not in self.entry_types:
            # ignore non-AN features
            return False

        modifier, head = feature.tokens
        assert ('J', 'N') == (modifier.pos, head.pos) or ('N', 'N') == (modifier.pos, head.pos)

        # if DocumentFeature('1-GRAM', (noun,)) not in self.unigram_source:
        if DocumentFeature.from_string(str(head)) not in self.unigram_source:
            # ignore ANs containing unknown nouns
            return False

        # ignore ANs containing unknown adjectives
        return str(modifier) in self.available_modifiers
Example #24
    def to_tsv(self, events_path, entries_path='', features_path='',
               entry_filter=lambda x: True, row_transform=lambda x: x):
        """
        Writes this thesaurus to a Byblo-compatible file like the one it was most likely read from. In the
        process converts all entries to a DocumentFeature, so all entries must be parsable into one. May reorder the
        features of each entry.

        :param events_path: file to write to
        :param entry_filter: called for every DocumentFeature that is an entry in this thesaurus. The vector will
         only be written if this callable returns True
        :param row_transform: callable, any transformation that might need to be done to each entry before converting
         it to a DocumentFeature. This is needed because some entries (e.g. african/J:amod-HEAD:leader) are not
         directly convertible (it needs to be african/J_leader/N). Use this if the entries cannot be converted to
         a DocumentFeature, e.g. if the data isn't PoS tagged.
        :return: the file name
        """
        rows = {i: DocumentFeature.from_string(row_transform(feat)) for (feat, i) in self.name2row.items()}
        write_vectors_to_disk(self.matrix.tocoo(), rows, self.columns, events_path,
                              features_path=features_path, entries_path=entries_path,
                              entry_filter=entry_filter)
        return events_path
Example #25
def test_left_right_compose_all(left_comp):
    original_matrix, original_cols, original_rows = left_comp.unigram_source.to_sparse_matrix()
    matrix, cols, rows = left_comp.compose_all(['cat/N_game/N',
                                                DocumentFeature.from_string('dog/N_game/N'),
                                                'cat/N_a/N', 'cat/N_b/N', 'cat/N_c/N', 'cat/N_d/N', ])

    # the columns should remain unchanged
    assert original_cols == cols
    # the first rows are for the unigrams that existed before composition: 7 of them
    assert_array_equal(original_matrix.A, matrix.A[:7, :])
    # two new rows should appear, one for each composed feature
    # this should be reflected in both the index and the matrix
    assert rows['cat/N_game/N'] == 7
    assert rows['dog/N_game/N'] == 8
    assert matrix.shape == (13, 7) == (len(rows), len(cols))
    assert_array_equal(matrix.A[7, :], left_comp.unigram_source.get_vector('cat/N').A.ravel())
    assert_array_equal(matrix.A[8, :], left_comp.unigram_source.get_vector('dog/N').A.ravel())

    for i in range(9, 12):
        assert_array_equal(matrix.A[i, :],
                           left_comp.unigram_source.get_vector('cat/N').A.ravel())
Example #26
 def filter_preextracted_features(self, feature_list):
     """
      Takes a list of features and keeps only those mentioned in the constructor parameters. This is a minor
      optimisation: extraction is a little slow, so we can extract tons of features in advance and then just
      filter them dynamically for each experiment
     :param feature_list:
     :return:
     """
     res = []
     for feat_str in feature_list:
         feat = DocumentFeature.from_string(feat_str)
         if feat.type == 'EMPTY':
             continue
         if feat.type == '1-GRAM' and feat.tokens[0].pos not in self.extract_unigram_features:
             continue
         if feat.type != '1-GRAM' and feat.type not in self.extract_phrase_features:
             continue
         if self.remove_features_with_NER and set(t.ner for t in feat.tokens) != {'O'}:
             continue
         if self.remove_pos:
             for token in feat.tokens:
                 token.pos = None
         res.append(feat)
     return res
Example #27
def do_svd(input_path, output_prefix,
           desired_counts_per_feature_type=[('N', 8), ('V', 4), ('J', 4), ('RB', 2), ('AN', 2)],
           reduce_to=[3, 10, 15], apply_to=None, write=3, use_hdf=True):
    """

    Performs truncated SVD. A copy of the trained sklearn SVD estimator will also be saved

    :param input_path: list of files containing vectors in TSV format. All vectors will be reduced together.
    :type input_path: list of file names or a Vectors object
    :param output_prefix: Where to output the reduced files. An extension will be added.
    :param desired_counts_per_feature_type: how many entries to keep of each DocumentFeature type, by frequency. This
     is the PoS tag for unigram features and the feature type otherwise. For instance, pass in [('N', 2), ('AN', 0)] to
     select 2 unigrams of PoS N and 0 bigrams of type adjective-noun. Types that are not explicitly given a positive
     desired count are treated as if the desired count is 0. If this is None, no filtering is performed.
    :param reduce_to: list of integers, what dimensionalities to reduce to
    :param apply_to: a file path. After SVD has been trained on input_path, it can be applied to
    apply_to. Output will be written to the same file
    :param write: once SVD is trained on A and applied to B, output either A, B or vstack(A, B). Use values 1,
    2, and 3 respectively. Default is 3.
    :param use_hdf: if true, store results as a pandas DF in HDF. This will enforce some constraints like not having
    duplicate entries in the index, which I deliberately break with some of the unit tests. This switch is the easiest
    way to avoid modifying the unit tests
    :type write: int
    :raise ValueError: If the loaded thesaurus is empty
    """
    if not 1 <= write <= 3:
        raise ValueError('value of parameter write must be 1, 2 or 3')

    if not isinstance(input_path, Vectors):
        thesaurus = Vectors.from_tsv(input_path, lowercasing=False)
    else:
        thesaurus = input_path

    if not thesaurus:
        raise ValueError('Empty thesaurus %r' % input_path)
    mat, _, rows, cols = filter_out_infrequent_entries(desired_counts_per_feature_type, thesaurus)
    if apply_to:
        cols = set(cols)
        if not isinstance(apply_to, Vectors):
            thes_to_apply_to = Vectors.from_tsv(apply_to, lowercasing=False,
                                                column_filter=lambda foo: foo in cols)
        else:
            thes_to_apply_to = apply_to
        # get the names of each thesaurus entry
        extra_rows = [x for x in thes_to_apply_to.keys()]
        # vectorize second matrix with the vocabulary (columns) of the first thesaurus to ensure shapes match
        # "project" second thesaurus into space of first thesaurus
        thesaurus.v.vocabulary_ = {x: i for i, x in enumerate(list(cols))}
        extra_matrix = thesaurus.v.transform([dict(fv) for fv in thes_to_apply_to.values()])
        # make sure the shape is right
        assert extra_matrix.shape[1] == mat.shape[1]

        if write == 3:
            # extend the list of names
            rows = list(rows) + [DocumentFeature.from_string(x) for x in extra_rows]
        elif write == 2:
            rows = [DocumentFeature.from_string(x) for x in extra_rows]
            # no need to do anything if write == 1

    for n_components in reduce_to:
        method, reduced_mat = _do_svd_single(mat, n_components)
        if not method:
            continue
        if apply_to:
            logging.info('Applying learned SVD transform to matrix of shape %r', extra_matrix.shape)
            # apply learned transform to new data
            if write == 3:
                # append to old data
                reduced_mat = np.vstack((reduced_mat, method.transform(extra_matrix)))
            elif write == 2:
                reduced_mat = method.transform(extra_matrix)

        path = '{}-SVD{}'.format(output_prefix, n_components)
        _write_to_disk(scipy.sparse.coo_matrix(reduced_mat), path, rows, use_hdf=use_hdf)
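
A hedged usage sketch of the routine above (paths and counts are placeholders); with the defaults, only the input vectors are reduced and written:

do_svd('observed_vectors.txt', 'reduced/observed',
       desired_counts_per_feature_type=[('N', 800), ('AN', 200)],
       reduce_to=[100, 300])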
Example #28
def train_grefenstette_multistep_composer(all_vectors_file, root_dir):
    """
    Train Grefenstette et al's multistep regression VO/SVO model
    Adapted from dissect's ex19.py
    :param all_vectors_file: file containing N, V, VO and SVO vectors
    :param root_dir: where to write temp files and output
    """
    mkdirs_if_not_exists(root_dir)
    vo_composer_output_file = join(root_dir, 'vo_comp.pkl')
    svo_composer_output_file = join(root_dir, 'svo_comp.pkl')

    filename = basename(all_vectors_file)
    noun_events_file = join(root_dir, '%s-onlyN.tmp' % filename)
    # verb_events_file = join(root_dir, '%s-onlyV.tmp' % filename)
    # vo_events_file = join(root_dir, '%s-onlyVO.tmp' % filename)
    svo_events_file = join(root_dir, '%s-onlySVO.tmp' % filename)

    # this has unigrams and observed phrases
    thes = Vectors.from_tsv(all_vectors_file)
    thes.to_tsv(noun_events_file,
                entry_filter=lambda x: x.type == '1-GRAM' and x.tokens[0].pos == 'N')
    _translate_byblo_to_dissect(noun_events_file)
    # thes.to_tsv(verb_events_file,
    # entry_filter=lambda x: x.type == '1-GRAM' and x.tokens[0].pos == 'V')
    # _translate_byblo_to_dissect(verb_events_file)
    # thes.to_tsv(vo_events_file,
    #             entry_filter=lambda x: x.type == 'VO')
    # _translate_byblo_to_dissect(vo_events_file)
    thes.to_tsv(svo_events_file,
                entry_filter=lambda x: x.type == 'SVO')
    _translate_byblo_to_dissect(svo_events_file)

    train_vo_data, train_v_data = [], []
    for phrase in thes.keys():
        df = DocumentFeature.from_string(phrase)
        if df.type == 'SVO':
            train_vo_data.append((str(df[1:]), str(df[0]), str(df)))
        if df.type == 'VO':
            train_v_data.append((str(df[0]), str(df[1]), str(df)))

    # logging.info('train_vo_data %r', len(train_vo_data))
    # logging.info('train_v_data %r', len(train_v_data))

    # load N and SVO spaces
    n_space = Space.build(data=noun_events_file + '.sm',
                          cols=noun_events_file + '.cols',
                          format="sm")

    svo_space = Space.build(data=svo_events_file + '.sm',
                            cols=svo_events_file + '.cols',
                            format="sm")

    logging.info("Input SVO training space:")
    logging.info(svo_space.id2row)
    # logging.info(svo_space.cooccurrence_matrix)

    # 1. train a model to learn VO functions on train data: VO N -> SVO
    logging.info("Step 1 training")
    vo_model = LexicalFunction(learner=RidgeRegressionLearner(), min_samples=2)  # Gref et al. 2013, §5 says 3
    vo_model.train(train_vo_data, n_space, svo_space)
    io_utils.save(vo_model, vo_composer_output_file)

    # 2. train a model to learn V functions on train data: V N -> VO
    # where VO space: function space learned in step 1
    logging.info("Step 2 training")
    vo_space = vo_model.function_space
    v_model = LexicalFunction(learner=RidgeRegressionLearner(), min_samples=2)
    v_model.train(train_v_data, n_space, vo_space)
    io_utils.save(v_model, svo_composer_output_file)
Example #29
def train_baroni_guevara_composers(all_vectors,
                                   ROOT_DIR,
                                   baroni_output_path, guevara_output_path,
                                   baroni_threshold=10):
    """

    :type all_vectors: str; path to vectors file containing both N and observed AN vectors
    :type ROOT_DIR: str; where to write temp files
    :type baroni_output_path: str; where to write pickled baroni composer
    :type guevara_output_path: str
    :type baroni_threshold: int
    """
    SVD_DIMS = 100
    baroni_training_phrase_types = {'AN', 'NN'}  # what kind of NPs to train Baroni composer for

    # prepare the input files to be fed into Dissect
    mkdirs_if_not_exists(ROOT_DIR)

    filename = basename(all_vectors)
    noun_events_file = join(ROOT_DIR, '%s-onlyN-SVD%d.tmp' % (filename, SVD_DIMS))
    NPs_events_file = join(ROOT_DIR, '%s-onlyPhrases-SVD%d.tmp' % (filename, SVD_DIMS))

    thes = Vectors.from_tsv(all_vectors, lowercasing=False)
    thes.to_tsv(noun_events_file,
                entry_filter=lambda x: x.type == '1-GRAM' and x.tokens[0].pos == 'N')
    _translate_byblo_to_dissect(noun_events_file)

    thes.to_tsv(NPs_events_file,
                entry_filter=lambda x: x.type in baroni_training_phrase_types,
                row_transform=lambda x: str(x).replace(' ', '_'))
    _translate_byblo_to_dissect(NPs_events_file)

    my_space = Space.build(data="{}.sm".format(noun_events_file),
                           rows="{}.rows".format(noun_events_file),
                           cols="{}.cols".format(noun_events_file),
                           format="sm")
    logging.info('Each unigram vector has dimensionality %r', my_space.element_shape)

    # create a peripheral space
    my_per_space = PeripheralSpace.build(my_space,
                                         data="{}.sm".format(NPs_events_file),
                                         rows="{}.rows".format(NPs_events_file),
                                         # The columns of the peripheral space have to be identical to those
                                         # in the core space (including their order)!
                                         cols="{}.cols".format(NPs_events_file),
                                         format="sm")
    logging.info('Each phrase vector has dimensionality %r', my_per_space.element_shape)

    # use the model to compose words in my_space
    all_data = []
    for phrase in my_per_space._row2id:
        # make sure there are only NPs here
        if DocumentFeature.from_string(phrase.replace(' ', '_')).type in baroni_training_phrase_types:
            adj, noun = phrase.split('_')
            all_data.append((adj, noun, '%s_%s' % (adj, noun)))

    # train a composition model on the data and save it
    baroni = LexicalFunction(min_samples=baroni_threshold, learner=RidgeRegressionLearner())
    guevara = FullAdditive(learner=RidgeRegressionLearner())
    for composer, out_path in zip([baroni, guevara],
                                  [baroni_output_path, guevara_output_path]):
        composer.train(all_data, my_space, my_per_space)
        io_utils.save(composer, out_path)
        logging.info('Saved trained composer to %s', out_path)
Example #30
    def from_tsv(cls, tsv_file, sim_threshold=0, include_self=False,
                 lowercasing=False, ngram_separator='_', allow_lexical_overlap=True,
                 row_filter=lambda x, y: True, column_filter=lambda x: True, max_len=50,
                 max_neighbours=1e8, merge_duplicates=False, immutable=True,
                 enforce_word_entry_pos_format=True, tar=False, **kwargs):
        """
        Create a Thesaurus by parsing a Byblo-compatible TSV file (events or sims).
        If duplicate values are encountered during parsing, only the latest will be kept.

        :param tsv_file: path to input TSV file
        :type tsv_file:  str
        :param sim_threshold: min similarity between an entry and its neighbour for the neighbour to be included
        :type sim_threshold: float
        :param include_self: whether to include self as nearest neighbour.
        :type include_self: bool
        :param lowercasing: if true, most of what is read will be lowercased (excluding PoS tags), so
            Cat/N -> cat/N. This is desirable when reading thesauri with this class. If False, no lowercasing
            will take place. This might be desirable when reading feature lists or already lowercased neighbour
            lists. FET + Byblo thesauri are already lowercased.
        :type lowercasing: bool
        :param ngram_separator: when n-gram entries are read in, what the individual tokens are separated by
        :param column_filter: A function that takes a string (column in the file) and returns whether or not
        the string should be kept
        :param row_filter: takes a string and its corresponding DocumentFeature and determines if it should be loaded.
        If `enforce_word_entry_pos_format` is `False`, the second parameter to this function will be `None`
        :param allow_lexical_overlap: whether neighbours/features are allowed to overlap lexically with the entry
        they are neighbours/features of. NOTE: THE BEHAVIOUR OF THIS PARAMETER IS SLIGHTLY DIFFERENT FROM THE EQUIVALENT
        IN VECTORS. SEE COMMENT THERE.
        :param max_len: maximum length (in characters) of permissible **entries**. Longer entries are ignored.
        :param max_neighbours: maximum neighbours per entry. This is applied AFTER the filtering defined by
        column_filter and allow_lexical_overlap is finished.
        :param merge_duplicates: whether to raise an error if multiple entries exist, or concatenate/add them together.
        The former is appropriate for `Thesaurus`, and the latter for `Vectors`
        :param enforce_word_entry_pos_format: if true, entries that are not in a `word/POS` format are skipped. This
        must be true for `allow_lexical_overlap` to work.
        :param tar: whether the file is compressed by running `tar -zcvf file.gz file.txt`. The archive is assumed to
        contain a single file.
        """

        if not tsv_file:
            raise ValueError("No thesaurus specified")

        to_return = dict()
        logging.info('Loading thesaurus %s from disk', tsv_file)
        gz_file = tsv_file + '.gz'
        if os.path.exists(gz_file) and tar:
            logging.warning('Using .gz version of thesaurus')
            tsv_file = gz_file
        if not allow_lexical_overlap:
            logging.warning('DISALLOWING LEXICAL OVERLAP')

        if not allow_lexical_overlap and not enforce_word_entry_pos_format:
            raise ValueError('allow_lexical_overlap requires entries to be converted to a DocumentFeature. '
                             'Please enable enforce_word_entry_pos_format')
        FILTERED = '___FILTERED___'.lower()

        if tar:
            tarf = tarfile.open(tsv_file, 'r')
            members = tarf.getmembers()
            if len(members) != 1:
                # todo this is odd, I don't know why it is happening
                # on some machines tar adds a second hidden file to the archive
                logging.warning('Tar archive contains multiple files: %r' % members)
                logging.warning('Using the last file in the tar')
            fhandle = tarf.extractfile(members[-1])
        else:
            fhandle = open(tsv_file)

        with fhandle as infile:
            for line in infile.readlines():
                if tar:
                    # this is a byte steam, needs to be decoded
                    tokens = line.decode('UTF8').strip().split('\t')
                else:
                    tokens = line.strip().split('\t')

                if len(tokens) % 2 == 0:
                    # must have an odd number of things, one for the entry
                    # and pairs for (neighbour, similarity)
                    logging.warning('Skipping dodgy line in thesaurus file: %s\n %s', tsv_file, line)
                    continue

                if tokens[0] != FILTERED:
                    key = DocumentFeature.smart_lower(tokens[0], ngram_separator, lowercasing)
                    dfkey = DocumentFeature.from_string(key) if enforce_word_entry_pos_format else None

                    if enforce_word_entry_pos_format and dfkey.type == 'EMPTY':
                        # do not load things in the wrong format, they'll get in the way later
                        logging.warning('%s is not in the word/POS format, skipping', tokens[0])
                        continue

                    if (not row_filter(key, dfkey)) or len(key) > max_len:
                        logging.warning('Skipping entry for %s', key)
                        continue

                    to_insert = [(DocumentFeature.smart_lower(word, ngram_separator, lowercasing), float(sim))
                                 for (word, sim) in walk_nonoverlapping_pairs(tokens, 1)
                                 if word.lower() != FILTERED and column_filter(word) and float(sim) > sim_threshold]

                    if not allow_lexical_overlap:
                        to_insert = cls.remove_overlapping_neighbours(dfkey, to_insert)

                    if len(to_insert) > max_neighbours:
                        to_insert = to_insert[:max_neighbours]

                    if include_self:
                        to_insert.insert(0, (key, 1.0))

                    # the steps above may filter out all neighbours of an entry. if this happens,
                    # do not bother adding it
                    if len(to_insert) > 0:
                        if key in to_return:  # this is a duplicate entry, merge it or raise an error
                            if merge_duplicates:
                                logging.warning('Multiple entries for "%s" found. Merging.', tokens[0])
                                c = Counter(dict(to_return[key]))
                                c.update(dict(to_insert))
                                to_return[key] = [(k, v) for k, v in c.items()]
                            else:
                                raise ValueError('Multiple entries for "%s" found.' % tokens[0])
                        else:
                            to_return[key] = to_insert
                    else:
                        logging.warning('Nothing survived filtering for %r', key)
        return Thesaurus(to_return, immutable=immutable)
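
Per the parsing loop above, each line holds an entry followed by tab-separated (neighbour, similarity) pairs, e.g. "cat/N<TAB>dog/N<TAB>0.8<TAB>tiger/N<TAB>0.7". A hedged usage sketch (the path is a placeholder):

th = Thesaurus.from_tsv('neighbours.txt',
                        sim_threshold=0.1,
                        max_neighbours=10,
                        allow_lexical_overlap=False)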