def test_with_different_separators():
    """Parsing honours custom PoS / n-gram separators set via recompile_pattern."""
    DocumentFeature.recompile_pattern(pos_separator='_', ngram_separator='!')
    expected_bigram = DocumentFeature('2-GRAM', (Token('very', 'RB'), Token('big', 'J')))
    assert expected_bigram == DocumentFeature.from_string('very_RB!big_J')

    # switch to dash-as-PoS-separator and space-as-ngram-separator
    DocumentFeature.recompile_pattern(pos_separator='-', ngram_separator=' ')
    assert DocumentFeature('1-GRAM', (Token('very', 'RB'),)) == DocumentFeature.from_string('very-RB')
    assert DocumentFeature('2-GRAM', (Token('very', 'RB'), Token('big', 'J'))) == \
        DocumentFeature.from_string('very-RB big-J')
def test_document_feature_slicing():
    """Indexing/slicing a DocumentFeature yields sub-features, preserving token offsets."""
    DocumentFeature.recompile_pattern()

    bigram = DocumentFeature.from_string('big/J_cat/N')
    assert bigram[0] == DocumentFeature.from_string('big/J')
    assert bigram[1] == DocumentFeature.from_string('cat/N')
    # the second token keeps its positional index (1) inside the n-gram
    assert bigram[1] == DocumentFeature('1-GRAM', (Token('cat', 'N', 1),))
    assert bigram[0:] == DocumentFeature.from_string('big/J_cat/N')

    unigram = DocumentFeature.from_string('cat/N')
    for view in (unigram[0], unigram[0:], unigram[:]):
        assert view == DocumentFeature.from_string('cat/N')
def test_smart_lower():
    """smart_lower lowercases word forms but never the PoS tags."""
    # the PoS of an n-gram entry must not be lowercased
    assert DocumentFeature.smart_lower('Cat/N') == 'cat/N'
    assert DocumentFeature.smart_lower('Cat/n') == 'cat/n'
    assert DocumentFeature.smart_lower('Red/J_CaT/N') == 'red/J_cat/N'

    # features are left untouched when lowercasing is disabled
    assert DocumentFeature.smart_lower('amod-DEP:former', lowercasing=False) == 'amod-DEP:former'

    # lowering also works with a non-default n-gram separator ...
    DocumentFeature.recompile_pattern(ngram_separator=' ')
    assert DocumentFeature.smart_lower('Red/J CaT/N') == 'red/J cat/N'

    # ... and with a non-default PoS separator
    DocumentFeature.recompile_pattern(pos_separator='-')
    assert DocumentFeature.smart_lower('Red-J') == 'red-J'
def test_document_feature_from_string():
    """from_string classifies well-formed strings and maps malformed ones to EMPTY."""
    DocumentFeature.recompile_pattern()

    # (expected feature type, expected tokens, input string)
    valid_cases = [
        ('AN', (Token('big', 'J'), Token('cat', 'N')), 'big/J_cat/N'),
        ('1-GRAM', (Token('cat', 'N'),), 'cat/N'),
        ('VO', (Token('chase', 'V'), Token('cat', 'N')), 'chase/V_cat/N'),
        ('NN', (Token('dog', 'N'), Token('cat', 'N')), 'dog/N_cat/N'),
        ('NN', (Token('dog', 'N'), Token('cat', 'N')), 'dog/n_cat/n'),
        ('3-GRAM', (Token('dog', 'V'), Token('chase', 'V'), Token('cat', 'V')), 'dog/V_chase/V_cat/V'),
        ('2-GRAM', (Token('chase', 'V'), Token('cat', 'V')), 'chase/V_cat/V'),
        ('SVO', (Token('dog', 'N'), Token('chase', 'V'), Token('cat', 'N')), 'dog/N_chase/V_cat/N'),
        ('2-GRAM', (Token('very', 'RB'), Token('big', 'J')), 'very/RB_big/J'),
        ('2-GRAM', (Token('very', None), Token('big', None)), 'very_big'),
    ]
    for feature_type, tokens, string in valid_cases:
        assert DocumentFeature(feature_type, tokens) == DocumentFeature.from_string(string)

    # anything malformed must come back as the EMPTY feature
    invalid_strings = ['a\/s/N', 'l\/h/N_clinton\/south/N', 'l\/h//N_clinton\/south/N',
                       'l//fasdlj/fasd/dfs/sdf', 'l//fasdlj/fasd/dfs\_/sdf',
                       'dfs\_/sdf', 'dfs\_/fadslk_/sdf', '/_dfs\_/sdf', '_/_/', '_///f_/',
                       'drop/V_bomb', '/V_/N', 'word1_word2//', 'mk8/N_6hp/N', 'a./N_gordon/N',
                       'great/J_c.d./N', '[email protected]/N', 'w1/N',
                       '-lrb-306-rrb- 569-1995/N', 'mumaharps.com/N', 'c+l+a+v+i+e+r+/N',
                       'b/N_o\o/N', '%/N', '|/V', '-lrb-852-rrb- 2829 6281/N']
    for invalid_string in invalid_strings:
        print(invalid_string)
        assert DocumentFeature('EMPTY', tuple()) == DocumentFeature.from_string(invalid_string)
def from_tsv(cls, tsv_file, sim_threshold=0, include_self=False,
             lowercasing=False, ngram_separator='_', pos_separator='/', allow_lexical_overlap=True,
             row_filter=lambda x, y: True, column_filter=lambda x: True, max_len=50,
             max_neighbours=1e8, merge_duplicates=False, immutable=True,
             enforce_word_entry_pos_format=True, **kwargs):
    """
    Create a Thesaurus by parsing a Byblo-compatible TSV file (events or sims).
    If duplicate values are encountered during parsing, only the latest will be kept.

    :param tsv_file: path to input TSV file. May be gzipped.
    :type tsv_file: str
    :param sim_threshold: min similarity between an entry and its neighbour for the neighbour to be included
    :type sim_threshold: float
    :param include_self: whether to include self as nearest neighbour.
    :type include_self: bool
    :param lowercasing: if true, most of what is read will be lowercased (excluding PoS tags), so
        Cat/N -> cat/N. This is desirable when reading thesauri with this class. If False, no lowercasing
        will take place. This might be desirable when reading feature lists or already lowercased neighbour
        lists. FET + Byblo thesauri are already lowercased.
    :type lowercasing: bool
    :param ngram_separator: When n_gram entries are read in, what are the individual tokens separated by
    :param column_filter: A function that takes a string (column in the file) and returns whether or not
        the string should be kept
    :param row_filter: takes a string and its corresponding DocumentFeature and determines if it should be
        loaded. If `enforce_word_entry_pos_format` is `False`, the second parameter to this function will
        be `None`
    :param allow_lexical_overlap: whether neighbours/features are allowed to overlap lexically with the
        entry they are neighbours/features of. NOTE: THE BEHAVIOUR OF THIS PARAMETER IS SLIGHTLY DIFFERENT
        FROM THE EQUIVALENT IN VECTORS. SEE COMMENT THERE.
    :param max_len: maximum length (in characters) of permissible **entries**. Longer entries are ignored.
    :param max_neighbours: maximum neighbours per entry. This is applied AFTER the filtering defined by
        column_filter and allow_lexical_overlap is finished.
    :param merge_duplicates: whether to raise en error if multiple entries exist, or concatenate/add them
        together. The former is appropriate for `Thesaurus`, and the latter for `Vectors`
    :param enforce_word_entry_pos_format: if true, entries that are not in a `word/POS` format are skipped.
        This must be true for `allow_lexical_overlap` to work.
    """
    if not tsv_file:
        raise ValueError("No thesaurus specified")

    DocumentFeature.recompile_pattern(pos_separator=pos_separator, ngram_separator=ngram_separator)
    to_return = dict()
    logging.info('Loading thesaurus %s from disk', tsv_file)

    if not allow_lexical_overlap:
        logging.warning('DISALLOWING LEXICAL OVERLAP')

    if not allow_lexical_overlap and not enforce_word_entry_pos_format:
        raise ValueError('allow_lexical_overlap requires entries to be converted to a DocumentFeature. '
                         'Please enable enforce_word_entry_pos_format')
    FILTERED = '___FILTERED___'.lower()

    gzipped = is_gzipped(tsv_file)
    if gzipped:
        logging.info('Attempting to read a gzipped file')
        fhandle = gzip.open(tsv_file)
    else:
        fhandle = open(tsv_file)

    with fhandle as infile:
        # stream line by line instead of materialising the whole file with readlines()
        for line in infile:
            if gzipped:
                # gzip.open yields a byte stream; decode before splitting
                tokens = line.decode('UTF8').strip().split('\t')
            else:
                tokens = line.strip().split('\t')
            if len(tokens) % 2 == 0:
                # must have an odd number of things, one for the entry
                # and pairs for (neighbour, similarity)
                logging.warning('Skipping dodgy line in thesaurus file: %s\n %s', tsv_file, line)
                continue

            if tokens[0] != FILTERED:
                key = DocumentFeature.smart_lower(tokens[0], lowercasing)
                dfkey = DocumentFeature.from_string(key) if enforce_word_entry_pos_format else None

                if enforce_word_entry_pos_format and dfkey.type == 'EMPTY':
                    # do not load things in the wrong format, they'll get in the way later
                    # logging.warning('%s is not in the word/POS format, skipping', tokens[0])
                    continue

                if (not row_filter(key, dfkey)) or len(key) > max_len:
                    logging.debug('Skipping entry for %s', key)
                    continue

                to_insert = [(DocumentFeature.smart_lower(word, lowercasing), float(sim))
                             for (word, sim) in walk_nonoverlapping_pairs(tokens, 1)
                             if word.lower() != FILTERED and column_filter(word) and float(sim) > sim_threshold]

                if not allow_lexical_overlap:
                    to_insert = cls.remove_overlapping_neighbours(dfkey, to_insert)

                if len(to_insert) > max_neighbours:
                    # max_neighbours may be a float (the default is 1e8); a list slice
                    # requires an integer, so coerce to avoid a TypeError here
                    to_insert = to_insert[:int(max_neighbours)]

                if include_self:
                    to_insert.insert(0, (key, 1.0))

                # the steps above may filter out all neighbours of an entry. if this happens,
                # do not bother adding it
                if len(to_insert) > 0:
                    if key in to_return:
                        # this is a duplicate entry, merge it or raise an error
                        if merge_duplicates:
                            logging.debug('Multiple entries for "%s" found. Merging.', tokens[0])
                            c = Counter(dict(to_return[key]))
                            c.update(dict(to_insert))
                            to_return[key] = [(k, v) for k, v in c.items()]
                        else:
                            raise ValueError('Multiple entries for "%s" found.' % tokens[0])
                    else:
                        to_return[key] = to_insert
                else:
                    logging.warning('Nothing survived filtering for %r', key)

    return Thesaurus(to_return, immutable=immutable)
def test_token_to_string():
    """str() of tokens/features uses the currently configured PoS separator."""
    first_token = DocumentFeature.from_string('dog/J').tokens[0]
    assert 'dog/J' == str(first_token)

    # with a dash separator the string form round-trips as well
    DocumentFeature.recompile_pattern(pos_separator='-')
    assert 'dog-J' == str(DocumentFeature.from_string('dog-J'))
    # restore the default pattern so other tests are unaffected
    DocumentFeature.recompile_pattern()