def add_word_to_stringstore(self, word, path):
    """Add ``word`` to the string store persisted at ``path``.

    The store is loaded from ``path``, extended with ``word`` and written
    back. If loading or adding fails (for example because nothing has been
    saved at ``path`` yet), a new store containing only ``word`` is created
    instead. The resulting store is kept on ``self.stringstore``.

    word (str): The word to add.
    path (str / Path): Location the store is read from and written to.
    """
    try:
        self.stringstore = StringStore().from_disk(path)
        self.stringstore.add(word)
    except Exception:
        # Best effort: fall back to a fresh store. The word is wrapped in a
        # list because StringStore iterates over its ``strings`` argument; a
        # bare string would be added character by character.
        self.stringstore = StringStore([word])
    self.stringstore.to_disk(path)
def __init__(
    self,
    shape: tuple = (1000, 128),
    strings: StringStore = None,
    senses: List[str] = [],
    vectors_name: str = "sense2vec",
    overrides: Dict[str, str] = SimpleFrozenDict(),
):
    """Initialize the Sense2Vec object.

    shape (tuple): The vector shape.
    strings (StringStore): Optional string store. Will be created if it
        doesn't exist.
    senses (list): Optional list of all available senses. Used in methods
        that generate the best sense or other senses. The list is copied,
        so later changes to the argument do not affect this object.
    vectors_name (unicode): Optional name to assign to the Vectors object.
    overrides (dict): Optional custom functions to use, mapped to names
        registered via the registry, e.g. {"make_key": "custom_make_key"}.
    RETURNS (Sense2Vec): The newly constructed object.
    """
    self.vectors = Vectors(shape=shape, name=vectors_name)
    self._row2key = None
    self.strings = StringStore() if strings is None else strings
    self.freqs: Dict[int, int] = {}
    self.cache = None
    self.cfg: Dict[str, Any] = {
        # Copy so the shared mutable default (and any caller-owned list) is
        # never aliased by the config of several instances.
        "senses": [] if senses is None else list(senses),
        "make_key": "default",
        "split_key": "default",
    }
    self.cfg.update(overrides)
def test_dump_load(sstore):
    """Dump a store to a temporary text file and load it into a new store."""
    key = sstore[u'qqqqq']
    restored = StringStore()
    with tempfile.TemporaryFile('w+t') as handle:
        sstore.dump(handle)
        # Rewind so the load starts at the beginning of what was written.
        handle.seek(0)
        restored.load(handle)
    assert restored[key] == u'qqqqq'
def add_stringstore_to_vocab_temporarely(self, file):
    """Load a string store from ``file`` and mark its words as in-vocabulary.

    The loaded store is kept on ``self.stringstore``. Every string in it is
    looked up in ``self.nlp.vocab`` and its ``is_oov`` flag is set to False.
    Failures are reported on stdout rather than raised.

    file (str / Path): Location of the serialized string store.
    """
    try:
        self.stringstore = StringStore().from_disk(file)
        for word in self.stringstore:
            lex = self.nlp.vocab[word]
            lex.is_oov = False
    except Exception:
        # Format instead of concatenating: ``file`` may be a Path, and
        # ``str + Path`` would raise a TypeError inside this handler.
        print("cannot read stringstore in file {}".format(file))
def test_dump_load(sstore):
    """Dump a store to a JSON file on disk and load it into a new store."""
    import os
    import shutil
    import tempfile

    id_ = sstore[u'qqqqq']
    # Use a private temporary directory instead of a fixed /tmp path so the
    # test is portable, cannot collide with parallel runs and cleans up.
    tmp_dir = tempfile.mkdtemp()
    try:
        loc = os.path.join(tmp_dir, 'sstore.json')
        with io.open(loc, 'w', encoding='utf8') as file_:
            sstore.dump(file_)
        new_store = StringStore()
        with io.open(loc, 'r', encoding='utf8') as file_:
            new_store.load(file_)
    finally:
        shutil.rmtree(tmp_dir, ignore_errors=True)
    assert new_store[id_] == u'qqqqq'
def from_bytes(self, bytes_data: bytes, exclude: Sequence[str] = tuple()):
    """Restore this object's state from a msgpack bytestring.

    bytes_data (bytes): The data to load.
    exclude (list): Names of serialization fields to exclude.
    RETURNS (Sense2Vec): The loaded object.
    """
    payload = srsly.msgpack_loads(bytes_data)
    self.vectors = Vectors().from_bytes(payload["vectors"])
    # Frequencies arrive as a list of (key, freq) pairs.
    self.freqs = dict(payload.get("freqs", []))
    self.cfg.update(payload.get("cfg", {}))
    if "strings" in exclude or "strings" not in payload:
        return self
    self.strings = StringStore().from_bytes(payload["strings"])
    return self
def process(batch_id, inputs, output_dir, lang, n_threads, batch_size,
            min_ngram, max_ngram):
    """Count subtree and noun-chunk phrases in a batch of texts.

    Each text in ``inputs`` is parsed, and the text of every token subtree
    and noun chunk whose length lies in ``[min_ngram, max_ngram]`` is
    counted. Phrases seen at least 5 times are written, one
    ``count<TAB>phrase`` per line, to ``batch{batch_id}.st.freq`` (subtrees)
    and ``batch{batch_id}.np.freq`` (noun chunks) inside ``output_dir``.
    Subtree phrases containing ``!LONGWORD!`` are not written.

    Args:
        batch_id: identifier used in log messages and output file names
        inputs: iterable of texts passed to the pipeline
        output_dir (str): directory receiving the two frequency files
        lang (str): language code, ``"en"`` or ``"id"`` (case-insensitive)
        n_threads (int): forwarded to ``pipe``
        batch_size (int): forwarded to ``pipe``; also the logging interval
        min_ngram (int): minimum phrase length in tokens
        max_ngram (int): maximum phrase length in tokens

    Raises:
        ValueError: if ``lang`` is not a supported language code
    """
    logging.info('Processing batch_id: {}'.format(batch_id))

    # Resolve the pipeline first so an unsupported language fails with a
    # clear error instead of an unbound-name error further down.
    lang_code = lang.lower()
    if lang_code == "en":
        from spacy.en import English
        NLU = English()
    elif lang_code == "id":
        from spacy.id import Indonesian
        NLU = Indonesian()
    else:
        raise ValueError(
            'Unsupported language: {!r} (expected "en" or "id")'.format(lang))
    NLU.matcher = None

    subtrees = PreshCounter()
    subtrees_string_map = StringStore()
    noun_chunks = PreshCounter()
    noun_chunks_string_map = StringStore()

    docs = NLU.pipe(inputs, batch_size=batch_size, n_threads=n_threads)
    for i, doc in enumerate(docs):
        for tok in doc:
            # Materialise once: the subtree is needed for both its length
            # and its text.
            subtree = list(tok.subtree)
            if min_ngram <= len(subtree) <= max_ngram:
                st = ''.join(
                    [rep_text(t.text_with_ws) for t in subtree]).strip()
                orth = subtrees_string_map[st]
                subtrees.inc(orth, 1)
        for np in doc.noun_chunks:
            if min_ngram <= len(np) <= max_ngram:
                st = ''.join([rep_text(t.text_with_ws) for t in np]).strip()
                orth = noun_chunks_string_map[st]
                noun_chunks.inc(orth, 1)
        if i % batch_size == 0:
            logging.info('Processing batch_id: {}, doc: {}'.format(
                batch_id, i))

    output_fname = path.join(output_dir, 'batch{}.st.freq'.format(batch_id))
    with io.open(output_fname, 'w', encoding='utf-8') as out:
        for orth, count in subtrees:
            st = subtrees_string_map[orth]
            if count >= 5 and '!LONGWORD!' not in st:
                out.write('{}\t{}\n'.format(count, st))

    output_fname = path.join(output_dir, 'batch{}.np.freq'.format(batch_id))
    with io.open(output_fname, 'w', encoding='utf-8') as out:
        for orth, count in noun_chunks:
            if count >= 5:
                st = noun_chunks_string_map[orth]
                out.write('{}\t{}\n'.format(count, st))
def build_doc_term_matrix(terms_lists, weighted=True):
    '''
    Build a sparse document/term matrix from sequences of terms.

    Each item of ``terms_lists`` becomes one row. Empty terms are skipped.
    When ``weighted`` is true, the term at position ``i`` of its list
    contributes ``1 / (i**2 + 2)``; otherwise each occurrence contributes 1.

    :param terms_lists: iterable of term sequences, one per document
    :param weighted: weight each term by its position in the sequence
    :return: CSR sparse matrix (float values if weighted, int otherwise)
    '''
    stringstore = StringStore()
    data, rows, cols = [], [], []
    for row_idx, terms_list in enumerate(terms_lists):
        for position, term in enumerate(terms_list):
            if not term:
                continue
            # The store's ids are shifted down by one to obtain column ids.
            cols.append(stringstore[term] - 1)
            data.append(1. / (position**2 + 2) if weighted else 1)
            rows.append(row_idx)
    value_type = float if weighted else int
    coo = sp.coo_matrix((data, (rows, cols)), dtype=value_type)
    return coo.tocsr()
def test_serialize_stringstore_roundtrip_disk(strings1, strings2):
    """Stores written to disk and read back keep their contents."""
    original_a = StringStore(strings=strings1)
    original_b = StringStore(strings=strings2)
    with make_tempdir() as tmp:
        location_a = tmp / "strings1"
        location_b = tmp / "strings2"
        original_a.to_disk(location_a)
        original_b.to_disk(location_b)
        loaded_a = StringStore().from_disk(location_a)
        loaded_b = StringStore().from_disk(location_b)
        assert list(loaded_a) == list(original_a)
        assert list(loaded_b) == list(original_b)
        # The loaded stores must agree exactly when the inputs did.
        same_contents = list(loaded_a) == list(loaded_b)
        if strings1 == strings2:
            assert same_contents
        else:
            assert not same_contents
def docs_to_gensim(spacy_docs, spacy_vocab, lemmatize=True,
                   filter_stops=True, filter_punct=True, filter_nums=False):
    """
    Build a gensim dictionary and bag-of-words corpus from ``spacy.Doc`` s.

    Args:
        spacy_docs (list(``spacy.Doc``))
        spacy_vocab (``spacy.Vocab``)
        lemmatize (bool): if True, count words by lemma; otherwise, count
            them by their original form as it appears in the doc
        filter_stops (bool): if True, remove stop words from word list
        filter_punct (bool): if True, remove punctuation from word list
        filter_nums (bool): if True, remove numbers from word list

    Returns:
        :class:`gensim.Dictionary <gensim.corpora.dictionary.Dictionary>`:
            integer word ID to word string mapping
        list(list((int, int))): list of bag-of-words documents, where each
            doc is a list of (integer word ID, word count) 2-tuples
    """
    gdict = Dictionary()
    gcorpus = []
    stringstore = StringStore()
    doc_freqs = Counter()

    for spacy_doc in spacy_docs:
        count_attr = attrs.LEMMA if lemmatize is True else attrs.ORTH
        bow = []
        for tok_id, count in spacy_doc.count_by(count_attr).items():
            lex = spacy_vocab[tok_id]
            if filter_stops is True and lex.is_stop:
                continue
            if filter_punct is True and lex.is_punct:
                continue
            if filter_nums is True and lex.like_num:
                continue
            bow.append((stringstore[lex.orth_], count))
        bow.sort(key=itemgetter(0))

        doc_freqs.update(word_id for word_id, _ in bow)
        gdict.num_docs += 1
        gdict.num_pos += sum(count for _, count in bow)
        gdict.num_nnz += len(bow)
        gcorpus.append(bow)

    token2id = {}
    for idx, string in enumerate(stringstore):
        token2id[string] = idx
    gdict.token2id = token2id
    gdict.dfs = dict(doc_freqs)

    return (gdict, gcorpus)
def sort(path: Path):
    """Rewrite the string store file at ``path`` with its strings sorted.

    For the original code of StringStore.to_disk(), see
    https://github.com/explosion/spaCy/blob/53a3b967ac704ff0a67a7102ede6d916e2a4545a/spacy/strings.pyx#L219-L227.
    """
    ordered = sorted(StringStore().from_disk(path))
    srsly.write_json(path, ordered)
def from_disk(self, path: Union[Path, str], exclude: Sequence[str] = tuple()):
    """Restore this object's state from a directory.

    path (unicode / Path): The path to load from.
    exclude (list): Names of serialization fields to exclude.
    RETURNS (Sense2Vec): The loaded object.
    """
    root = Path(path)
    self.vectors = Vectors().from_disk(root)
    self.cfg.update(srsly.read_json(root / "cfg"))

    # Frequencies and strings are optional files inside the directory.
    freqs_file = root / "freqs.json"
    if freqs_file.exists():
        self.freqs = dict(srsly.read_json(freqs_file))

    strings_file = root / "strings.json"
    if "strings" not in exclude and strings_file.exists():
        self.strings = StringStore().from_disk(strings_file)
    return self
def test_pickle_string_store(text1, text2):
    """A pickled store resolves the same strings to the same values."""
    original = StringStore()
    expected = {text: original[text] for text in (text1, text2)}
    restored = srsly.pickle_loads(srsly.pickle_dumps(original, protocol=-1))
    assert restored[text1] == expected[text1]
    assert restored[text2] == expected[text2]
    assert len(original) == len(restored)
def docs_to_gensim(spacy_docs, spacy_vocab, lemmatize=True, lowercase=False,
                   filter_stops=True, filter_punct=True, filter_nums=False):
    """
    Turn a sequence of ``spacy.Doc`` s into a gensim-friendly corpus plus a
    string that can be loaded into a :class:`gensim.corpora.Dictionary`.

    Args:
        spacy_docs (Iterable[``spacy.Doc``])
        spacy_vocab (``spacy.Vocab``)
        lemmatize (bool): if True, use lemmatized strings for words
        lowercase (bool): if True (and ``lemmatize`` is False), use
            lowercased strings for words
        filter_stops (bool): if True, remove stop words from word list
        filter_punct (bool): if True, remove punctuation from word list
        filter_nums (bool): if True, remove numbers from word list

    Returns:
        str: words, their integer ids, and their document frequencies in
            ``spacy_docs``, as a string formatted like
            `id[TAB]word[TAB]df[NEWLINE]`; when written to file, can be
            converted into a gensim ``Dictionary`` via
            :meth:`gensim.corpora.Dictionary.load_from_text()`
        List[List[Tuple[int, int]]]: list of documents as bags-of-words,
            where each doc is a list of (integer word ID, word count)
            2-tuples
    """
    if lemmatize is True:
        count_by = attrs.LEMMA
    elif lowercase is True:
        count_by = attrs.LOWER
    else:
        count_by = attrs.ORTH

    gcorpus = []
    stringstore = StringStore()
    doc_freqs = Counter()

    for spacy_doc in spacy_docs:
        bow = []
        for tok_id, count in spacy_doc.count_by(count_by).items():
            lex = spacy_vocab[tok_id]
            # Whitespace is always dropped; the other filters are optional.
            if lex.is_space:
                continue
            if filter_stops is True and lex.is_stop:
                continue
            if filter_punct is True and lex.is_punct:
                continue
            if filter_nums is True and lex.like_num:
                continue
            bow.append((stringstore[lex.orth_], count))
        bow.sort(key=itemgetter(0))

        doc_freqs.update(word_id for word_id, _ in bow)
        gcorpus.append(bow)

    # One line per word, ordered by the word string.
    entries = sorted(enumerate(stringstore), key=itemgetter(1))
    lines = ['{}\t{}\t{}'.format(idx, string, doc_freqs[idx])
             for idx, string in entries]
    gdict_str = '\n'.join(lines)

    return (gdict_str, gcorpus)
def test_unapplicable_trees():
    """Applying an edit tree to a form it does not fit yields None."""
    strings = StringStore()
    trees = EditTrees(strings)
    tree3 = trees.add("deelt", "delen")

    # Replacement fails.
    assert trees.apply(tree3, "deeld") is None

    # Suffix + prefix are too large.
    assert trees.apply(tree3, "de") is None
def test_dutch():
    """Edit trees for two Dutch forms of "delen" have the expected shape."""
    trees = EditTrees(StringStore())

    deelt_tree = trees.add("deelt", "delen")
    deelt_repr = trees.tree_to_str(deelt_tree)
    assert deelt_repr == "(m 0 3 () (m 0 2 (s '' 'l') (s 'lt' 'n')))"

    gedeeld_tree = trees.add("gedeeld", "delen")
    gedeeld_repr = trees.tree_to_str(gedeeld_tree)
    assert gedeeld_repr == (
        "(m 2 3 (s 'ge' '') (m 0 2 (s '' 'l') (s 'ld' 'n')))"
    )
def __init__(self):
    """Create the string store, load the four LTP models and set up config.

    The models (POS tagger, parser, named entity recognizer and semantic
    role labeller) are read from the ``ltp_data_v3.4.0`` directory below the
    configured ``conf_dir``.
    """
    import os
    from sagas.conf.conf import cf
    from pyltp import Postagger, Parser, NamedEntityRecognizer, SementicRoleLabeller
    from spacy.strings import StringStore

    self.stringstore = StringStore()
    model_dir = f'{cf.conf_dir}/ai/ltp/ltp_data_v3.4.0'

    def model_file(name):
        # Full path of a model file inside the LTP data directory.
        return os.path.join(model_dir, name)

    self.postagger = Postagger()
    self.postagger.load(model_file("pos.model"))

    self.parser = Parser()
    self.parser.load(model_file('parser.model'))

    self.recognizer = NamedEntityRecognizer()
    self.recognizer.load(model_file("ner.model"))

    self.labeller = SementicRoleLabeller()
    self.labeller.load(model_file("pisrl.model"))

    self.conf = AnalConf('zh')
    self.conf.setup(self)
def test_encode_decode(self):
    """Strings map to distinct ids, and ids map back to their strings."""
    store = StringStore()
    ids = {word: store[word] for word in (u'Hello', u'World')}
    self.assertNotEqual(ids[u'Hello'], ids[u'World'])
    # id -> string
    self.assertEqual(store[ids[u'Hello']], u'Hello')
    self.assertEqual(store[ids[u'World']], u'World')
    # string -> id (repeat lookups are stable)
    self.assertEqual(store[u'Hello'], ids[u'Hello'])
    self.assertEqual(store[u'World'], ids[u'World'])
def test_serialize_stringstore_roundtrip_bytes(strings1, strings2):
    """Stores serialized to bytes and restored keep their contents."""
    store_a = StringStore(strings=strings1)
    store_b = StringStore(strings=strings2)
    bytes_a = store_a.to_bytes()
    bytes_b = store_b.to_bytes()

    # Equal inputs must serialize identically, different inputs must not.
    identical = bytes_a == bytes_b
    if strings1 == strings2:
        assert identical
    else:
        assert not identical

    # Loading into the existing store leaves its serialization unchanged.
    store_a = store_a.from_bytes(bytes_a)
    assert store_a.to_bytes() == bytes_a

    # Loading into a fresh store reproduces bytes and contents.
    fresh = StringStore().from_bytes(bytes_a)
    assert fresh.to_bytes() == bytes_a
    assert list(fresh) == strings1
def word_movers(doc1, doc2, metric='cosine'):
    """
    Measure the semantic similarity between two documents using
    Word Movers Distance.

    Args:
        doc1 (``textacy.Doc`` or ``spacy.Doc``)
        doc2 (``textacy.Doc`` or ``spacy.Doc``)
        metric ({'cosine', 'euclidean', 'l1', 'l2', 'manhattan'})

    Returns:
        float: similarity between `doc1` and `doc2` in the interval
            [0.0, 1.0], where larger values correspond to more similar
            documents

    References:
        Ofir Pele and Michael Werman, "A linear time histogram metric for
            improved SIFT matching," in Computer Vision - ECCV 2008,
            Marseille, France, 2008.
        Ofir Pele and Michael Werman, "Fast and robust earth mover's
            distances," in Proc. 2009 IEEE 12th Int. Conf. on Computer
            Vision, Kyoto, Japan, 2009.
        Kusner, Matt J., et al. "From word embeddings to document
            distances." Proceedings of the 32nd International Conference on
            Machine Learning (ICML 2015). 2015.
            http://jmlr.org/proceedings/papers/v37/kusnerb15.pdf
    """
    stringstore = StringStore()

    # Collect one vector per distinct word, in order of first appearance.
    word_vecs = []
    n_seen = 0
    for word in itertoolz.concatv(extract.words(doc1), extract.words(doc2)):
        if not word.has_vector:
            continue
        # stringstore[0] always empty space
        if stringstore[word.text] - 1 == n_seen:
            word_vecs.append(word.vector)
            n_seen += 1

    distance_mat = pairwise_distances(
        np.array(word_vecs), metric=metric).astype(np.double)
    distance_mat /= distance_mat.max()

    def _normalized_counts(doc):
        # Word-count histogram over the store's ids, scaled to sum to 1.
        counts = collections.Counter(
            stringstore[word.text] - 1
            for word in extract.words(doc) if word.has_vector)
        hist = np.array(
            [counts[word_idx] for word_idx in range(len(stringstore))]
        ).astype(np.double)
        hist /= hist.sum()
        return hist

    vec1 = _normalized_counts(doc1)
    vec2 = _normalized_counts(doc2)

    return 1.0 - emd(vec1, vec2, distance_mat)
def merge_counts(locs, out_loc):
    """Sum per-word counts from several files and write the totals.

    Every input file holds one ``freq<TAB>word`` entry per line. The merged
    totals are written to ``out_loc`` as ``count<TAB>word`` lines.

    locs: iterable of input file paths
    out_loc: path of the output file
    """
    string_map = StringStore()
    counts = PreshCounter()
    for loc in locs:
        with io.open(loc, 'r', encoding='utf8') as infile:
            for line in infile:
                # Split on the first tab only, so words may contain tabs.
                freq, word = line.strip().split('\t', 1)
                counts.inc(string_map[word], int(freq))
    with io.open(out_loc, 'w', encoding='utf8') as outfile:
        for orth, count in counts:
            outfile.write('%d\t%s\n' % (count, string_map[orth]))
def test_from_to_bytes():
    """Edit trees survive a round trip through bytes."""
    strings = StringStore()
    pairs = [("deelt", "delen"), ("gedeeld", "delen")]

    trees = EditTrees(strings)
    for form, lemma in pairs:
        trees.add(form, lemma)

    restored = EditTrees(strings)
    restored.from_bytes(trees.to_bytes())

    # Verify that the nodes did not change.
    assert len(trees) == len(restored)
    for node in range(len(trees)):
        assert trees.tree_to_str(node) == restored.tree_to_str(node)

    # Reinserting the same trees should not add new nodes.
    for form, lemma in pairs:
        restored.add(form, lemma)
    assert len(trees) == len(restored)
def test_from_to_disk():
    """Edit trees survive a round trip through a file on disk."""
    strings = StringStore()
    pairs = [("deelt", "delen"), ("gedeeld", "delen")]

    trees = EditTrees(strings)
    for form, lemma in pairs:
        trees.add(form, lemma)

    restored = EditTrees(strings)
    with make_tempdir() as tmp:
        target = tmp / "edit_trees.bin"
        trees.to_disk(target)
        restored = restored.from_disk(target)

    # Verify that the nodes did not change.
    assert len(trees) == len(restored)
    for node in range(len(trees)):
        assert trees.tree_to_str(node) == restored.tree_to_str(node)

    # Reinserting the same trees should not add new nodes.
    for form, lemma in pairs:
        restored.add(form, lemma)
    assert len(trees) == len(restored)
def morphology():
    """Return a Morphology with two feature strings already added."""
    morph = Morphology(StringStore())
    for features in ("Feat1=Val1|Feat2=Val2", "Feat3=Val3|Feat4=Val4"):
        morph.add(features)
    return morph
def test_create(self):
    """A Morphology can be constructed from an empty store and lemmatizer."""
    strings = StringStore()
    # Built once; the original constructed an identical lemmatizer twice
    # and discarded the first.
    lemmatizer = Lemmatizer({}, {}, {})
    Morphology(strings, {}, lemmatizer)
class Sense2Vec(object):
    """Table of vectors keyed by sense-annotated strings.

    Wraps a Vectors table and a StringStore, and additionally keeps
    per-key frequency counts, an optional cache of nearest neighbours and a
    config dict holding the available senses and the names of the key
    functions.
    """

    def __init__(
        self,
        shape: tuple = (1000, 128),
        strings: StringStore = None,
        senses: List[str] = [],
        vectors_name: str = "sense2vec",
        overrides: Dict[str, str] = SimpleFrozenDict(),
    ):
        """Initialize the Sense2Vec object.

        shape (tuple): The vector shape.
        strings (StringStore): Optional string store. Will be created if it
            doesn't exist.
        senses (list): Optional list of all available senses. Used in
            methods that generate the best sense or other senses. The list
            is copied, so later changes to the argument do not affect this
            object.
        vectors_name (unicode): Optional name to assign to the Vectors
            object.
        overrides (dict): Optional custom functions to use, mapped to names
            registered via the registry, e.g.
            {"make_key": "custom_make_key"}.
        RETURNS (Sense2Vec): The newly constructed object.
        """
        self.vectors = Vectors(shape=shape, name=vectors_name)
        self._row2key = None
        self.strings = StringStore() if strings is None else strings
        self.freqs: Dict[int, int] = {}
        self.cache = None
        self.cfg: Dict[str, Any] = {
            # Copy so the shared mutable default (and any caller-owned
            # list) is never aliased by the config of several instances.
            "senses": [] if senses is None else list(senses),
            "make_key": "default",
            "split_key": "default",
        }
        self.cfg.update(overrides)

    @property
    def senses(self) -> Sequence[str]:
        """RETURNS (list): The available senses."""
        return self.cfg.get("senses", [])

    @property
    def frequencies(self) -> List[Tuple[str, int]]:
        """RETURNS (list): The (key, freq) tuples by frequency, descending."""
        freqs = [(self.strings[k], s) for k, s in self.freqs.items() if s is not None]
        return sorted(freqs, key=lambda item: item[1], reverse=True)

    def __len__(self) -> int:
        """RETURNS (int): The number of rows in the vectors table."""
        return len(self.vectors)

    def __contains__(self, key: Union[str, int]) -> bool:
        """Check if a key is in the vectors table.

        key (unicode / int): The key to look up.
        RETURNS (bool): Whether the key is in the table.
        """
        key = self.ensure_int_key(key)
        return key in self.vectors

    def __getitem__(self, key: Union[str, int]) -> Union[numpy.ndarray, None]:
        """Retrieve a vector for a given key. Returns None if the key is not
        in the table.

        key (unicode / int): The key to look up.
        RETURNS (numpy.ndarray): The vector.
        """
        key = self.ensure_int_key(key)
        if key in self.vectors:
            return self.vectors[key]
        return None

    def __setitem__(self, key: Union[str, int], vector: numpy.ndarray):
        """Set a vector for a given key. Will raise an error if the key
        doesn't exist.

        key (unicode / int): The key.
        vector (numpy.ndarray): The vector to set.
        """
        key = self.ensure_int_key(key)
        if key not in self.vectors:
            raise ValueError(f"Can't find key {key} in table")
        self.vectors[key] = vector
        # The row mapping is rebuilt lazily on next access.
        self._row2key = None

    def __iter__(self):
        """YIELDS (tuple): String key and vector pairs in the table."""
        yield from self.items()

    def items(self):
        """YIELDS (tuple): String key and vector pairs in the table."""
        for key, value in self.vectors.items():
            yield self.strings[key], value

    def keys(self):
        """YIELDS (unicode): The string keys in the table."""
        for key in self.vectors.keys():
            yield self.strings[key]

    def values(self):
        """YIELDS (numpy.ndarray): The vectors in the table."""
        yield from self.vectors.values()

    @property
    def row2key(self):
        """RETURNS (dict): Mapping of vector row to integer key, built lazily
        from the vectors' key-to-row table."""
        if not self._row2key:
            self._row2key = {
                row: key for key, row in self.vectors.key2row.items()
            }
        return self._row2key

    @property
    def make_key(self) -> Callable:
        """Get the function to make keys."""
        return registry.make_key.get(self.cfg["make_key"])

    @property
    def split_key(self) -> Callable:
        """Get the function to split keys."""
        return registry.split_key.get(self.cfg["split_key"])

    def add(self, key: Union[str, int], vector: numpy.ndarray, freq: int = None):
        """Add a new vector to the table.

        key (unicode / int): The key to add.
        vector (numpy.ndarray): The vector to add.
        freq (int): Optional frequency count.
        """
        if not isinstance(key, int):
            key = self.strings.add(key)
        self.vectors.add(key, vector=vector)
        if freq is not None:
            self.set_freq(key, freq)
        self._row2key = None

    def get_freq(self, key: Union[str, int], default=None) -> Union[int, None]:
        """Get the frequency count for a given key.

        key (unicode / int): The key to look up.
        default: Default value to return if no frequency is found.
        RETURNS (int): The frequency count.
        """
        key = self.ensure_int_key(key)
        return self.freqs.get(key, default)

    def set_freq(self, key: Union[str, int], freq: int):
        """Set a frequency count for a given key.

        key (unicode / int): The key to set the count for.
        freq (int): The frequency count.
        """
        if not isinstance(freq, int):
            raise ValueError(
                f"Invalid frequency count: {repr(freq)} for '{key}'")
        key = self.ensure_int_key(key)
        self.freqs[key] = freq

    def ensure_int_key(self, key: Union[str, int]) -> int:
        """Ensure that a key is an int by looking it up in the string store.

        Note that string keys are added to the string store as a side
        effect of the lookup.

        key (unicode / int): The key.
        RETURNS (int): The integer key.
        """
        return key if isinstance(key, int) else self.strings.add(key)

    def similarity(
        self,
        keys_a: Union[Sequence[Union[str, int]], str, int],
        keys_b: Union[Sequence[Union[str, int]], str, int],
    ) -> float:
        """Make a semantic similarity estimate of two keys or two sets of
        keys. The default estimate is cosine similarity using an average of
        vectors.

        keys_a (unicode / int / iterable): The string or integer key(s).
        keys_b (unicode / int / iterable): The other string or integer
            key(s).
        RETURNS (float): The similarity score.
        """
        if isinstance(keys_a, (str, int)):
            keys_a = [keys_a]
        if isinstance(keys_b, (str, int)):
            keys_b = [keys_b]
        average_a = numpy.vstack([self[key] for key in keys_a]).mean(axis=0)
        average_b = numpy.vstack([self[key] for key in keys_b]).mean(axis=0)
        return cosine_similarity(average_a, average_b)

    def most_similar(
        self,
        keys: Union[Sequence[Union[str, int]], str, int],
        n: int = 10,
        batch_size: int = 16,
    ) -> List[Tuple[str, float]]:
        """Get the most similar entries in the table. If more than one key
        is provided, the average of the vectors is used.

        keys (unicode / int / iterable): The string or integer key(s) to
            compare to.
        n (int): The number of similar keys to return.
        batch_size (int): The batch size to use.
        RETURNS (list): The (key, score) tuples of the most similar vectors.
        """
        if isinstance(keys, (str, int)):
            keys = [keys]
        for key in keys:
            if key not in self:
                raise ValueError(f"Can't find key {key} in table")
        # The cache holds neighbours per table row, so it can only answer
        # single-key queries. `key` is the one query key left over from the
        # validation loop above.
        if len(keys) == 1 and self.cache and self.cache["indices"].shape[1] >= n:
            n_cached = min(len(self.vectors), n)
            int_key = self.ensure_int_key(key)
            key_row = self.vectors.find(key=int_key)
            if key_row < self.cache["indices"].shape[0]:
                rows = self.cache["indices"][key_row, :n_cached]
                scores = self.cache["scores"][key_row, :n_cached]
                entries = zip(rows, scores)
                entries = [(self.strings[self.row2key[r]], score)
                           for r, score in entries if r in self.row2key]
                return entries
        # Always ask for more because we'll always get the keys themselves
        n = min(len(self.vectors), n + len(keys))
        rows = numpy.asarray(self.vectors.find(keys=keys))
        vecs = self.vectors.data[rows]
        average = vecs.mean(axis=0, keepdims=True)
        result_keys, _, scores = self.vectors.most_similar(
            average, n=n, batch_size=batch_size)
        # Compare on integer keys so the query keys are dropped from the
        # results whether they were given as strings or as integers.
        query_ids = {self.ensure_int_key(k) for k in keys}
        result = [
            (self.strings[res_key], score)
            for res_key, score in zip(result_keys.flatten(), scores.flatten())
            if res_key and int(res_key) not in query_ids
        ]
        return result

    def get_other_senses(self, key: Union[str, int],
                         ignore_case: bool = True) -> List[str]:
        """Find other entries for the same word with a different sense, e.g.
        "duck|VERB" for "duck|NOUN".

        key (unicode / int): The key to check.
        ignore_case (bool): Check for uppercase, lowercase and titlecase.
        RETURNS (list): The string keys of other entries with different
            senses.
        """
        result = []
        key = key if isinstance(key, str) else self.strings[key]
        word, orig_sense = self.split_key(key)
        versions = [word, word.upper(), word.title()] if ignore_case else [word]
        for text in versions:
            for sense in self.senses:
                new_key = self.make_key(text, sense)
                if sense != orig_sense and new_key in self:
                    result.append(new_key)
        return result

    def get_best_sense(self, word: str, senses: Sequence[str] = tuple(),
                       ignore_case: bool = True) -> Union[str, None]:
        """Find the best-matching sense for a given word based on the
        available senses and frequency counts. Returns None if no match is
        found.

        word (unicode): The word to check.
        senses (list): Optional list of senses to limit the search to. If
            not set / empty, all senses in the vectors are used.
        ignore_case (bool): Check for uppercase, lowercase and titlecase.
        RETURNS (unicode): The best-matching key or None.
        """
        sense_options = senses or self.senses
        if not sense_options:
            return None
        versions = [word, word.upper(), word.title()] if ignore_case else [word]
        freqs = []
        for text in versions:
            for sense in sense_options:
                key = self.make_key(text, sense)
                if key in self:
                    # Keys without a recorded frequency rank lowest.
                    freq = self.get_freq(key, -1)
                    freqs.append((freq, key))
        return max(freqs)[1] if freqs else None

    def to_bytes(self, exclude: Sequence[str] = tuple()) -> bytes:
        """Serialize a Sense2Vec object to a bytestring.

        exclude (list): Names of serialization fields to exclude.
        RETURNS (bytes): The serialized Sense2Vec object.
        """
        vectors_bytes = self.vectors.to_bytes()
        freqs = list(self.freqs.items())
        data = {"vectors": vectors_bytes, "cfg": self.cfg, "freqs": freqs}
        if "strings" not in exclude:
            data["strings"] = self.strings.to_bytes()
        if "cache" not in exclude:
            data["cache"] = self.cache
        return srsly.msgpack_dumps(data)

    def from_bytes(self, bytes_data: bytes, exclude: Sequence[str] = tuple()):
        """Load a Sense2Vec object from a bytestring.

        bytes_data (bytes): The data to load.
        exclude (list): Names of serialization fields to exclude.
        RETURNS (Sense2Vec): The loaded object.
        """
        data = srsly.msgpack_loads(bytes_data)
        self.vectors = Vectors().from_bytes(data["vectors"])
        self.freqs = dict(data.get("freqs", []))
        self.cfg.update(data.get("cfg", {}))
        if "strings" not in exclude and "strings" in data:
            self.strings = StringStore().from_bytes(data["strings"])
        if "cache" not in exclude and "cache" in data:
            self.cache = data.get("cache", {})
        # The vectors were replaced, so the row mapping is stale.
        self._row2key = None
        return self

    def to_disk(self, path: Union[Path, str], exclude: Sequence[str] = tuple()):
        """Serialize a Sense2Vec object to a directory.

        path (unicode / Path): The path.
        exclude (list): Names of serialization fields to exclude.
        """
        path = Path(path)
        self.vectors.to_disk(path)
        srsly.write_json(path / "cfg", self.cfg)
        srsly.write_json(path / "freqs.json", list(self.freqs.items()))
        if "strings" not in exclude:
            self.strings.to_disk(path / "strings.json")
        if "cache" not in exclude and self.cache:
            srsly.write_msgpack(path / "cache", self.cache)

    def from_disk(self, path: Union[Path, str], exclude: Sequence[str] = tuple()):
        """Load a Sense2Vec object from a directory.

        path (unicode / Path): The path to load from.
        exclude (list): Names of serialization fields to exclude.
        RETURNS (Sense2Vec): The loaded object.
        """
        path = Path(path)
        strings_path = path / "strings.json"
        freqs_path = path / "freqs.json"
        cache_path = path / "cache"
        self.vectors = Vectors().from_disk(path)
        self.cfg.update(srsly.read_json(path / "cfg"))
        if freqs_path.exists():
            self.freqs = dict(srsly.read_json(freqs_path))
        if "strings" not in exclude and strings_path.exists():
            self.strings = StringStore().from_disk(strings_path)
        if "cache" not in exclude and cache_path.exists():
            self.cache = srsly.read_msgpack(cache_path)
        # The vectors were replaced, so the row mapping is stale.
        self._row2key = None
        return self
def build_doc_term_matrix(terms_lists, weighting='tf', normalize=False,
                          sublinear_tf=False, smooth_idf=True,
                          min_df=1, max_df=1.0, min_ic=0.0, max_n_terms=None):
    """
    Build a document-term matrix of shape (# docs, # unique terms) from a
    sequence of documents, each represented as a sequence of (str) terms,
    with a variety of weighting and normalization schemes for the matrix
    values.

    Args:
        terms_lists (iterable(iterable(str))): a sequence of documents, each
            as a sequence of (str) terms; note that the terms in each doc
            are to be counted, so these probably shouldn't be sets
            containing *unique* terms; example inputs::

                >>> ([tok.lemma_ for tok in spacy_doc]
                ...  for spacy_doc in spacy_docs)
                >>> ((ne.text for ne in doc.named_entities())
                ...  for doc in textcorpus)
                >>> (tuple(ng.text for ng in
                            itertools.chain.from_iterable(doc.ngrams(i) for i in range(1, 3)))
                ...  for doc in docs)

        weighting (str {'tf', 'tfidf', 'binary'}, optional): if 'tf', matrix
            values (i, j) correspond to the number of occurrences of term j
            in doc i; if 'tfidf', term frequencies (tf) are multiplied by
            their corresponding inverse document frequencies (idf); if
            'binary', all non-zero values are set equal to 1
        normalize (bool, optional): if True, normalize term frequencies by
            the L2 norms of the vectors
        sublinear_tf (bool, optional): if True, apply sub-linear
            term-frequency scaling, i.e. tf => 1 + log(tf)
        smooth_idf (bool, optional): if True, add 1 to all document
            frequencies, equivalent to adding a single document to the
            corpus containing every unique term
        min_df (float or int, optional): if float, value is the fractional
            proportion of the total number of documents, which must be in
            [0.0, 1.0]; if int, value is the absolute number; filter terms
            whose document frequency is less than ``min_df``
        max_df (float or int, optional): if float, value is the fractional
            proportion of the total number of documents, which must be in
            [0.0, 1.0]; if int, value is the absolute number; filter terms
            whose document frequency is greater than ``max_df``
        min_ic (float, optional): filter terms whose information content is
            less than `min_ic`; value must be in [0.0, 1.0]
        max_n_terms (int, optional): only include terms whose document
            frequency is within the top ``max_n_terms``

    Returns:
        :class:`scipy.sparse.csr_matrix <scipy.sparse.csr_matrix>`: sparse
            matrix of shape (# docs, # unique terms), where value (i, j) is
            the weight of term j in doc i
        dict: id to term mapping, where keys are unique integers as term ids
            and values are corresponding strings
    """
    stringstore = StringStore()
    data, rows, cols = [], [], []
    for row_idx, terms_list in enumerate(terms_lists):
        term_counts = collections.Counter(terms_list)
        for term, count in term_counts.items():
            if not term:
                continue
            # an empty string always occupies index 0 in the stringstore,
            # which causes an empty first col in the doc-term matrix that we
            # don't want; so, we subtract 1 from the stringstore's id
            cols.append(stringstore[term] - 1)
            data.append(count)
            rows.append(row_idx)

    doc_term_matrix = sp.coo_matrix(
        (data, (rows, cols)), dtype=int).tocsr()

    # ignore the 0-index empty string in stringstore, as above
    id_to_term = {}
    for store_id, term in enumerate(stringstore):
        if store_id == 0:
            continue
        id_to_term[store_id - 1] = term

    # filter terms by document frequency or information content?
    filter_by_df = max_df != 1.0 or min_df != 1 or max_n_terms is not None
    if filter_by_df:
        doc_term_matrix, id_to_term = filter_terms_by_df(
            doc_term_matrix, id_to_term,
            max_df=max_df, min_df=min_df, max_n_terms=max_n_terms)
    if min_ic != 0.0:
        doc_term_matrix, id_to_term = filter_terms_by_ic(
            doc_term_matrix, id_to_term,
            min_ic=min_ic, max_n_terms=max_n_terms)

    if weighting != 'binary':
        if sublinear_tf is True:
            doc_term_matrix = doc_term_matrix.astype(np.float64)
            # in-place: tf => 1 + log(tf) on the stored (non-zero) values
            np.log(doc_term_matrix.data, out=doc_term_matrix.data)
            doc_term_matrix.data += 1
        if weighting == 'tfidf':
            doc_term_matrix = apply_idf_weighting(
                doc_term_matrix, smooth_idf=smooth_idf)
    else:
        doc_term_matrix = binarize_mat(
            doc_term_matrix, threshold=0.0, copy=False)

    if normalize is True:
        doc_term_matrix = normalize_mat(
            doc_term_matrix, norm='l2', axis=1, copy=False)

    return (doc_term_matrix, id_to_term)
def morphology():
    """Return a Morphology with an empty store, tag map and lookups."""
    strings = StringStore()
    return Morphology(strings, {}, Lemmatizer(Lookups()))
def sstore():
    """Return a new, empty StringStore."""
    store = StringStore()
    return store
def test_stringstore_to_bytes(stringstore, text):
    """A string added before serialization resolves in the restored store."""
    key = stringstore.add(text)
    restored = StringStore().from_bytes(stringstore.to_bytes())
    assert restored[key] == text
def stringstore():
    """Return a new, empty StringStore."""
    store = StringStore()
    return store