def docs_to_gensim(spacy_docs, spacy_vocab, lemmatize=True, filter_stops=True,
                   filter_punct=True, filter_nums=False):
    """
    Convert multiple ``spacy.Doc`` s into a gensim dictionary and
    bag-of-words corpus.

    Args:
        spacy_docs (list(``spacy.Doc``))
        spacy_vocab (``spacy.Vocab``)
        lemmatize (bool): if True, use lemmatized strings for words; otherwise,
            use the original form of the string as it appears in ``doc``
        filter_stops (bool): if True, remove stop words from word list
        filter_punct (bool): if True, remove punctuation from word list
        filter_nums (bool): if True, remove numbers from word list

    Returns:
        :class:`gensim.Dictionary <gensim.corpora.dictionary.Dictionary>`:
            integer word ID to word string mapping
        list(list((int, int))): list of bag-of-words documents, where each doc
            is a list of (integer word ID, word count) 2-tuples
    """
    gdict = Dictionary()
    gcorpus = []
    stringstore = StringStore()
    doc_freqs = Counter()

    # select the count attribute once instead of duplicating the whole
    # generator expression in two branches
    count_attr = attrs.LEMMA if lemmatize else attrs.ORTH

    for spacy_doc in spacy_docs:
        bow = ((spacy_vocab[tok_id], count)
               for tok_id, count in spacy_doc.count_by(count_attr).items())
        if filter_stops:
            bow = ((lex, count) for lex, count in bow if not lex.is_stop)
        if filter_punct:
            bow = ((lex, count) for lex, count in bow if not lex.is_punct)
        if filter_nums:
            bow = ((lex, count) for lex, count in bow if not lex.like_num)
        # map lexeme strings through a shared StringStore so word IDs are
        # consistent across all docs, then sort by word ID as gensim expects
        bow = sorted(((stringstore[lex.orth_], count) for lex, count in bow),
                     key=itemgetter(0))

        # each word ID occurs at most once per doc here, so this accumulates
        # document frequencies, not term frequencies
        doc_freqs.update(tok_id for tok_id, _ in bow)
        gdict.num_docs += 1
        gdict.num_pos += sum(count for _, count in bow)
        gdict.num_nnz += len(bow)
        gcorpus.append(bow)

    # populate the gensim dictionary's internal mappings directly;
    # NOTE(review): assumes enumeration order of StringStore matches the
    # integer IDs it assigned above — true for the spaCy versions this
    # targets, but verify if spaCy is upgraded
    gdict.token2id = {s: i for i, s in enumerate(stringstore)}
    gdict.dfs = dict(doc_freqs)

    return (gdict, gcorpus)
def docs_to_gensim(spacy_docs, spacy_vocab, lemmatize=True, filter_stops=True,
                   filter_punct=True, filter_nums=False):
    """
    Build a gensim dictionary and bag-of-words corpus from a sequence of
    ``spacy.Doc`` s.

    Args:
        spacy_docs (list(``spacy.Doc``))
        spacy_vocab (``spacy.Vocab``)
        lemmatize (bool): if True, count lemmatized word forms; otherwise,
            count the surface forms as they appear in each doc
        filter_stops (bool): if True, drop stop words
        filter_punct (bool): if True, drop punctuation tokens
        filter_nums (bool): if True, drop number-like tokens

    Returns:
        :class:`gensim.Dictionary <gensim.corpora.dictionary.Dictionary>`:
            integer word ID to word string mapping
        list(list((int, int))): one bag-of-words per doc, each a list of
            (integer word ID, word count) 2-tuples
    """
    gdict = Dictionary()
    gcorpus = []
    stringstore = StringStore()
    doc_freqs = Counter()

    for spacy_doc in spacy_docs:
        # per-doc token counts, keyed by lemma or surface form
        if lemmatize is True:
            counts = spacy_doc.count_by(attrs.LEMMA)
        else:
            counts = spacy_doc.count_by(attrs.ORTH)
        pairs = [(spacy_vocab[tok_id], n) for tok_id, n in counts.items()]

        # apply the requested lexeme filters
        if filter_stops is True:
            pairs = [(lex, n) for lex, n in pairs if not lex.is_stop]
        if filter_punct is True:
            pairs = [(lex, n) for lex, n in pairs if not lex.is_punct]
        if filter_nums is True:
            pairs = [(lex, n) for lex, n in pairs if not lex.like_num]

        # intern strings in a shared store and sort by the resulting word ID
        doc_bow = sorted(((stringstore[lex.orth_], n) for lex, n in pairs),
                         key=itemgetter(0))

        doc_freqs.update(word_id for word_id, _ in doc_bow)
        gdict.num_docs += 1
        gdict.num_pos += sum(n for _, n in doc_bow)
        gdict.num_nnz += len(doc_bow)
        gcorpus.append(doc_bow)

    # fill in the gensim dictionary's internal mappings directly
    gdict.token2id = {s: i for i, s in enumerate(stringstore)}
    gdict.dfs = dict(doc_freqs)

    return (gdict, gcorpus)
def pytopia2gensimDict(dict_):
    '''
    Creates gensim dictionary from a pytopia dictionary.
    This is necessary since building of gensim models requires gensim
    dictionary but pytopia model builders must be able to receive generic
    pytopia Dictionary as parameter.
    '''
    dict_ = resolve(dict_)
    # (token, index) pairs, ordered by index
    pairs = sorted(((tok, dict_[tok]) for tok in dict_), key=itemgetter(1))
    # directly set gensim dict data structures,
    # this works for gensim 0.12.4
    gdict = GensimDict()
    gdict.token2id = dict(pairs)
    gdict.id2token = {i: tok for tok, i in pairs}
    gdict.dfs = dict.fromkeys((tok for tok, _ in pairs), 1)
    gdict.num_docs = 1           # number of documents processed
    gdict.num_pos = len(pairs)   # total number of corpus positions
    gdict.num_nnz = len(pairs)   # total number of non-zeroes in the BOW matrix
    return gdict