def strip_short(s, minsize=3):
    """Remove words shorter than `minsize` characters from `s`.

    Parameters
    ----------
    s : str
    minsize : int, optional

    Returns
    -------
    str
        Unicode string without short words.

    Examples
    --------
    >>> from samenvattr.parsing.preprocessing import strip_short
    >>> strip_short("salut les amis du 59")
    u'salut les amis'
    >>>
    >>> strip_short("one two three four five six seven eight nine ten", minsize=5)
    u'three seven eight'

    """
    s = utils.to_unicode(s)
    return " ".join(e for e in s.split() if len(e) >= minsize)
def strip_non_alphanum(s):
    """Remove non-alphanumeric characters from `s` using :const:`~samenvattr.parsing.preprocessing.RE_NONALPHA`.

    Parameters
    ----------
    s : str

    Returns
    -------
    str
        Unicode string with alphanumeric characters only.

    Notes
    -----
    Word characters - alphanumeric & underscore.

    Examples
    --------
    >>> from samenvattr.parsing.preprocessing import strip_non_alphanum
    >>> strip_non_alphanum("if-you#can%read$this&then@this#method^works")
    u'if you can read this then this method works'

    """
    s = utils.to_unicode(s)
    return RE_NONALPHA.sub(" ", s)
def load_from_text(fname):
    """
    Load a previously stored Dictionary from a text file.
    Mirror function to `save_as_text`.
    """
    result = Dictionary()
    with utils.smart_open(fname) as f:
        for lineno, line in enumerate(f):
            line = utils.to_unicode(line)
            if lineno == 0:
                if line.strip().isdigit():
                    # Older versions of save_as_text may not write num_docs on first line.
                    result.num_docs = int(line.strip())
                    continue
                else:
                    logging.warning("Text does not contain num_docs on the first line.")
            try:
                wordid, word, docfreq = line[:-1].split('\t')
            except Exception:
                raise ValueError("invalid line in dictionary file %s: %s" % (fname, line.strip()))
            wordid = int(wordid)
            if word in result.token2id:
                raise KeyError(
                    'token %s is defined as ID %d and as ID %d' % (word, wordid, result.token2id[word]))
            result.token2id[word] = wordid
            result.dfs[wordid] = int(docfreq)
    return result
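# Illustrative round-trip sketch, not part of the library. Assumptions: `load_from_text`
# and the mirror `save_as_text` are exposed on the Dictionary class, and the import path
# below matches the project layout (neither is confirmed by this file). The text format
# is one "<word_id>\t<word>\t<doc_freq>" entry per line, optionally preceded by a first
# line holding num_docs.
from samenvattr.corpora.dictionary import Dictionary  # assumed import path

dictionary = Dictionary()
dictionary.token2id = {u'human': 0, u'interface': 1}
dictionary.dfs = {0: 2, 1: 1}
dictionary.num_docs = 2

dictionary.save_as_text('/tmp/example.dict.txt')
loaded = Dictionary.load_from_text('/tmp/example.dict.txt')
assert loaded.token2id == dictionary.token2id
assert loaded.dfs == dictionary.dfs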
def test_keywords_runs(self):
    text = self._get_text_from_test_data("mihalcea_tarau.txt")

    kwds = keywords(text)
    self.assertTrue(len(kwds.splitlines()))

    kwds_u = keywords(utils.to_unicode(text))
    self.assertTrue(len(kwds_u.splitlines()))

    kwds_lst = keywords(text, split=True)
    self.assertTrue(len(kwds_lst))
def test_repeated_keywords(self):
    text = self._get_text_from_test_data("testrepeatedkeywords.txt")

    kwds = keywords(text)
    self.assertTrue(len(kwds.splitlines()))

    kwds_u = keywords(utils.to_unicode(text))
    self.assertTrue(len(kwds_u.splitlines()))

    kwds_lst = keywords(text, split=True)
    self.assertTrue(len(kwds_lst))
def __iter__(self):
    """
    Iteratively yield vectors from the underlying file, in the format
    (row_no, vector), where vector is a list of (col_no, value) 2-tuples.

    Note that the total number of vectors returned is always equal to the number
    of rows specified in the header; empty documents are inserted and yielded
    where appropriate, even if they are not explicitly stored in the Matrix
    Market file.
    """
    with utils.file_or_filename(self.input) as lines:
        self.skip_headers(lines)

        previd = -1
        for line in lines:
            docid, termid, val = utils.to_unicode(line).split()  # needed for python3
            if not self.transposed:
                termid, docid = docid, termid
            # -1 because matrix market indexes are 1-based => convert to 0-based
            docid, termid, val = int(docid) - 1, int(termid) - 1, float(val)
            assert previd <= docid, "matrix columns must come in ascending order"
            if docid != previd:
                # change of document: return the document read so far (its id is prevId)
                if previd >= 0:
                    yield previd, document  # noqa:F821

                # return implicit (empty) documents between previous id and new id
                # too, to keep consistent document numbering and corpus length
                for previd in range(previd + 1, docid):
                    yield previd, []

                # from now on start adding fields to a new document, with a new id
                previd = docid
                document = []

            document.append((termid, val,))  # add another field to the current document

    # handle the last document, as a special case
    if previd >= 0:
        yield previd, document

    # return empty documents between the last explicit document and the number
    # of documents as specified in the header
    for previd in range(previd + 1, self.num_docs):
        yield previd, []
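# Illustrative sketch of the streaming format described above. Assumptions: the enclosing
# reader class is called MmReader and the import path below matches the project layout.
# A tiny coordinate Matrix Market file is written and read back as
# (row_no, [(col_no, value), ...]) pairs; note that row 1, which stores no entries, is
# still yielded as an empty document so the corpus length matches the header.
import tempfile

from samenvattr.corpora import MmReader  # assumed class name and import path

mm_text = (
    "%%MatrixMarket matrix coordinate real general\n"
    "3 2 3\n"
    "1 1 0.5\n"
    "1 2 2.0\n"
    "3 1 1.0\n"
)
with tempfile.NamedTemporaryFile('w', suffix='.mm', delete=False) as tmp:
    tmp.write(mm_text)
    mm_path = tmp.name

for docno, doc in MmReader(mm_path):
    print(docno, doc)
# expected output:
# 0 [(0, 0.5), (1, 2.0)]
# 1 []
# 2 [(0, 1.0)]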
def __init__(self, input, transposed=True):
    """
    Initialize the matrix reader.

    The `input` refers to a file on local filesystem, which is expected to be
    in the sparse (coordinate) Matrix Market format. Documents are assumed to
    be rows of the matrix (and document features are columns).

    `input` is either a string (file path) or a file-like object that supports
    `seek()` (e.g. gzip.GzipFile, bz2.BZ2File).

    File-like objects are not closed automatically.
    """
    logger.info("initializing corpus reader from %s", input)
    self.input, self.transposed = input, transposed
    with utils.file_or_filename(self.input) as lines:
        try:
            header = utils.to_unicode(next(lines)).strip()
            if not header.lower().startswith('%%matrixmarket matrix coordinate real general'):
                raise ValueError(
                    "File %s not in Matrix Market format with coordinate real general; instead found: \n%s" %
                    (self.input, header)
                )
        except StopIteration:
            pass

        self.num_docs = self.num_terms = self.num_nnz = 0
        for lineno, line in enumerate(lines):
            line = utils.to_unicode(line)
            if not line.startswith('%'):
                self.num_docs, self.num_terms, self.num_nnz = (int(x) for x in line.split())
                if not self.transposed:
                    self.num_docs, self.num_terms = self.num_terms, self.num_docs
                break

    logger.info(
        "accepted corpus with %i documents, %i features, %i non-zero entries",
        self.num_docs, self.num_terms, self.num_nnz
    )
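# Continuing the sketch above (same assumed MmReader class and `mm_path` file): the
# constructor parses only the header and the size line, so the corpus dimensions are
# available immediately. With `transposed=False` the same file is interpreted as
# term-by-document, and the header dimensions are swapped accordingly.
reader = MmReader(mm_path)
print(reader.num_docs, reader.num_terms, reader.num_nnz)   # 3 2 3

reader_t = MmReader(mm_path, transposed=False)
print(reader_t.num_docs, reader_t.num_terms)               # 2 3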
def from_corpus(corpus, id2word=None):
    """
    Create Dictionary from an existing corpus. This can be useful if you only
    have a term-document BOW matrix (represented by `corpus`), but not the
    original text corpus.

    This will scan the term-document count matrix for all word ids that appear
    in it, then construct and return a Dictionary which maps each
    `word_id -> id2word[word_id]`.

    `id2word` is an optional dictionary that maps the `word_id` to a token.
    In case `id2word` isn't specified the mapping `id2word[word_id] = str(word_id)`
    will be used.
    """
    result = Dictionary()
    max_id = -1
    for docno, document in enumerate(corpus):
        if docno % 10000 == 0:
            logger.info("adding document #%i to %s", docno, result)
        result.num_docs += 1
        result.num_nnz += len(document)
        for wordid, word_freq in document:
            max_id = max(wordid, max_id)
            result.num_pos += word_freq
            result.dfs[wordid] = result.dfs.get(wordid, 0) + 1

    if id2word is None:
        # make sure length(result) == get_max_id(corpus) + 1
        result.token2id = {str(i): i for i in range(max_id + 1)}
    else:
        # id=>word mapping given: simply copy it
        result.token2id = {utils.to_unicode(token): idx for idx, token in id2word.items()}
    for idx in result.token2id.values():
        # make sure all token ids have a valid `dfs` entry
        result.dfs[idx] = result.dfs.get(idx, 0)

    logger.info("built %s from %i documents (total %i corpus positions)", result, result.num_docs, result.num_pos)
    return result
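# Illustrative sketch: rebuilding a Dictionary from a bag-of-words corpus when the
# original texts are no longer available. Assumptions: `from_corpus` is exposed as a
# static method on Dictionary (as its self-less signature suggests), the import path
# matches the project layout, and the corpus/id2word data below are made up.
from samenvattr.corpora.dictionary import Dictionary  # assumed import path

bow_corpus = [
    [(0, 2), (1, 1)],   # document 0: word 0 twice, word 1 once
    [(1, 1), (2, 3)],   # document 1: word 1 once, word 2 three times
]
id2word = {0: u'human', 1: u'interface', 2: u'computer'}

dictionary = Dictionary.from_corpus(bow_corpus, id2word=id2word)
print(dictionary.token2id)   # {u'human': 0, u'interface': 1, u'computer': 2}
print(dictionary.dfs)        # document frequencies: {0: 1, 1: 2, 2: 1}
print(dictionary.num_docs, dictionary.num_pos)   # 2 documents, 7 corpus positions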
def remove_stopwords(s):
    """Remove :const:`~samenvattr.parsing.preprocessing.STOPWORDS` from `s`.

    Parameters
    ----------
    s : str

    Returns
    -------
    str
        Unicode string without :const:`~samenvattr.parsing.preprocessing.STOPWORDS`.

    Examples
    --------
    >>> from samenvattr.parsing.preprocessing import remove_stopwords
    >>> remove_stopwords("Better late than never, but better never late.")
    u'Better late never, better late.'

    """
    s = utils.to_unicode(s)
    return " ".join(w for w in s.split() if w not in STOPWORDS)
def strip_tags(s):
    """Remove tags from `s` using :const:`~samenvattr.parsing.preprocessing.RE_TAGS`.

    Parameters
    ----------
    s : str

    Returns
    -------
    str
        Unicode string without tags.

    Examples
    --------
    >>> from samenvattr.parsing.preprocessing import strip_tags
    >>> strip_tags("<i>Hello</i> <b>World</b>!")
    u'Hello World!'

    """
    s = utils.to_unicode(s)
    return RE_TAGS.sub("", s)
def strip_punctuation(s):
    """Replace punctuation characters with spaces in `s` using :const:`~samenvattr.parsing.preprocessing.RE_PUNCT`.

    Parameters
    ----------
    s : str

    Returns
    -------
    str
        Unicode string without punctuation characters.

    Examples
    --------
    >>> from samenvattr.parsing.preprocessing import strip_punctuation
    >>> strip_punctuation("A semicolon is a stronger break than a comma, but not as much as a full stop!")
    u'A semicolon is a stronger break than a comma but not as much as a full stop '

    """
    s = utils.to_unicode(s)
    return RE_PUNCT.sub(" ", s)
def strip_numeric(s):
    """Remove digits from `s` using :const:`~samenvattr.parsing.preprocessing.RE_NUMERIC`.

    Parameters
    ----------
    s : str

    Returns
    -------
    str
        Unicode string without digits.

    Examples
    --------
    >>> from samenvattr.parsing.preprocessing import strip_numeric
    >>> strip_numeric("0text24samenvattr365test")
    u'textsamenvattrtest'

    """
    s = utils.to_unicode(s)
    return RE_NUMERIC.sub("", s)
def preprocess_string(s, filters=DEFAULT_FILTERS):
    """Apply list of chosen filters to `s`.

    Default list of filters:

    * :func:`~samenvattr.parsing.preprocessing.strip_tags`,
    * :func:`~samenvattr.parsing.preprocessing.strip_punctuation`,
    * :func:`~samenvattr.parsing.preprocessing.strip_multiple_whitespaces`,
    * :func:`~samenvattr.parsing.preprocessing.strip_numeric`,
    * :func:`~samenvattr.parsing.preprocessing.remove_stopwords`,
    * :func:`~samenvattr.parsing.preprocessing.strip_short`,
    * :func:`~samenvattr.parsing.preprocessing.stem_text`.

    Parameters
    ----------
    s : str
    filters: list of functions, optional

    Returns
    -------
    list of str
        Processed strings (cleaned).

    Examples
    --------
    >>> from samenvattr.parsing.preprocessing import preprocess_string
    >>> preprocess_string("<i>Hel 9lo</i> <b>Wo9 rld</b>! Th3 weather_is really g00d today, isn't it?")
    [u'hel', u'rld', u'weather', u'todai', u'isn']
    >>>
    >>> s = "<i>Hel 9lo</i> <b>Wo9 rld</b>! Th3 weather_is really g00d today, isn't it?"
    >>> CUSTOM_FILTERS = [lambda x: x.lower(), strip_tags, strip_punctuation]
    >>> preprocess_string(s, CUSTOM_FILTERS)
    [u'hel', u'9lo', u'wo9', u'rld', u'th3', u'weather', u'is', u'really', u'g00d', u'today', u'isn', u't', u'it']

    """
    s = utils.to_unicode(s)
    for f in filters:
        s = f(s)
    return s.split()
def stem_text(text):
    """Transform `text` into lowercase and stem it.

    Parameters
    ----------
    text : str

    Returns
    -------
    str
        Unicode lowercased and porter-stemmed version of string `text`.

    Examples
    --------
    >>> from samenvattr.parsing.preprocessing import stem_text
    >>> stem_text("While it is quite useful to be able to search a large collection of documents almost instantly.")
    u'while it is quit us to be abl to search a larg collect of document almost instantly.'

    """
    text = utils.to_unicode(text)
    p = PorterStemmer()
    return ' '.join(p.stem(word) for word in text.split())
def split_alphanum(s):
    """Add spaces between digits & letters in `s` using :const:`~samenvattr.parsing.preprocessing.RE_AL_NUM`.

    Parameters
    ----------
    s : str

    Returns
    -------
    str
        Unicode string with spaces between digits & letters.

    Examples
    --------
    >>> from samenvattr.parsing.preprocessing import split_alphanum
    >>> split_alphanum("24.0hours7 days365 a1b2c3")
    u'24.0 hours 7 days 365 a 1 b 2 c 3'

    """
    s = utils.to_unicode(s)
    s = RE_AL_NUM.sub(r"\1 \2", s)
    return RE_NUM_AL.sub(r"\1 \2", s)
def strip_multiple_whitespaces(s):
    r"""Collapse repeating whitespace characters (spaces, tabs, line breaks) in `s`
    and turn tabs & line breaks into spaces using :const:`~samenvattr.parsing.preprocessing.RE_WHITESPACE`.

    Parameters
    ----------
    s : str

    Returns
    -------
    str
        Unicode string without consecutive repeated whitespace characters.

    Examples
    --------
    >>> from samenvattr.parsing.preprocessing import strip_multiple_whitespaces
    >>> strip_multiple_whitespaces("salut" + '\r' + " les" + '\n' + " loulous!")
    u'salut les loulous!'

    """
    s = utils.to_unicode(s)
    return RE_WHITESPACE.sub(" ", s)
def test_mz_keywords(self):
    pre_path = os.path.join(os.path.dirname(__file__), 'test_data')

    with utils.smart_open(os.path.join(pre_path, "head500.noblanks.cor")) as f:
        text = utils.to_unicode(f.read())
    text = u' '.join(text.split()[:10240])

    kwds = mz_keywords(text)
    self.assertTrue(kwds.startswith('autism'))
    self.assertTrue(kwds.endswith('uk'))
    self.assertTrue(len(kwds.splitlines()))

    kwds_lst = mz_keywords(text, split=True)
    self.assertTrue(len(kwds_lst))

    # Automatic thresholding selects words with n_blocks / (n_blocks + 1)
    # bits of entropy. For this text, n_blocks = 10.
    n_blocks = 10.
    kwds_auto = mz_keywords(text, scores=True, weighted=False, threshold='auto')
    self.assertTrue(kwds_auto[-1][1] > (n_blocks / (n_blocks + 1.)))
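# Worked example of the 'auto' threshold used in the assertion above: with the default
# blocksize of 1024 and a text truncated to 10240 words, the word-count matrix has
# 10 rows, so the automatic threshold evaluates to n_blocks / (n_blocks + 1) plus a
# small 1e-8 guard, and every returned keyword must score strictly above it.
n_blocks = 10
auto_threshold = n_blocks / (n_blocks + 1.0) + 1.0e-8
print(auto_threshold)   # ~0.9090909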
def keywords(text, ratio=0.2, words=None, split=False, scores=False, pos_filter=('NN', 'JJ'),
             lemmatize=False, deacc=True):
    """Get the highest-ranked words of the provided text and their combinations (keyphrases).

    Parameters
    ----------
    text : str
        Input text.
    ratio : float, optional
        If `words` is not set, the number of returned keywords is this fraction of the
        distinct words considered; otherwise the ratio is ignored.
    words : int, optional
        Number of returned words.
    split : bool, optional
        If True, return the keywords as a list instead of a newline-joined string.
    scores : bool, optional
        If True, return each keyword together with its score.
    pos_filter : tuple, optional
        Part of speech filters.
    lemmatize : bool, optional
        If True - lemmatize words.
    deacc : bool, optional
        If True - remove accentuation.

    Returns
    -------
    result: list of (str, float)
        If `scores`, keywords with scores **OR**
    result: list of str
        If `split`, keywords only **OR**
    result: str
        Keywords, joined by newline.

    """
    # Gets a dict of word -> lemma
    text = to_unicode(text)
    tokens = _clean_text_by_word(text, deacc=deacc)
    split_text = list(_tokenize_by_word(text))

    # Creates the graph and adds the edges
    graph = _build_graph(_get_words_for_graph(tokens, pos_filter))
    _set_graph_edges(graph, tokens, split_text)
    del split_text  # It's no longer used

    _remove_unreachable_nodes(graph)

    # Ranks the tokens using the PageRank algorithm. Returns dict of lemma -> score
    pagerank_scores = _pagerank(graph)

    extracted_lemmas = _extract_tokens(graph.nodes(), pagerank_scores, ratio, words)

    # The results can be polluted by many variations of the same word
    if lemmatize:
        lemmas_to_word = {}
        for word, unit in iteritems(tokens):
            lemmas_to_word[unit.token] = [word]
    else:
        lemmas_to_word = _lemmas_to_words(tokens)

    keywords = _get_keywords_with_score(extracted_lemmas, lemmas_to_word)

    # text.split() to keep numbers and punctuation marks, so separated concepts are not combined
    combined_keywords = _get_combined_keywords(keywords, text.split())

    return _format_results(keywords, combined_keywords, split, scores)
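# Illustrative usage sketch for the `keywords` function defined above. The file name is
# hypothetical (the tests earlier read a similarly sized document, mihalcea_tarau.txt,
# from their test_data directory); graph-based keyword extraction works best on texts of
# at least a few hundred words.
with open('mihalcea_tarau.txt') as f:   # any reasonably long plain-text document
    sample_text = f.read()

print(keywords(sample_text, ratio=0.1))              # newline-joined keywords
print(keywords(sample_text, words=5, split=True))    # top 5 keywords as a list
print(keywords(sample_text, words=5, scores=True))   # [(keyword, score), ...]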
def mz_keywords(text, blocksize=1024, scores=False, split=False, weighted=True, threshold=0.0):
    """Extract keywords from text using the Montemurro and Zanette entropy algorithm. [1]_

    Parameters
    ----------
    text: str
        Document for summarization.
    blocksize: int, optional
        Size of blocks to use in analysis.
    scores: bool, optional
        Whether to return score with keywords.
    split: bool, optional
        Whether to return results as list.
    weighted: bool, optional
        Whether to weight scores by word frequency.
        False can be useful for shorter texts, and allows automatic thresholding.
    threshold: float or 'auto', optional
        Minimum score for returned keywords, 'auto' calculates the threshold as
        n_blocks / (n_blocks + 1.0) + 1e-8, use 'auto' with `weighted=False`.

    Returns
    -------
    results: str
        newline separated keywords if `split` == False **OR**
    results: list(str)
        list of keywords if `scores` == False **OR**
    results: list(tuple(str, float))
        list of (keyword, score) tuples if `scores` == True

    Results are returned in descending order of score regardless of the format.

    Note
    ----
    This algorithm looks for keywords that contribute to the structure of the text
    on scales of `blocksize` words or larger. It is suitable for extracting keywords
    representing the major themes of long texts.

    References
    ----------
    .. [1] Marcello A Montemurro, Damian Zanette, "Towards the quantification of the semantic information encoded
           in written language". Advances in Complex Systems, Volume 13, Issue 2 (2010), pp. 135-153,
           DOI: 10.1142/S0219525910002530, https://arxiv.org/abs/0907.1558

    """
    text = to_unicode(text)
    words = [word for word in _tokenize_by_word(text)]
    vocab = sorted(set(words))
    word_counts = numpy.array(
        [
            [words[i:i + blocksize].count(word) for word in vocab]
            for i in range(0, len(words), blocksize)
        ]
    ).astype('d')
    n_blocks = word_counts.shape[0]
    totals = word_counts.sum(axis=0)
    n_words = totals.sum()
    p = word_counts / totals
    log_p = numpy.log2(p)
    h = numpy.nan_to_num(p * log_p).sum(axis=0)
    analytic = __analytic_entropy(blocksize, n_blocks, n_words)
    h += analytic(totals).astype('d')
    if weighted:
        h *= totals / n_words
    if threshold == 'auto':
        threshold = n_blocks / (n_blocks + 1.0) + 1.0e-8
    weights = [(word, score) for (word, score) in zip(vocab, h) if score > threshold]
    weights.sort(key=lambda x: -x[1])
    result = weights if scores else [word for (word, score) in weights]
    if not (scores or split):
        result = '\n'.join(result)
    return result
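# Illustrative usage sketch for `mz_keywords` above. The file name is hypothetical (the
# test earlier reads head500.noblanks.cor from its test_data directory). The algorithm
# needs several blocks' worth of text, i.e. many times `blocksize` words, to give stable
# entropy estimates.
with open('head500.noblanks.cor') as f:   # any long plain-text document
    long_text = f.read()

# Default: frequency-weighted scores, returned as a newline-joined string of keywords.
print(mz_keywords(long_text))

# Unweighted scores with automatic thresholding, returned as (keyword, score) pairs.
for word, score in mz_keywords(long_text, scores=True, weighted=False, threshold='auto')[:10]:
    print(word, score)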