def test_japanese():
    reader = get_reader("simplenlp.ja")
    connections = list(reader.extract_connections(
        u"私は日本語をあんまり出来ません。困りましたね。"))
    pos_doc_terms = [term2 for weight, term1, term2 in connections
                     if term1 == DOCUMENT and weight > 0]
    neg_doc_terms = [term2 for weight, term1, term2 in connections
                     if term1 == DOCUMENT and weight < 0]
    assert u"出来る" in neg_doc_terms
    assert u"日本語" in neg_doc_terms
    assert u"日本語 出来る" in neg_doc_terms
    assert u"困る" in pos_doc_terms
def get_document_connections(self, docid):
    """
    Given a previously added document, get the list of connections
    produced from it.
    """
    if docid in self.connections_cache:
        connections = self.connections_cache[docid]
    else:
        doc = self.database.get_document(docid)
        reader = get_reader(doc.reader)
        connections = list(reader.extract_connections(doc.text))
    return connections
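# A minimal sketch (not part of the library) of consuming the connection
# list: each connection is a (weight, term1, term2) triple, and entries
# whose term1 is the DOCUMENT sentinel carry document-level term weights.
# `model` and `docid` are hypothetical.
def _example_list_document_terms(model, docid):
    doc_terms = []
    for weight, term1, term2 in model.get_document_connections(docid):
        if term1 == DOCUMENT:
            doc_terms.append((term2, weight))
    return doc_terms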
def vector_from_text(self, text, reader_name=None):
    """
    Get a category vector in this model representing the given text,
    with TF-IDF applied.
    """
    if reader_name is None:
        reader_name = self.config['reader']
    reader = get_reader(reader_name)
    terms = []
    for weight, term1, term2 in reader.extract_connections(text):
        if term1 == DOCUMENT:
            terms.append((term2, weight))
    return self.vector_from_terms(terms)
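# A minimal usage sketch for vector_from_text (illustrative, not part of
# the library): `model` is assumed to be an already-built instance of this
# class, and the example texts are hypothetical. When reader_name is
# omitted, the model's configured default reader is used.
def _example_vectors(model):
    vec_default = model.vector_from_text(u"I like cats and dogs.")
    vec_japanese = model.vector_from_text(u"私は日本語を勉強します。",
                                          reader_name="simplenlp.ja")
    return vec_default, vec_japanese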
def add_document(self, doc, reader_name=None):
    """
    Take in a document, pass it through the reader, and store its terms
    in the term database.

    The document should be expressed as a dictionary, containing at least
    these keys:

    - name: the unique identifier for the document
    - text: the plain text of the document, possibly including
      text-encoded tags
    - url: a unique identifier for the document, preferably one that
      actually locates it relative to the study

    Optionally, it may contain:

    - tags: (key, value) tuples representing tags
    """
    LOG.info("Reading document: %r" % doc['url'])
    if reader_name is None:
        reader_name = self.config['reader']
    reader = get_reader(reader_name)
    text = doc['text']
    tags = doc.get('tags', [])
    doc_terms = []
    connections = list(reader.extract_connections(text))
    self.connections_cache[doc['url']] = connections
    for weight, term1, term2 in connections:
        if term1 == DOCUMENT:
            if isinstance(term2, tuple) and term2[0] == TAG:
                tags.append(term2[1:])
            else:
                doc_terms.append((term2, weight))
                relevance = self.database.term_relevance(term2)
                self.index_term(term2, relevance)
    doc['reader'] = reader_name
    doc['terms'] = doc_terms
    doc['tags'] = tags
    self.database.add_document(doc)
    self.idf_cache = {}  # invalidate the cache of term IDFs
    return doc['url']
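# A minimal usage sketch for add_document (illustrative, not part of the
# library): `model` is assumed to be an already-constructed instance of
# this class, and the field values are hypothetical. Per the docstring,
# 'name', 'text', and 'url' are required; 'tags' is optional.
def _example_add_document(model):
    doc = {
        'name': 'doc-001',                          # unique identifier
        'url': 'study/documents/doc-001.txt',       # locates the document
        'text': u'The plain text of the document.',
        'tags': [('language', 'en')],               # optional (key, value) tags
    }
    return model.add_document(doc)  # returns doc['url']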
def add_batch(self, stream_func, study=None, learn_iterations=0):
    """
    Add a batch of documents from some source, a `stream_func` that
    when called returns an iterator over the documents.
    """
    fulltext_cache = {}
    self.connections_cache = {}

    # First pass: add documents to the term database, and meanwhile
    # collect full texts and tags.
    for doc in stream_func():
        docid = self.add_document(doc)
        reader = get_reader(doc['reader'])
        for term, fulltext in reader.extract_term_texts(doc['text']):
            fulltext_cache[term] = fulltext
        if study is not None:
            self.database.set_tag_on_document(docid, 'study', study)
    LOG.info("Committing documents to the database")
    self.database.commit()

    LOG.info("Collecting relevant terms")
    self.database.update_relevance()

    if learn_iterations:
        # Second pass (optional): find how much we should update the
        # ReconstructedMatrix entries based on the word associations
        # we discover.
        learn_accumulator = defaultdict(float)
        for doc in stream_func():
            for weight, term1, term2 \
                    in self.document_assoc_updates(doc['url']):
                if term1 in self.priority and term2 in self.priority:
                    learn_accumulator[(term1, term2)] += weight

        # Now actually apply those total updates. Multiple times, if asked.
        total = len(learn_accumulator)
        for iter in xrange(learn_iterations):
            LOG.info("Updating association matrix: pass %d" % (iter + 1))
            i = 0
            avg_err = 1.0
            for term1, term2 in learn_accumulator:
                i += 1
                if (i % 100) == 0:
                    LOG.info("Learned %d/%d; err=%4.4f"
                             % (i, total, avg_err))
                weight = learn_accumulator[(term1, term2)]
                err = self.learn_assoc(weight, term1, term2)
                avg_err = (.999 * avg_err) + (.001 * err)

    # Finally, update the full texts of the terms we saw.
    LOG.info("Updating full texts")
    for term, fulltext in fulltext_cache.items():
        self.database.set_term_text(term, fulltext)
    self.database.commit()

    # If this was a study, make a document matrix for it.
    if study is not None:
        LOG.info("Making document matrix for %r" % study)
        self.update_doc_matrix(study)
    LOG.info("Updating tag matrix")
    self.update_tag_matrix()
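# A sketch of a stream_func for add_batch (illustrative, not part of the
# library): a zero-argument callable that returns a fresh iterator over
# document dictionaries on each call, since add_batch iterates the stream
# a second time when learn_iterations > 0. The documents, study name, and
# `model` are hypothetical.
def _example_stream_func():
    docs = [
        {'name': 'a', 'url': 'study/a.txt', 'text': u'first document'},
        {'name': 'b', 'url': 'study/b.txt', 'text': u'second document'},
    ]
    return iter(docs)

# model.add_batch(_example_stream_func, study='my_study', learn_iterations=2)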
def test_simple():
    reader = get_reader("simplenlp.en")
    tokenized = list(reader.extract_tokens("one two three four"))
    assert_equal(tokenized, ["two", "three", "four"])
@raises(KeyError)
def test_no_reader():
    # Asking for a reader that doesn't exist should fail; the exact
    # exception type is assumed here to be KeyError.
    get_reader("simplenlp.foo")