Example #1
def test_japanese():
    reader = get_reader("simplenlp.ja")
    # "I can't really speak Japanese. That's a problem, isn't it." -- the verb
    # 出来る and the noun 日本語 both appear inside the negated clause 出来ません,
    # while 困る does not.
    connections = list(reader.extract_connections(u"私は日本語をあんまり出来ません。困りましたね。"))
    # Document-level connections have DOCUMENT as term1; a negative weight marks
    # a term that was seen in a negated context.
    pos_doc_terms = [term2 for weight, term1, term2 in connections
                     if term1 == DOCUMENT and weight > 0]
    neg_doc_terms = [term2 for weight, term1, term2 in connections
                     if term1 == DOCUMENT and weight < 0]

    assert u"出来る" in neg_doc_terms
    assert u"日本語" in neg_doc_terms
    assert u"日本語 出来る" in neg_doc_terms
    assert u"困る" in pos_doc_terms
Example #2
    def get_document_connections(self, docid):
        """
        Given a previously added document, get the list of connections
        produced from it.
        """
        if docid in self.connections_cache:
            connections = self.connections_cache[docid]
        else:
            # Not cached: fetch the stored document and extract its connections
            # again with the reader it was originally added with.
            doc = self.database.get_document(docid)
            reader = get_reader(doc.reader)
            connections = list(reader.extract_connections(doc.text))
        return connections
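
For illustration, a hedged sketch of calling this method: `model` and the document id are assumptions rather than part of the excerpt, and the returned (weight, term1, term2) triples are filtered the same way the other examples filter document-level connections.

# Hypothetical usage; `model` stands in for an instance of the class above.
connections = model.get_document_connections('study/doc-1.txt')
doc_terms = [(term2, weight)
             for weight, term1, term2 in connections
             if term1 == DOCUMENT]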
Example #3
    def vector_from_text(self, text, reader_name=None):
        """
        Get a category vector in this model representing the given text,
        with TF-IDF applied.
        """
        if reader_name is None:
            reader_name = self.config['reader']
        reader = get_reader(reader_name)
        # Collect the document-level terms and their weights; vector_from_terms
        # then applies TF-IDF and builds the category vector.
        terms = []
        for weight, term1, term2 in reader.extract_connections(text):
            if term1 == DOCUMENT:
                terms.append((term2, weight))
        return self.vector_from_terms(terms)
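
A minimal usage sketch, assuming `model` is an instance of the class this method belongs to (the instance is not shown in the excerpt); the reader name mirrors the one used in the Japanese test above.

# Hypothetical calls; `model` provides vector_from_text as defined above.
vec = model.vector_from_text(u"私は日本語をあんまり出来ません。",
                             reader_name="simplenlp.ja")
# Or fall back to the reader configured in model.config['reader']:
vec_default = model.vector_from_text("one two three four")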
Example #4
    def add_document(self, doc, reader_name=None):
        """
        Take in a document, pass it through the reader, and store its terms
        in the term database.

        The document should be expressed as a dictionary, containing at least
        these keys:
        
        - name: the unique identifier for the document
        - text: the plain text of the document, possibly including text-encoded
          tags
        - url: a unique identifier for the document, preferably one that
          actually locates it relative to the study

        Optionally, it may contain:
        
        - tags: (key, value) tuples representing tags
        """
        LOG.info("Reading document: %r" % doc['url'])
        if reader_name is None:
            reader_name = self.config['reader']
        reader = get_reader(reader_name)
        text = doc['text']
        tags = doc.get('tags', [])
        doc_terms = []
        connections = list(reader.extract_connections(text))
        self.connections_cache[doc['url']] = connections
        for weight, term1, term2 in connections:
            if term1 == DOCUMENT:
                if isinstance(term2, tuple) and term2[0] == TAG:
                    # Text-encoded tags come back as (TAG, key, value) tuples;
                    # keep only the (key, value) part.
                    tags.append(term2[1:])
                else:
                    doc_terms.append((term2, weight))
                    relevance = self.database.term_relevance(term2)
                    self.index_term(term2, relevance)

        doc['reader'] = reader_name
        doc['terms'] = doc_terms
        doc['tags'] = tags
        self.database.add_document(doc)
        self.idf_cache = {}   # invalidate the cache of term IDFs
        return doc['url']
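
A hedged sketch of a call based on the docstring above; the `model` instance and the field values are illustrative, not from the original code.

# Hypothetical usage; `model` is an instance of the class defining add_document.
doc = {
    'name': 'doc-1',
    'url': 'study/doc-1.txt',        # also serves as the document's id
    'text': "The quick brown fox jumps over the lazy dog.",
    'tags': [('language', 'en')],    # optional (key, value) tuples
}
docid = model.add_document(doc, reader_name="simplenlp.en")
assert docid == doc['url']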
Example #5
    def add_batch(self, stream_func, study=None, learn_iterations=0):
        """
        Add a batch of documents from some source, a `stream_func` that
        when called returns an iterator over the documents.
        """
        fulltext_cache = {}
        self.connections_cache = {}

        # First pass: add documents to the term database, and meanwhile
        # collect full texts and tags.
        for doc in stream_func():
            docid = self.add_document(doc)
            reader = get_reader(doc['reader'])
            for term, fulltext in reader.extract_term_texts(doc['text']):
                fulltext_cache[term] = fulltext
            if study is not None:
                self.database.set_tag_on_document(docid, 'study', study)
        
        LOG.info("Committing documents to the database")
        self.database.commit()

        LOG.info("Collecting relevant terms")
        self.database.update_relevance()

        if learn_iterations:
            # Second pass (optional): find how much we should update the
            # ReconstructedMatrix entries based on the word associations
            # we discover.
            learn_accumulator = defaultdict(float)
            for doc in stream_func():
                updates = self.document_assoc_updates(doc['url'])
                for weight, term1, term2 in updates:
                    if term1 in self.priority and term2 in self.priority:
                        learn_accumulator[(term1, term2)] += weight

            # Now actually apply those total updates. Multiple times, if asked.
            total = len(learn_accumulator)
            for iter in xrange(learn_iterations):
                LOG.info("Updating association matrix: pass %d" % (iter+1))
                i = 0
                avg_err = 1.0
                for term1, term2 in learn_accumulator:
                    i += 1
                    if (i % 100) == 0:
                        LOG.info("Learned %d/%d; err=%4.4f"
                                 % (i, total, avg_err))
                    weight = learn_accumulator[(term1, term2)]
                    err = self.learn_assoc(weight, term1, term2)
                    avg_err = (.999 * avg_err) + (.001 * err)
        
        # Finally, update the full texts of the terms we saw.
        LOG.info("Updating full texts")
        for term, fulltext in fulltext_cache.items():
            self.database.set_term_text(term, fulltext)
        self.database.commit()
        
        # If this was a study, make a document matrix for it.
        if study is not None:
            LOG.info("Making document matrix for %r" % study)
            self.update_doc_matrix(study)
        LOG.info("Updating tag matrix")
        self.update_tag_matrix()
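
Because `stream_func` is called once per pass (and a second time when `learn_iterations` is nonzero), it needs to be a callable that yields fresh documents on each call, such as a generator function. A sketch under that assumption; `model`, the texts, and the study name are illustrative.

# Hypothetical usage; `model` is an instance of the class defining add_batch.
def stream_docs():
    texts = ["one two three four", "the quick brown fox"]
    for i, text in enumerate(texts):
        yield {
            'name': 'doc-%d' % i,
            'url': 'study/doc-%d.txt' % i,
            'text': text,
        }

model.add_batch(stream_docs, study='example-study', learn_iterations=2)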
Example #6
def test_simple():
    reader = get_reader("simplenlp.en")
    tokenized = list(reader.extract_tokens("one two three four"))
    assert_equal(tokenized, ["one", "two", "three", "four"])
Example #7
def test_no_reader():
    # "simplenlp.foo" names no existing reader, so this lookup is expected
    # to raise an error rather than return a reader.
    get_reader("simplenlp.foo")