Example #1
0
    def test_whitespace_nlp(self):
        raw = '''Hi! My name
		is Jason.  You can call me
		Mr. J.  Is that your name too?
		Ha. Ha ha.
		'''
        doc = whitespace_nlp(raw)
        self.assertEqual(len(list(doc)), 73)
        self.assertEqual(len(doc.sents), 1)
        tok = Tok('WORD', 'Jason', 'jason', 'Name', 'NNP')
        self.assertEqual(len(tok), 5)
        self.assertEqual(str(tok), 'jason')
        self.assertEqual(
            str(
                Doc([[
                    Tok('WORD', 'Jason', 'jason', 'Name', 'NNP'),
                    Tok('WORD', 'a', 'a', 'Name', 'NNP')
                ]],
                    raw='asdfbasdfasd')), 'asdfbasdfasd')
        self.assertEqual(
            str(
                Doc([[
                    Tok('WORD', 'Blah', 'blah', 'Name', 'NNP'),
                    Tok('Space', ' ', ' ', ' ', ' '),
                    Tok('WORD', 'a', 'a', 'Name', 'NNP')
                ]])), 'blah a')
def build_hamlet_jz_df():
	# type: () -> pd.DataFrame
	"""Build a DataFrame of whitespace-parsed fixture documents, dropping empty rows."""
	categories, documents = get_docs_categories()

	def strip_stage_direction(text):
		# Documents beginning with '[' are blanked out before parsing.
		return '' if text.startswith('[') else text

	frame = pd.DataFrame({
		'category': categories,
		'parsed': [whitespace_nlp(strip_stage_direction(doc)) for doc in documents]
	})
	# Keep only rows whose parsed text is non-empty after stripping.
	non_empty = frame['parsed'].apply(lambda parsed: len(str(parsed).strip()) > 0)
	return frame[non_empty]
def build_hamlet_jz_df():
    # type: () -> pd.DataFrame
    """Return a DataFrame of parsed fixture documents with empty entries removed."""
    categories, documents = get_docs_categories()
    # Documents that start with '[' are replaced with empty text before parsing.
    parsed_docs = [
        whitespace_nlp('' if doc.startswith('[') else doc)
        for doc in documents
    ]
    df = pd.DataFrame({'category': categories, 'parsed': parsed_docs})
    keep = df['parsed'].apply(lambda parsed: len(str(parsed).strip()) > 0)
    return df[keep]
	def setUp(cls):
		"""Parse the fixture documents once and build the corpus the tests share."""
		cls.categories, cls.documents = get_docs_categories()
		cls.parsed_docs = [whitespace_nlp(doc) for doc in cls.documents]
		cls.df = pd.DataFrame({
			'category': cls.categories,
			'author': ['a', 'a', 'c', 'c', 'c', 'c', 'd', 'd', 'e', 'e'],
			'parsed': cls.parsed_docs,
			'document_lengths': [len(doc) for doc in cls.documents]
		})
		cls.corpus = CorpusFromParsedDocuments(cls.df, 'category', 'parsed').build()
def build_hamlet_jz_corpus():
    # type: () -> Corpus
    """Construct a Corpus from the whitespace-parsed fixture documents."""
    categories, documents = get_docs_categories()

    def drop_stage_direction(text):
        # Documents beginning with '[' are blanked before parsing.
        return '' if text.startswith('[') else text

    df = pd.DataFrame({
        'category': categories,
        'parsed': [whitespace_nlp(drop_stage_direction(doc)) for doc in documents]
    })
    df = df[df['parsed'].apply(lambda parsed: len(str(parsed).strip()) > 0)]
    return CorpusFromParsedDocuments(
        df=df, category_col='category', parsed_col='parsed'
    ).build()
Example #6
0
 def setUp(cls):
     """Parse the fixture documents and build the shared test corpus."""
     cls.categories, cls.documents = get_docs_categories()
     cls.parsed_docs = [whitespace_nlp(doc) for doc in cls.documents]
     authors = ['a', 'a', 'c', 'c', 'c', 'c', 'd', 'd', 'e', 'e']
     cls.df = pd.DataFrame({
         'category': cls.categories,
         'author': authors,
         'parsed': cls.parsed_docs,
         'document_lengths': [len(doc) for doc in cls.documents]
     })
     cls.corpus = CorpusFromParsedDocuments(cls.df, 'category',
                                            'parsed').build()
def build_hamlet_jz_corpus_with_meta():
	# type: () -> Corpus
	"""Build the fixture corpus with mocked Empath metadata features attached."""
	def empath_mock(doc, **kwargs):
		# Fake Empath analyzer: key up to the first three tokens by their
		# length, valued by their position in the document.
		toks = list(doc)
		limit = min(3, len(toks))
		return {
			'cat' + str(len(tok)): position
			for position, tok in enumerate(toks[:limit])
		}

	categories, documents = get_docs_categories()

	def strip_stage_direction(text):
		return '' if text.startswith('[') else text

	df = pd.DataFrame({
		'category': categories,
		'parsed': [whitespace_nlp(strip_stage_direction(doc)) for doc in documents]
	})
	df = df[df['parsed'].apply(lambda parsed: len(str(parsed).strip()) > 0)]
	return CorpusFromParsedDocuments(
		df=df,
		category_col='category',
		parsed_col='parsed',
		feats_from_spacy_doc=FeatsFromSpacyDocAndEmpath(empath_analyze_function=empath_mock)
	).build()
	def test_whitespace_nlp(self):
		"""Smoke-test whitespace_nlp tokenization and Tok/Doc string conversion."""
		raw = '''Hi! My name
		is Jason.  You can call me
		Mr. J.  Is that your name too?
		Ha. Ha ha.
		'''
		doc = whitespace_nlp(raw)
		# The fixture is expected to tokenize to 55 items in a single sentence.
		self.assertEqual(len(list(doc)), 55)
		self.assertEqual(len(doc.sents), 1)
		tok = Tok('WORD', 'Jason', 'jason', 'Name', 'NNP')
		# A Tok reports length 5 and stringifies to its lowered form.
		self.assertEqual(len(tok), 5)
		self.assertEqual(str(tok), 'jason')
		# With an explicit raw string, str(Doc) returns the raw text unchanged.
		self.assertEqual(str(Doc([[Tok('WORD', 'Jason', 'jason', 'Name', 'NNP'),
		                           Tok('WORD', 'a', 'a', 'Name', 'NNP')]],
		                         raw='asdfbasdfasd')),
		                 'asdfbasdfasd')
		# Without raw text, str(Doc) is rebuilt from the tokens themselves.
		self.assertEqual(str(Doc([[Tok('WORD', 'Blah', 'blah', 'Name', 'NNP'),
		                           Tok('Space', ' ', ' ', ' ', ' '),
		                           Tok('WORD', 'a', 'a', 'Name', 'NNP')]])),
		                 'blah a')
def build_hamlet_jz_corpus_with_meta():
    # type: () -> Corpus
    """Create the fixture corpus augmented with mocked Empath topic features."""

    def empath_mock(doc, **kwargs):
        # Stand-in for Empath: key each of up to three leading whitespace
        # tokens by its character length, valued by its position.
        words = doc.split()
        limit = min(3, len(words))
        features = {}
        for position, word in enumerate(words[:limit]):
            features['cat' + str(len(word))] = position
        return features

    categories, documents = get_docs_categories()

    def strip_stage_direction(text):
        # Documents beginning with '[' are blanked before parsing.
        return '' if text.startswith('[') else text

    df = pd.DataFrame({
        'category': categories,
        'parsed': [whitespace_nlp(strip_stage_direction(doc)) for doc in documents]
    })
    df = df[df['parsed'].apply(lambda parsed: len(str(parsed).strip()) > 0)]
    return CorpusFromParsedDocuments(
        df=df,
        category_col='category',
        parsed_col='parsed',
        feats_from_spacy_doc=FeatsFromSpacyDocAndEmpath(
            empath_analyze_function=empath_mock)).build()
 def test_get_feats(self):
     """Unigram extraction lower-cases terms and counts repeated words."""
     parsed = whitespace_nlp("A a bb cc.")
     observed = UnigramsFromSpacyDoc().get_feats(parsed)
     expected = Counter({'a': 2, 'bb': 1, 'cc': 1})
     self.assertEqual(expected, observed)