Example #1
def test_noun_chunks_min_freq(spacy_doc):
    expected = ['I', 'we', 'he', 'I', 'we', 'he', 'we']
    observed = [
        nc.text for nc in extract.noun_chunks(
            spacy_doc, drop_determiners=True, min_freq=2)
    ]
    assert observed == expected
Example #2
    def test_noun_chunks_min_freq(self):
        expected = ['I', 'we', 'he', 'I', 'we', 'he', 'we']
        observed = [
            nc.text for nc in extract.noun_chunks(
                self.spacy_doc, drop_determiners=True, min_freq=2)
        ]
        self.assertEqual(observed, expected)
Example #3
def select_extractors(use_unigrams=False):
    """
    Extractors For Alignment
    :return: List of Extractors objects to use for text-text alignment
    note: ngram extractors below filter out stopwords and number words/symbols
    """
    noun_chunk_extractor = Extractor(
        lambda doc: list(filter(lambda x: len(x) > 3, list(noun_chunks(doc)))))
    tetragram_extractor = Extractor(
        lambda doc: list(ngrams(doc, 4, filter_stops=True, filter_nums=True)))
    trigram_extractor = Extractor(
        lambda doc: list(ngrams(doc, 3, filter_stops=True, filter_nums=True)))
    bigram_extractor = Extractor(lambda doc: list(
        ngrams(doc, 2, filter_stops=False, filter_nums=False)))
    unigram_extractor = Extractor(lambda doc: list(
        ngrams(doc, 1, filter_stops=False, filter_nums=False)))

    extractor_list = [
        noun_chunk_extractor,
        tetragram_extractor,
        trigram_extractor,
        bigram_extractor,
    ]

    if use_unigrams:
        extractor_list.append(unigram_extractor)

    return extractor_list
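The Extractor class above is this repository's own wrapper, not part of textacy, but the underlying textacy calls can be exercised directly. A minimal sketch, assuming a loaded English spaCy pipeline (the model name here is only an example) and the same filter settings used in select_extractors():

import spacy
from textacy.extract import ngrams, noun_chunks

nlp = spacy.load("en_core_web_sm")
doc = nlp("The quick brown fox jumps over the lazy dog near the old stone bridge.")

# noun chunks longer than three tokens, as in noun_chunk_extractor above
long_chunks = [nc.text for nc in noun_chunks(doc) if len(nc) > 3]
# trigrams with stopwords and number words filtered out, as in trigram_extractor above
trigrams = [ng.text for ng in ngrams(doc, 3, filter_stops=True, filter_nums=True)]
print(long_chunks, trigrams)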
Example #4
    def noun_chunks(self, **kwargs):
        """
        Extract an ordered sequence of noun phrases from doc, optionally
        filtering by frequency and dropping leading determiners.

        .. seealso:: :func:`extract.noun_chunks() <textacy.extract.noun_chunks>`
            for all function kwargs.
        """
        return extract.noun_chunks(self.spacy_doc, **kwargs)
Example #5
def test_noun_chunks(spacy_doc):
    expected = [
        'I', 'Kuwait', 'I.M.F. seminar', 'Arab educators', '30 minutes', 'we',
        'impact', 'technology trends', 'education', 'Middle East'
    ]
    observed = [
        nc.text for nc in extract.noun_chunks(spacy_doc, drop_determiners=True)
    ][:10]
    assert observed == expected
Example #6
    def noun_chunks(self, **kwargs):
        """
        Extract an ordered sequence of noun phrases from doc, optionally
        filtering by frequency and dropping leading determiners.

        .. seealso:: :func:`extract.noun_chunks() <textacy.extract.noun_chunks>`
            for all function kwargs.
        """
        return extract.noun_chunks(self.spacy_doc, **kwargs)
Example #7
    def test_noun_chunks(self):
        expected = [
            'I', 'Kuwait', 'I.M.F. seminar', 'Arab educators', '30 minutes', 'we',
            'impact', 'technology trends', 'education', 'Middle East', 'Egyptian education official',
            'his hand', 'he', 'personal question', 'I', 'Donald Trump', 'we', 'mosques',
            'United States', 'he', 'great sorrow', 'what', 'we', 'our kids']
        observed = [nc.text for nc in extract.noun_chunks(
            self.spacy_doc, drop_determiners=True)]
        self.assertEqual(observed, expected)
Example #8
    def test_noun_chunks_determiner(self):
        expected = [
            'I', 'Kuwait', 'an I.M.F. seminar', 'Arab educators', '30 minutes',
            'we', 'the impact', 'technology trends', 'education',
            'the Middle East', 'an Egyptian education official', 'his hand',
            'he', 'a personal question', 'I', 'Donald Trump', 'we', 'mosques',
            'the United States', 'he', 'great sorrow', 'what', 'we', 'our kids'
        ]
        observed = [
            nc.text for nc in extract.noun_chunks(self.spacy_doc,
                                                  drop_determiners=False)
        ]
        self.assertEqual(observed, expected)
Example #9
    def noun_chunks(self, **kwargs):
        """
        Extract an ordered sequence of noun phrases from doc, optionally
        filtering by frequency and dropping leading determiners.

        Args:
            **kwargs:
                drop_determiners (bool, optional): remove leading determiners (e.g. "the")
                    from phrases (e.g. "the quick brown fox" => "quick brown fox")
                min_freq (int, optional): remove chunks that occur in `doc` fewer than
                    `min_freq` times

        Yields:
            ``spacy.Span``: the next noun chunk, in order of appearance in the document

        .. seealso:: :func:`extract.noun_chunks() <textacy.extract.noun_chunks>`
        """
        for nc in extract.noun_chunks(self.spacy_doc, **kwargs):
            yield nc
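A hypothetical way to consume the generator method above, assuming `doc` is an instance of the wrapper class that holds `spacy_doc`; the kwargs are forwarded unchanged to textacy's extract.noun_chunks():

# hypothetical usage; `doc` is an instance of the class defining noun_chunks() above
for nc in doc.noun_chunks(drop_determiners=True, min_freq=2):
    print(nc.text, nc.start, nc.end)  # each item is a spacy.tokens.Span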
Example #10
    def noun_chunks(self, **kwargs):
        """
        Extract an ordered sequence of noun phrases from doc, optionally
        filtering by frequency and dropping leading determiners.

        Args:
            **kwargs:
                drop_determiners (bool, optional): remove leading determiners (e.g. "the")
                    from phrases (e.g. "the quick brown fox" => "quick brown fox")
                min_freq (int, optional): remove chunks that occur in `doc` fewer than
                    `min_freq` times

        Yields:
            ``spacy.Span``: the next noun chunk, in order of appearance in the document

        .. seealso:: :func:`extract.noun_chunks() <textacy.extract.noun_chunks>`
        """
        for nc in extract.noun_chunks(self.spacy_doc, **kwargs):
            yield nc
Example #11
def grammars( carrel, grammar, query, noun, lemma, sort, count ) :

	"""Extract sentence fragments from <carrel> where fragments are one of:
	
	\b
	  nouns - all the nouns and noun chunks
	  quotes - things people say
	  svo - fragments in the form of subject-verb-object (the default)
	  sss - a more advanced version of svo; fragments that begin
	    with an entity, co-occur with a verb, and are followed
	    by a phrase
	
	This is very useful for listing more complete ideas from a text.
	
	Examples:
	
	\b
	  rdr grammars homer
	  rdr grammars -g nouns homer
	  rdr grammars -g sss -n hector -l be homer"""
	
	# require
	import click
	from textacy import extract
	from os      import system
	from re      import search
	
	# sanity check
	checkForCarrel( carrel )

	# initialize
	doc = carrel2doc( carrel )

	# get the features; svo
	if grammar == 'svo' :
	
		# do the work
		features = list( extract.subject_verb_object_triples( doc ) )
		
		# simplify the result
		items = []
		for feature in features :
		
			subject = [ token.text_with_ws for token in feature.subject ]
			verb    = [ token.text_with_ws for token in feature.verb ]
			object  = [ token.text_with_ws for token in feature.object ]
			items.append( '\t'.join( [ ''.join( subject ), ''.join( verb ), ''.join( object ) ] ) )

		# done
		features = items
		
	# quotes
	elif grammar == 'quotes' :
	
		# do the work
		features = list( extract.direct_quotations( doc ) )
		
		# simplify the result
		items = []
		for feature in features :
		
			# parse and stringify
			speaker = [ token.text_with_ws for token in feature.speaker ]
			cue     = [ token.text_with_ws for token in feature.cue ]
			content = feature.content.text_with_ws
			items.append( '\t'.join( [ ''.join( speaker ), ''.join( cue ), content ] ) )

		# done
		features = items

	# noun chunks
	elif grammar == 'nouns' :
	
		# do the work and simplify the result
		features = list( extract.noun_chunks( doc ) )
		features = [ feature.text for feature in features ]
		
	# semi-structured sentences
	elif grammar == 'sss' :

		# sanity check
		if not noun :
		
			click.echo( "Error: When specifying sss, the -n option is required. See 'rdr grammars --help'.", err=True )
			exit()
			
		# do the work
		features = list( extract.semistructured_statements( doc, entity=noun, cue=lemma ) )

		# simplify the result
		items = []
		for feature in features :
		
			entity   = [ token.text_with_ws for token in feature.entity ]
			cue      = [ token.text_with_ws for token in feature.cue ]
			fragment = [ token.text_with_ws for token in feature.fragment ]
			items.append( '\t'.join( [ ''.join( entity ), ''.join( cue ), ''.join( fragment ) ] ) )

		# done
		features = items

	# filter, conditionally
	if query : features = [ feature for feature in features if ( search( query, feature ) ) ]
	
	# sort, conditionally
	if sort : features.sort()
	
	# count, conditionally
	if count :
	
		# initialize a dictionary and process each feature
		items = {}
		for feature in features :

			# update the dictionary
			if feature in items : items[ feature ] += 1
			else                : items[ feature ]  = 1

		# sort the dictionary; return the features
		features = sorted( items.items(), key=lambda x:x[ 1 ], reverse=True )
		
		# process each feature, again
		items = []
		for feature in features :
			
			# create a record and update
			record = str( feature[ 1 ] ) + '\t' + feature[ 0 ]
			items.append( record )
		
		# done
		features = items
	
	# output
	for feature in features : click.echo( feature )
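For reference, the frequency-counting branch near the end of grammars() could be phrased with collections.Counter; a minimal equivalent sketch, not part of the original command:

from collections import Counter

# same output format as above: "<count>\t<feature>", most frequent first
counts   = Counter( features )
features = [ str( count ) + '\t' + feature for feature, count in counts.most_common() ]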
Example #12
    def test_determiner(self, spacy_doc):
        result = list(extract.noun_chunks(spacy_doc, drop_determiners=False))
        assert all(isinstance(span, Span) for span in result)
        assert any(span[0].pos_ == "DET" for span in result)
Example #13
    def test_default(self, spacy_doc):
        result = list(extract.noun_chunks(spacy_doc))
        assert all(isinstance(span, Span) for span in result)
Example #14
def extract_noun_chunks(doc, min_freq=1):
    return extract.noun_chunks(doc, drop_determiners=True, min_freq=min_freq)
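A possible way to call the helper above, assuming `from textacy import extract` is in scope for the snippet and a spaCy pipeline is available (the model name here is only an example):

import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("The cats sat on the mat. The cats purred.")
# keep only chunks that occur at least twice, with leading determiners dropped
print([nc.text for nc in extract_noun_chunks(doc, min_freq=2)])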
Example #15
    def test_min_freq(self, spacy_doc):
        text = spacy_doc.text.lower()
        result = list(extract.noun_chunks(spacy_doc, drop_determiners=True, min_freq=2))
        assert all(text.count(span.lower_) >= 2 for span in result)
Example #16
    def test_noun_chunks_min_freq(self):
        expected = ['I', 'we', 'he', 'I', 'we', 'he', 'we']
        observed = [nc.text for nc in extract.noun_chunks(
            self.spacy_doc, drop_determiners=True, min_freq=2)]
        self.assertEqual(observed, expected)
Example #17
def noun_chunks(s):
    # `unicode` is Python 2 only; coerce to str under Python 3
    s = str(s)
    # `nlp` is a spaCy pipeline and `extr` is textacy's extract module,
    # both assumed to be defined at module level (see the sketch below)
    doc = nlp(s)
    return list(extr.noun_chunks(doc))
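The snippet above relies on module-level names that are not shown. A minimal sketch of the assumed setup, with en_core_web_sm standing in for whatever spaCy model the original code loads:

import spacy
from textacy import extract as extr

nlp = spacy.load("en_core_web_sm")  # `nlp`: the spaCy pipeline used by noun_chunks()

print(noun_chunks("The quick brown fox jumps over the lazy dog."))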