def test_document(self):
    # Assert Document properties.
    # Test with different input types.
    for constructor, w in (
            (vector.Document, "The cats sat on the mat."),
            (vector.Document, ["The", "cats", "sat", "on", "the", "mat"]),
            (vector.Document, {"cat": 1, "sat": 1, "mat": 1}),
            (vector.Document, Text(parse("The cats sat on the mat."))),
            (vector.Document, Sentence(parse("The cats sat on the mat."))),
            (vector.Document.open, self.path1)):
        # Test copy.
        v = constructor(w, stemmer=vector.LEMMA, stopwords=False, name="Cat", type="CAT")
        v = v.copy()
        # Test properties.
        self.assertEqual(v.name, "Cat")
        self.assertEqual(v.type, "CAT")
        self.assertEqual(v.count, 3)
        self.assertEqual(v.terms, {"cat": 1, "sat": 1, "mat": 1})
        # Test iterator decoration.
        self.assertEqual(sorted(v.features), ["cat", "mat", "sat"])
        self.assertEqual(sorted(v), ["cat", "mat", "sat"])
        self.assertEqual(len(v), 3)
        self.assertEqual(v["cat"], 1)
        self.assertEqual("cat" in v, True)
    print("pattern.vector.Document")
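# The same Document API can be exercised outside the test harness.
# A minimal sketch using the constructor arguments the test asserts against:
from pattern.vector import Document, LEMMA

# LEMMA normalizes inflected forms ("cats" -> "cat"); stop words are excluded.
d = Document("The cats sat on the mat.", stemmer=LEMMA, stopwords=False, name="Cat")
print(d.terms)             # {'cat': 1, 'sat': 1, 'mat': 1}
print(sorted(d.features))  # ['cat', 'mat', 'sat']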
def analyze(self):
    # Note: sentence.subjects and sentence.verbs are only populated when the
    # text was parsed with relations=True.
    text = Text(self.clean_text)
    for sentence in text:
        print('-----SENTENCE ' + sentence.string)
        print(sentence.subjects)
        print(sentence.verbs)
        for word in sentence:
            print(word.string)
            print(word.lemma)
            print(word.type)
            print(word.chunk)
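# A minimal sketch of input where subjects and verbs actually resolve,
# assuming clean_text holds the output of pattern's parse() with relations enabled:
from pattern.en import parse, Text

t = Text(parse("The cat sat on the mat.", relations=True, lemmata=True))
print(t[0].subjects)  # Chunks with the SBJ role, e.g. "The cat".
print(t[0].verbs)     # Verb chunks, e.g. "sat".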
def convert_pattern_format(text):
    """
    Parses the text through Pattern's parse() function into a standardized format.
    """
    parsed_text = []
    # Parse the text with Pattern's parser.
    pattern_parsed_text = Text(parse(text, relations=True, lemmata=True))
    for sentence in pattern_parsed_text:
        s = Sentence()
        s.string = remove_blanks(sentence.string)
        for word in sentence:
            # Pattern's tags for each word in the sentence are stored in a new Word object.
            w = Word()
            w.string = word.string
            w.lemma = word.lemma
            w.index = word.index
            w.tag = word.type
            w.entity = ""
            # Each word is appended to a Sentence object.
            s.words.append(w)
        # Each Sentence object is appended to a list.
        parsed_text.append(s)
    return parsed_text
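# A usage sketch, assuming the Sentence/Word container classes and the
# remove_blanks() helper referenced above are defined in the same module:
for s in convert_pattern_format("The cats sat on the mat."):
    print(s.string)
    for w in s.words:
        print(w.index, w.string, w.lemma, w.tag)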
import os, sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))

from pattern.en import parse, Text

# The easiest way to analyze the output of the parser is to create a Text.
# A Text is a "parse tree" of linked Python objects.
# A Text is essentially a list of Sentence objects.
# Each Sentence is a list of Word objects.
# Each Word can be part of a Chunk object, accessible with Word.chunk.

s = "I eat pizza with a silver fork."
s = parse(s)
s = Text(s)

# You can also use the parsetree() function,
# which is the equivalent of Text(parse()).

print(s[0].words)   # A list of all the words in the first sentence.
print(s[0].chunks)  # A list of all the chunks in the first sentence.
print(s[0].chunks[-1].words)
print("")

for sentence in s:
    for word in sentence:
        print(word.string, word.type, word.chunk, word.pnp)

# A Text can be exported as an XML string (among other formats).
print("")
print(s.xml)
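# The parsetree() shortcut mentioned in the comments above collapses the
# two steps into one call; a minimal equivalent:
from pattern.en import parsetree

t = parsetree("I eat pizza with a silver fork.")  # Same result as Text(parse(...)).
print(t[0].chunks)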
def parse(self, text):
    self.valid_sentences = []
    self.faulty_sentences = []
    # Reset all filter statistics.
    for key in ('valid_sentences', 'too_few_words', 'first_word_is_number',
                'sentence_contains_brackets', 'sentence_contains_number',
                'sentence_too_many_comma', 'too_many_short_words',
                'sentence_in_german', 'sentence_in_french', 'sentence_in_spanish',
                'sentence_in_italian', 'sentence_in_dutch', 'sentence_not_english',
                'begins_with_punctuation', 'weird_chars', 'first_not_upper',
                'last_not_dot', 'too_many_dots'):
        self.statistic.properties[key] = 0
    text = self.replace(text)
    text = Text(parse(text,
                      tokenize=True,
                      tags=True,
                      chunks=True,
                      relations=False,
                      lemmata=False,
                      encoding='utf-8',
                      tagset=None))
    for sentence in text:
        replaced_string = self.replace(sentence.string)
        if len(sentence.words) < self.MIN_WORD_COUNT:
            # Too few words in the sentence. Removes sentences like these:
            # https://gist.github.com/mrzl/32b9763bd943c18cb77cd1167a87640a
            self.statistic.properties['too_few_words'] += 1
            self.faulty_sentences.append(sentence)
            continue
        if self.get_perc_single_char_words(sentence) > 0.5:
            # Too many single-character words.
            self.statistic.properties['too_many_short_words'] += 1
            self.faulty_sentences.append(sentence)
            continue
        if replaced_string[0].isdigit():
            # The first word of the sentence is a number.
            self.statistic.properties['first_word_is_number'] += 1
            self.faulty_sentences.append(sentence)
            continue
        if self.contains_tag(sentence, "(", 3) or self.contains_tag(sentence, ")", 3):
            # The sentence contains ( or ).
            self.statistic.properties['sentence_contains_brackets'] += 1
            self.faulty_sentences.append(sentence)
            continue
        if self.get_count_tag(sentence, "CD") > 3:
            # The sentence contains more than 3 numbers (CD tags).
            self.statistic.properties['sentence_contains_number'] += 1
            self.faulty_sentences.append(sentence)
            continue
        if self.get_count_tag(sentence, ",") > 6:
            # The sentence contains more than 6 commas.
            self.statistic.properties['sentence_too_many_comma'] += 1
            self.faulty_sentences.append(sentence)
            continue
        if replaced_string[0] in ['.', '\'', ';', '~', ':', '-', '·', '‘', '’', '\\']:
            # The sentence begins with punctuation.
            self.statistic.properties['begins_with_punctuation'] += 1
            self.faulty_sentences.append(sentence)
            continue
        if self.get_count_of_special_chars(sentence) > 3:
            # The sentence contains weirdly escaped characters.
            self.statistic.properties['weird_chars'] += 1
            self.faulty_sentences.append(sentence)
            continue
        if not replaced_string[0].isupper():
            # The first character is not upper case.
            self.statistic.properties['first_not_upper'] += 1
            self.faulty_sentences.append(sentence)
            continue
        if sentence.string[-1] not in ['.', '?', '!']:
            # The last character is not sentence-final punctuation.
            self.statistic.properties['last_not_dot'] += 1
            self.faulty_sentences.append(sentence)
            continue
        if sentence.string.count('.') > 3:
            # Too many dots in the sentence.
            self.statistic.properties['too_many_dots'] += 1
            self.faulty_sentences.append(sentence)
            continue
        try:
            classified = langid.classify(sentence.string)[0]
            if classified == 'en':
                self.valid_sentences.append(sentence)
            elif classified == 'de':
                self.statistic.properties['sentence_in_german'] += 1
                self.faulty_sentences.append(sentence)
            elif classified == 'fr':
                self.statistic.properties['sentence_in_french'] += 1
                self.faulty_sentences.append(sentence)
            elif classified == 'es':
                self.statistic.properties['sentence_in_spanish'] += 1
                self.faulty_sentences.append(sentence)
            elif classified == 'nl':
                self.statistic.properties['sentence_in_dutch'] += 1
                self.faulty_sentences.append(sentence)
            elif classified == 'it':
                self.statistic.properties['sentence_in_italian'] += 1
                self.faulty_sentences.append(sentence)
            else:
                self.statistic.properties['sentence_not_english'] += 1
                self.faulty_sentences.append(sentence)
        except Exception:
            self.statistic.properties['sentence_not_english'] += 1
            self.faulty_sentences.append(sentence)
    self.statistic.properties['valid_sentences'] += len(self.valid_sentences)
    print('Parsed ' + str(len(self.valid_sentences)) + ' proper sentences.')
    discarded = 0
    for key, value in self.statistic.properties.items():
        if key != 'valid_sentences':
            discarded += value
    print('Discarded ' + str(discarded) + ' invalid sentences.')
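# The helper methods used above (get_count_tag, get_perc_single_char_words, etc.)
# are not shown in this section. A minimal sketch of two of them, assuming
# pattern's Word.type and Word.string attributes (the originals may differ):
def get_count_tag(self, sentence, tag):
    # Count the words in the sentence carrying the given part-of-speech tag.
    return sum(1 for word in sentence.words if word.type == tag)

def get_perc_single_char_words(self, sentence):
    # Fraction of words that are a single character long.
    words = sentence.words
    return sum(1 for word in words if len(word.string) == 1) / float(len(words))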
from pattern.en import parsetree
from pattern.text.tree import Text, Sentence, WORD, POS, CHUNK, PNP, REL, LEMMA

# The head of this call was truncated in the original; reconstructed here
# from the example sentence used above.
s = parsetree("I eat pizza with a silver fork.",
              tokenize=True,     # Split punctuation marks from words?
              tags=True,         # Parse part-of-speech tags? (NN, JJ, ...)
              chunks=True,       # Parse chunks? (NP, VP, PNP, ...)
              relations=False,   # Parse chunk relations? (-SBJ, -OBJ, ...)
              lemmata=False,     # Parse lemmata? (ate => eat)
              encoding='utf-8',  # Input string encoding.
              tagset=None)       # Penn Treebank II (default) or UNIVERSAL.
print(repr(s))

for sentence in s:
    for chunk in sentence.chunks:
        print(chunk.type, [(w.string, w.type) for w in chunk.words])

# Text() expects a tagged string, not a file object, hence .read().
# (The original called tree() here and flagged it with "CHECK FOR ERROR".)
for sentence in Text(open('data/input/tagged.txt').read(), token=[WORD, POS, CHUNK]):
    print(sentence)

# text
text = Text(open('data/input/corpus.txt').read(), token=[WORD, POS, CHUNK, PNP, REL, LEMMA])
# text = Text.from_xml('data/input/multilingual-all-words.en.xml')  # Reads an XML string generated with Text.xml.
print(text.string)     # 'The cat sat on the mat .'
print(text.sentences)  # [Sentence('The cat sat on the mat .')]
print(text.copy())
print(text.xml)

# sentence
# sentence = Sentence(open('data/input/corpus.txt').read(), token=[WORD, POS, CHUNK, PNP, REL, LEMMA])
# sentence = Sentence.from_xml(xml)
sentence = text.sentences[0]  # Take the first sentence from the Text above.
print(sentence.parent)   # Sentence parent, or None.
print(sentence.id)       # Unique id for each sentence.
print(sentence.start)    # 0
print(sentence.stop)     # len(Sentence).
print(sentence.string)   # Tokenized string, without tags.
print(sentence.words)    # List of Word objects.
print(sentence.lemmata)  # List of word lemmata.
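# The commented-out Sentence.from_xml(xml) call above needs an XML string.
# A small round-trip sketch, assuming the sentence defined above:
xml = sentence.xml                # Serialize the sentence to XML.
restored = Sentence.from_xml(xml) # Parse it back into a Sentence.
print(restored.string == sentence.string)  # True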
def __init__(self, data):
    # Don't lowercase the input: capitalization matters for parsing.
    # TODO: offer an optional lowercase mode.
    if isinstance(data, str):
        self.DATA = data
        self.PARSE = parse(self.DATA)
        self.DOC = Text(self.PARSE)
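# A hypothetical usage sketch; the enclosing class is not shown above,
# so the name "Analyzer" is assumed here purely for illustration:
a = Analyzer("The cats sat on the mat.")
for sentence in a.DOC:
    print(sentence.string)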