Ejemplo n.º 1
0
 def test_document(self):
     # Assert Document properties.
     # Test with different input types.
     for constructor, w in ((vector.Document, "The cats sat on the mat."),
                            (vector.Document,
                             ["The", "cats", "sat", "on", "the",
                              "mat"]), (vector.Document, {
                                  "cat": 1,
                                  "sat": 1,
                                  "mat": 1
                              }), (vector.Document,
                                   Text(parse("The cats sat on the mat."))),
                            (vector.Document,
                             Sentence(parse("The cats sat on the mat."))),
                            (vector.Document.open, self.path1)):
         # Test copy.
         v = constructor(w,
                         stemmer=vector.LEMMA,
                         stopwords=False,
                         name="Cat",
                         type="CAT")
         v = v.copy()
         # Test properties.
         self.assertEqual(v.name, "Cat")
         self.assertEqual(v.type, "CAT")
         self.assertEqual(v.count, 3)
         self.assertEqual(v.terms, {"cat": 1, "sat": 1, "mat": 1})
         # Test iterator decoration.
         self.assertEqual(sorted(v.features), ["cat", "mat", "sat"])
         self.assertEqual(sorted(v), ["cat", "mat", "sat"])
         self.assertEqual(len(v), 3)
         self.assertEqual(v["cat"], 1)
         self.assertEqual("cat" in v, True)
     print "pattern.vector.Document"
 def analyze(self):
     text = Text(self.clean_text)
     for sentence in text:
         print '-----SENTENCE' + str(sentence.string)
         print sentence.subjects
         print sentence.verbs
         for word in sentence:
             print word.string
             print word.lemma
             print word.type
             print word.chunk
Ejemplo n.º 3
0
def convert_pattern_format(text):
    """
    Parse *text* with Pattern's parser and convert the result into the
    project's standardized list of Sentence objects (each holding Words).
    """
    converted = []
    # Run Pattern's parser with chunk relations and lemmata enabled.
    for pattern_sentence in Text(parse(text, relations=True, lemmata=True)):
        sentence = Sentence()
        sentence.string = remove_blanks(pattern_sentence.string)
        for pattern_word in pattern_sentence:
            # Copy Pattern's per-word tags into a fresh Word object.
            word = Word()
            word.string = pattern_word.string
            word.lemma = pattern_word.lemma
            word.index = pattern_word.index
            word.tag = pattern_word.type
            # No named-entity info from Pattern here; left empty.
            word.entity = ""
            sentence.words.append(word)
        converted.append(sentence)
    return converted
Ejemplo n.º 4
0
import os, sys

sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))

from pattern.en import parse, Text

# The easiest way to analyze the output of the parser is to create a Text.
# A Text is a "parse tree" of linked Python objects.
# A Text is essentially a list of Sentence objects.
# Each Sentence is a list of Word objects.
# Each Word can be part of a Chunk object, accessible with Word.chunk.
# Parse the example sentence and wrap the tagged output in a Text tree
# (equivalent to the three-step string -> parse -> Text dance).
s = Text(parse("I eat pizza with a silver fork."))

# Inspect the first sentence: its words, its chunks,
# and the words belonging to the last chunk.
print(s[0].words)
print(s[0].chunks)
print(s[0].chunks[-1].words)
print("")

# Walk every word of every sentence, showing its POS tag,
# its chunk, and its prepositional noun phrase (if any).
for sent in s:
    for w in sent:
        print(w.string, w.type, w.chunk, w.pnp)

# A Text can be exported as an XML-string (among other).
print("")
print(s.xml)
Ejemplo n.º 5
0
    def parse(self, text):
        """Tokenize *text* with Pattern and split sentences into valid/faulty.

        Each sentence runs through a chain of heuristic filters (length,
        digits, brackets, punctuation, language, ...).  The first failing
        filter increments its counter in ``self.statistic.properties`` and
        moves the sentence to ``self.faulty_sentences``; survivors that
        langid classifies as English go to ``self.valid_sentences``.
        Prints a short summary when done.
        """
        self.valid_sentences = []
        self.faulty_sentences = []

        # Reset every statistics counter before this run.
        for counter in ('valid_sentences', 'too_few_words',
                        'first_word_is_number', 'sentence_contains_brackets',
                        'sentence_contains_number', 'sentence_too_many_comma',
                        'too_many_short_words', 'sentence_in_german',
                        'sentence_in_french', 'sentence_in_spanish',
                        'sentence_in_italian', 'sentence_in_dutch',
                        'sentence_not_english', 'begins_with_punctuation',
                        'weird_chars', 'first_not_upper', 'last_not_dot',
                        'too_many_dots'):
            self.statistic.properties[counter] = 0

        text = self.replace(text)

        text = Text(
            parse(text,
                  tokenize=True,
                  tags=True,
                  chunks=True,
                  relations=False,
                  lemmata=False,
                  encoding='utf-8',
                  tagset=None))

        # langid ISO-639 code -> statistics counter for rejected languages.
        language_counters = {'de': 'sentence_in_german',
                             'fr': 'sentence_in_french',
                             'es': 'sentence_in_spanish',
                             'nl': 'sentence_in_dutch',
                             'it': 'sentence_in_italian'}

        for sentence in text:

            replaced_string = self.replace(sentence.string)
            # NOTE(review): assumes self.replace() never returns an empty
            # string here -- replaced_string[0] below would raise otherwise.
            if len(sentence.words) < self.MIN_WORD_COUNT:
                # too few words in the sentence
                # removes sentences like these: https://gist.github.com/mrzl/32b9763bd943c18cb77cd1167a87640a
                self._discard(sentence, 'too_few_words')
                continue
            if self.get_perc_single_char_words(sentence) > 0.5:
                # more than half of the words are single characters
                self._discard(sentence, 'too_many_short_words')
                continue
            if replaced_string[0].isdigit():
                # first word of the sentence is a number
                self._discard(sentence, 'first_word_is_number')
                continue
            if self.contains_tag(sentence, "(", 3) \
                    or self.contains_tag(sentence, ")", 3):
                # the sentence contains either ( or )
                self._discard(sentence, 'sentence_contains_brackets')
                continue
            if self.get_count_tag(sentence, "CD") > 3:
                # the sentence contains more than 3 cardinal numbers (CD tags)
                self._discard(sentence, 'sentence_contains_number')
                continue
            if self.get_count_tag(sentence, ",") > 6:
                # more than 6 commas in the sentence
                # (an old comment claimed 2; the code always checked > 6)
                self._discard(sentence, 'sentence_too_many_comma')
                continue
            if replaced_string[0] in [
                    u'.', u'\'', u';', u'~', u':', u'-', u'·', u'‘', u'’',
                    u'\\'
            ]:
                # sentence begins with punctuation
                self._discard(sentence, 'begins_with_punctuation')
                continue
            if self.get_count_of_special_chars(sentence) > 3:
                # sentence contains weirdly escaped chars
                self._discard(sentence, 'weird_chars')
                continue
            if not replaced_string[0].isupper():
                # first char is not upper case
                self._discard(sentence, 'first_not_upper')
                continue
            if sentence.string[-1] not in ('.', '?', '!'):
                # last char is not sentence-final punctuation
                self._discard(sentence, 'last_not_dot')
                continue
            if sentence.string.count('.') > 3:
                # too many dots in the sentence
                self._discard(sentence, 'too_many_dots')
                continue
            try:
                classified = langid.classify(sentence.string)[0]
            except Exception:
                # Classification failed (previously a bare `except:`):
                # count as non-English.
                self._discard(sentence, 'sentence_not_english')
                continue
            # Previously `classified in 'en'` etc. -- substring tests that
            # would also match 'e' or 'n'; compare for equality instead.
            if classified == 'en':
                self.valid_sentences.append(sentence)
            else:
                # Non-English: count under the specific language if tracked,
                # otherwise under the generic bucket.
                self._discard(sentence,
                              language_counters.get(classified,
                                                    'sentence_not_english'))

        self.statistic.properties['valid_sentences'] += len(
            self.valid_sentences)
        print('Parsed ' + str(len(self.valid_sentences)) +
              ' proper sentences.')

        # Total of every rejection counter ('sum' previously shadowed the
        # builtin, and `not in` was another accidental substring test).
        discarded = sum(count
                        for name, count in self.statistic.properties.items()
                        if name != 'valid_sentences')

        print('Discarded ' + str(discarded) + ' invalid sentences.')

    def _discard(self, sentence, counter):
        """Record *sentence* as faulty under statistics key *counter*."""
        self.statistic.properties[counter] += 1
        self.faulty_sentences.append(sentence)
Ejemplo n.º 6
0
    tokenize=True,  # Split punctuation marks from words?
    tags=True,  # Parse part-of-speech tags? (NN, JJ, ...)
    chunks=True,  # Parse chunks? (NP, VP, PNP, ...)
    relations=False,  # Parse chunk relations? (-SBJ, -OBJ, ...)
    lemmata=False,  # Parse lemmata? (ate => eat)
    encoding='utf-8',  # Input string encoding.
    tagset=None)  # Penn Treebank II (default) or UNIVERSAL.
# NOTE(review): Python 2 print statements throughout; `s`, `tree`, and the
# WORD/POS/CHUNK/PNP/REL/LEMMA tokens come from earlier lines of this example
# that are not shown here.
# Print each chunk of every sentence with its (word, POS-tag) pairs.
print repr(s)
for sentence in s:
    for chunk in sentence.chunks:
        print chunk.type, [(w.string, w.type) for w in chunk.words]
# Presumably rebuilds a parse tree from a pre-tagged file -- confirm against
# the pattern.text docs.
for sentence in tree(open('data/input/tagged.txt'),
                     token=[WORD, POS, CHUNK]):  # CHECK FOR ERROR
    print sentence
# text
text = Text(open('data/input/corpus.txt'),
            token=[WORD, POS, CHUNK, PNP, REL, LEMMA])
# text = Text.from_xml('data/input/multilingual-all-words.en.xml')  # Reads an XML string generated with Text.xml.
print text.string  # 'The cat sat on the mat .'
print text.sentences  # [Sentence('The cat sat on the mat .')]
print text.copy()
print text.xml
# sentence
# sentence = Sentence(open('data/input/corpus.txt'), token=[WORD, POS, CHUNK, PNP, REL, LEMMA])
# sentence = Sentence.from_xml(xml)
# NOTE(review): both Sentence constructors above are commented out, so
# `sentence` below is the leftover loop variable from the loops above.
print sentence.parent  # Sentence parent, or None.
print sentence.id  # Unique id for each sentence.
print sentence.start  # 0
print sentence.stop  # len(Sentence).
print sentence.string  # Tokenized string, without tags.
print sentence.words  # List of Word objects.
print sentence.lemmata  # List of word lemmata.
Ejemplo n.º 7
0
    def __init__(self,data):
        """Store raw text in DATA, its Pattern parse in PARSE, and the
        resulting parse tree in DOC.  Only acts when *data* is a string."""
        # Deliberately NOT lowercased: capitalization matters to the parser.
        # TODO: offer optional lowercasing.
        # NOTE(review): `basestring` is Python 2-only.
        if isinstance(data,basestring):
            self.DATA = data
            self.PARSE = parse(self.DATA)
            self.DOC = Text(self.PARSE)