Example #1
 def test_parse(self):
     # Assert parsed output with Penn Treebank II tags (slash-formatted).
     # 1) "der große Hund" is a noun phrase, "auf der Matte" is a prepositional noun phrase.
     v = de.parser.parse(u"Der große Hund sitzt auf der Matte.")
     self.assertEqual(v,
         u"Der/DT/B-NP/O große/JJ/I-NP/O Hund/NN/I-NP/O " + \
         u"sitzt/VB/B-VP/O " + \
         u"auf/IN/B-PP/B-PNP der/DT/B-NP/I-PNP Matte/NN/I-NP/I-PNP ././O/O"
     )
     # 2) "große" and "sitzt" lemmata are "groß" and "sitzen".
     # Note how articles are problematic ("der" can be masculine nominative but also plural genitive).
     v = de.parser.parse(u"Der große Hund sitzt auf der Matte.",
                         lemmata=True)
     self.assertEqual(v,
         u"Der/DT/B-NP/O/der große/JJ/I-NP/O/groß Hund/NN/I-NP/O/hund " + \
         u"sitzt/VB/B-VP/O/sitzen " + \
         u"auf/IN/B-PP/B-PNP/auf der/DT/B-NP/I-PNP/der Matte/NN/I-NP/I-PNP/matte ././O/O/."
     )
     # 3) Assert the accuracy of the German tagger.
     i, n = 0, 0  # correct tag count, total tag count
     for sentence in open(
             os.path.join(PATH, "corpora",
                          "tagged-de-tiger.txt")).readlines():
         sentence = sentence.decode("utf-8").strip()
         s1 = [w.split("/") for w in sentence.split(" ")]
         s1 = [de.stts2penntreebank(w, pos) for w, pos in s1]
         s2 = [[w for w, pos in s1]]
         s2 = de.parse(s2, tokenize=False)
         s2 = [w.split("/") for w in s2.split(" ")]
         for j in range(len(s1)):
             if s1[j][1] == s2[j][1]:
                 i += 1
             n += 1
     self.assertTrue(float(i) / n > 0.844)
     print("pattern.de.parse()")
Example #2
def process_text(text):
    annotated = []
    parsed_text = parse(text, lemmata=True)
    doc = parsed_text.split(" ")
    for token in doc:
        parts = token.split("/")  # word/POS/chunk/PNP/lemma
        pos_tag = parts[1]
        lemma = parts[4]
        if pos_tag == ".":
            continue
        current_token = parts[0]

        if current_token not in spell:
            current_token = spell.correction(current_token)

        if pos_tag[0] == "N":
            current_token = current_token[0].upper()+current_token[1:]
        else:
            current_token = current_token[0].lower()+current_token[1:]

        annotated.append(current_token)

        synonyms = get_synonyms(current_token)
        for synonym in synonyms:
            annotated.append(synonym)

        if lemma.lower() != current_token.lower():
            annotated.append(lemma)

    text = ' '.join(annotated)
    return text
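
The snippet relies on two external helpers that are not shown: spell (whose "in" and correction() usage matches the interface of pyspellchecker's SpellChecker) and get_synonyms. A hypothetical minimal stand-in so the function can run, with get_synonyms stubbed out:

from spellchecker import SpellChecker  # assumption: pyspellchecker provides the spell object

spell = SpellChecker(language='de')

def get_synonyms(word):
    # placeholder: the snippet's actual synonym source is not shown
    return []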
Example #4
 def extract_lang_features(self, utterance, embeddings, word_id):
     shape = (64,)
     word_embeddings = np.zeros(shape)
     token_appearance = dict()
     if '@' in utterance:
         utterance = self.delete_username(utterance)
     if self.has_link(utterance):
         utterance = self.delete_link(utterance)
     utterance = self.delete_non_alphabetic_symbols(utterance)
     sentences = parse(utterance, relations=True, lemmata=True).split()
     # tokens = utterance.split(' ')
     token_number = 1  # start at 1 so the division below can never be by zero
     for sentence in sentences:
         token_number += len(sentence)
         for token in sentence:
             if token[5] in token_appearance:
                 token_appearance[token[5]] += 1
             else:
                 token_appearance[token[5]] = 1
             embedding = words2vec.find_word_embeddings(token[0], embeddings, word_id)
             if embedding is not None:
                 word_embeddings = np.add(word_embeddings, embedding)
                 # embedding = np.zeros(64)
             # embeddings_list = np.append(embeddings_list, embedding, axis=0)
             # embeddings_list.append(embedding)
     if token_number > 1:
         token_number = token_number - 1  # remove the initial offset again
     word_embeddings = np.divide(word_embeddings, token_number)
     return word_embeddings, token_appearance
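
The loop above accumulates word vectors and divides by the token count, i.e. mean pooling. A minimal numpy sketch of the same operation on made-up 64-dimensional vectors:

import numpy as np

vectors = [np.ones(64), 3 * np.ones(64)]        # hypothetical word embeddings
pooled = np.add.reduce(vectors) / len(vectors)  # same result as the running np.add / np.divide
assert np.allclose(pooled, 2 * np.ones(64))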
Example #5
def pos_clean(x):
    from pattern.de import parse
    s = parse(x, chunks=True, tagset="STTS", relations=True,
              lemmata=True).split()[0]
    sen = []
    for i in s:
        # keep nouns, adjectives, foreign material, and preposition+article contractions
        if i[1] in ('NN', 'ADJA', 'FM', 'ADJD', 'APPRART'):
            sen.append(i[0])
    return ' '.join(sen)
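
A usage sketch: on the sentence from Example #1, pos_clean should keep only the content words (roughly "große Hund Matte"), since determiners and verbs carry none of the listed STTS tags:

print pos_clean(u"Der große Hund sitzt auf der Matte.")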
Example #6
def test():
    platforms = load_platforms()
    for party, sections in platforms.items():
        for section in sections:
            tagged = split(parse(section.text))
            for sentence in tagged:
                #if not sentence.is_question:
                #    continue
                try:
#                    for word in sentence.words:
#                        print word.tags #dir(word)
                    #print [sentence.subjects, sentence.verbs]
                    #print [sentence.is_question]
                    #print [sentence.words]
                    print [sentence.text]
                except UnicodeEncodeError:
                    pass
Example #7
def _getParse(word, language):
    import pattern.en as pattern_en  # @UnresolvedImport
    import pattern.es as pattern_es  # @UnresolvedImport
    import pattern.fr as pattern_fr  # @UnresolvedImport
    import pattern.de as pattern_de  # @UnresolvedImport
    import pattern.it as pattern_it  # @UnresolvedImport

    if language == "es":
        return pattern_es.parse(word)
    elif language == "en":
        return pattern_en.parse(word)
    elif language == "it":
        return pattern_it.parse(word)
    elif language == "fr":
        return pattern_fr.parse(word)
    elif language == "de":
        return pattern_de.parse(word)
    else:
        return pattern_en.parse(word)
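
The if/elif ladder can be collapsed into a dict lookup with pattern_en as the fallback; a behavior-equivalent sketch under the same imports:

def _getParse(word, language):
    import pattern.en as pattern_en  # @UnresolvedImport
    import pattern.es as pattern_es  # @UnresolvedImport
    import pattern.fr as pattern_fr  # @UnresolvedImport
    import pattern.de as pattern_de  # @UnresolvedImport
    import pattern.it as pattern_it  # @UnresolvedImport

    parsers = {"es": pattern_es, "en": pattern_en, "it": pattern_it,
               "fr": pattern_fr, "de": pattern_de}
    # unknown languages fall back to English, as in the original else branch
    return parsers.get(language, pattern_en).parse(word)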
Example #8
 def is_first_verb(utterance):
     is_verb = False
     is_imperativ = False
     if '@' in utterance:
         utterance = Feature.delete_username(utterance)
     utterance = Feature.delete_conjuction(utterance)
     sentences = parse(utterance, relations=True, lemmata=True, tagset='STTS').split()
     pos_list = [ 'VVFIN','VAFIN', 'VVINF', 'VAINF', 'VVIZU', 'VVIMP', 'VAIMP', 'VVPP', 'VAPP']
     pos_imp = ['VVIMP', 'VAIMP']
     # a = mood(utterance)
     # print a
     if len(sentences) != 0:
         if len(sentences[0]) != 0:
             pos = sentences[0][0][1]
             if pos in pos_list:
                 is_verb = True
                 if pos in pos_imp:
                     is_imperativ = True
                 return is_verb, is_imperativ
     return is_verb, is_imperativ
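
A usage sketch, assuming is_first_verb is a staticmethod on the surrounding Feature class (the Feature.* calls inside suggest so). For an utterance opening with an imperative, the tagger should return VVIMP as the first STTS tag, flagging both values True:

is_verb, is_imperativ = Feature.is_first_verb(u"Geh nach Hause!")
print is_verb, is_imperativ  # expected: True True, if "Geh" is tagged VVIMP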
Example #9
def main():
    args = parse_args()
    words = []
    print("Loading from {}".format(args.input))
    with io.open(args.input, encoding='utf8') as f:
        acceptable_characters = string.letters + string.digits + " äüö"
        for line in f.readlines():
            if line.strip() == "suggestterm":
                continue
            word = filter(lambda c: c in acceptable_characters, line).strip()
            if len(word) > 0 and not any(c.isdigit() for c in word):
                words.append(word)

    print("Parsing {} words".format(len(words)))
    parsed_words = []
    for f in words:
        parsed = parse(f,
                       tags=False,
                       chunks=False,
                       relations=False,
                       lemmata=True)
        parsedlist = u" ".join(
            [word.split("/")[2] for word in parsed.split(" ")])
        parsed_words.append(parsedlist)

    print("Saving {} words to {}".format(len(parsed_words), args.output))
    with open(args.output, "w") as f:
        for word in parsed_words:
            print(clean_umlauts(word), file=f)

    single_word_entries = list(
        {clean_umlauts(f.strip())
         for f in parsed_words if " " not in f})
    single_word_entries.sort()
    print("Saving {} words to {}".format(len(single_word_entries),
                                         args.singleoutput))
    with open(args.singleoutput, "w") as f:
        for word in single_word_entries:
            print(word, file=f)
Example #10
#!/usr/bin/env python2
# coding: utf-8

import sys
sys.path.insert(0, '/zen/tez/pattern-2.6/build/lib')
from pattern.de import parse

parse_sent = lambda sent: parse(
    sent, tokenize=True, tags=True, chunks=True, relations=True, lemmata=True)

if __name__ == '__main__':
    while True:
        line = sys.stdin.readline()
        if not line:  # EOF
            sys.exit(0)
        sent = line.strip().decode('utf-8')
        if sent == '000':  # sentinel terminates the loop
            sys.exit(0)
        ps = parse_sent(sent).encode('utf-8') + "\n"
        sys.stdout.write(ps)
Example #11
# -*- coding: utf-8 -*-

from pattern.de import parse, split, pprint, tag
# from pprint import pprint

# s = parse('Die Katze liegt auf der Matte.')
# for sentence in split(s):
#     for word in sentence:
#         print(word)
#     pprint(sentence)

pprint(
    parse('Die Katze liegt auf der Matte mit weniger als 10%.',
          tags=True,
          chunks=True,
          relations=True,
          lemmata=True,
          encoding='utf-8',
          tagset="STTS"))

for word, pos in tag('Die Katze liegt auf der Matte mit weniger als 10%.',
                     tagset="STTS"):
    if pos == "ARTDEF" or pos == "NN":
        print word + '\t' + pos
Example #12
import pdb
import sys

from pattern.de import parse, parsetree, split
"""
import pattern.de
pattern.de.verbs - 1962 verbs. 
pattern.de.tenses 
pattern.de.tenses('erblicken')
pattern.de.conjugate.__doc__


"""
"""
lst=parse(raw)
(Pdb) split(lst)[0]
Sentence('Stehen/VB/B-VP/O bleiben/VB/I-VP/O !/./O/O')
"""
raw = u"Stehen bleiben !"  # hypothetical sample; the original script defines raw elsewhere
lst = parse(raw)
for sent in split(lst):
    print "sent.string: ", sent.string

pdb.set_trace()

sys.exit(2)
s = parsetree(raw)
print "sentences now"
for sentence in s:
    print "sentence: ", sentence

    for chunk in sentence.chunks:
        print "sentence: ", " ".join([w.string for w in chunk.words])
        #print "\tchunk type: ", chunk.type, [(w.string, w.type) for w in chunk.words]
        pdb.set_trace()
Example #14
def evaluate():
    global graphs, sentences, pageNumbers, probs
    # remove old evaluation
    del sentences[:]
    del graphs[:]
    del tagged[:]
    del tagSentences[:]
    del probs[:]
    # read from GUI
    fname = tDocument.get("1.0", END).rstrip('\n')
    # read text from file
    with open(fname, 'r') as f:
        text = f.read()
    #f = codecs.open(fname, encoding='utf-8')
    #text = f.read()
    size = len(text)
    print(("Text with " + str(size) + " characters loaded!"))
    # mark form feeds (page breaks) with <newpage> markers
    text = text.replace('\x0c', '\n<newpage>\n\n')
    # mark pagenumbers
    text = re.sub(r'\n([0-9]+)\n+<newpage>', r'<pagebreak>\1<pagebreak>', text)
    #print(text)
    # replace whitespaces by spaces
    text = " ".join(text.split())
    # replace abbr.
    text = text.replace('eg.', 'eg')  # TODO: Problem here!
    text = text.replace('Dr.', 'Dr')
    text = text.replace('Prof.', 'Prof')
    text = text.replace('bzw.', 'bzw')
    text = text.replace('Vgl.', 'vgl')
    text = text.replace('vgl.', 'vgl')
    text = text.replace('etc.', 'etc')
    text = text.replace('Abb.', 'Abbildung')
    text = text.replace('z. B.', 'zum Beispiel')
    text = text.replace('ca.', 'cirka')
    text = text.replace('Nr.', 'Nr')
    text = text.replace('nr.', 'nr')
    text = text.replace('Bg.', 'Bg')
    text = text.replace('al.', 'al')
    text = text.replace('europ.', 'europ')
    text = re.sub(" [a-zA-Z]\.", "", text)
    middle_abbr = re.compile('[A-Za-z0-9]\.[A-Za-z0-9]\.')  # middle abbreviation, e.g. "z.B."
    a = middle_abbr.search(text)  # find the first such abbreviation
    if a:  # search returns None when the text has no middle abbreviation
        c = re.sub('\.', '', a.group(0))  # remove periods from the abbreviation
        text = middle_abbr.sub(c, text)  # substitute the new abbreviation for the old
    # extract sentences
    pat = re.compile(r'([A-Z][^\.!?]*[\.!?])', re.M)
    sentences = pat.findall(text)
    # TODO: remove all very short sentences
    sentences = [elem for elem in sentences if not (calcWords(elem) < 4)]
    # remove all sentences that contain a number
    sentences = [elem for elem in sentences if not (hasNumbers(elem))]
    # remove sentences with URL
    sentences = [elem for elem in sentences if not (hasURL(elem))]
    # remove sentences with Noise
    sentences = [elem for elem in sentences if not (hasNoise(elem))]
    # add pagenumbers to sentences
    subset = text
    del pageNumbers[:]
    for sentence in sentences:
        start = subset.find(sentence)
        subset = subset[start:]
        m = re.search("<pagebreak>[0-9]+<pagebreak>", subset)
        # guard: a sentence past the last page marker would otherwise crash on None
        num = m.group().replace("<pagebreak>", "") if m else ""
        pageNumbers.append(num)
    # print all sentences
    for i in range(0, len(sentences)):
        print(("Sentence Nr. " + str(i) + ", Page Nr. " + str(pageNumbers[i]) +
               ": " + sentences[i] + "\n"))
    # create pos
    for sentence in sentences:
        tagged.append(parse(sentence))
    # print all tagged sentences
    for i in range(0, len(sentences)):
        print(("Tagged Sentence Nr. " + str(i) + ", Page Nr. " +
               str(pageNumbers[i]) + ": " + tagged[i] + "\n"))
    # cut words, keep tags, simplify them
    for sentence in tagged:
        tags = []
        for token in sentence.split(' '):
            fragment = token.split('/')[1]
            # simplify tags
            fragment = fragment.replace(".", ".X")
            fragment = fragment.replace(",", ",X")
            fragment = fragment.replace(":", ":X")
            fragment = fragment.replace("(", "(X")
            fragment = fragment.replace(")", ")X")
            fragment = fragment[0:2]
            tags.append(fragment)
        tagSentences.append(tags)
    # print tags only
    for i in range(0, len(sentences)):
        print(("Sentence Nr. " + str(i) + ", Page Nr. " + str(pageNumbers[i]) +
               ": " + str(tagSentences[i]) + "\n"))
    print("Processing done")
    # create transitions for each sentence
    for sentence in tagSentences:
        print(sentence)
        print((sentence[0]))
        # first transition
        trans = []
        trans.append("ST" + str(sentence[0]))  # mark start
        for i in range(1, len(sentence) - 1):
            trans.append(str(sentence[i]) + str(sentence[i + 1]))
        # last transition
        trans.append(str(sentence[-1]) + "EN")  # mark end
        #trans = []
        #for i in range(0, len(sentence)):
        #trans.append(str(i) + sentence[i])
        transitions.append(trans)
    print(transitions)
    # Get total probabilities
    allTrans = []
    total = 0
    for trans in transitions:
        for x in trans:
            allTrans.append(x)
            total += 1
    counts = Counter(allTrans)
    print(counts)
    # get first part of transition probabilities (for cond. prob.)
    condTrans = []
    for x in allTrans:
        condTrans.append(x[0:2])
    countsCond = Counter(condTrans)
    print(countsCond)
    # Get average probability of transitions in a sentence
    for trans in transitions:
        size = len(trans)
        score = 1
        #for x in trans:
        for x in range(0, 1):  # len(trans)):
            pTrans = counts[trans[x]]  # count of transition
            pFirst = countsCond[trans[x][0:2]]  # count of first part of trans
            #print(pTrans, pFirst)
            score *= pTrans / float(pFirst)
        probs.append(score)  # / float(size))
    print(probs)
Example #15
def parse_text(text):
    """ takes german text, 1 or more sentences and applies part of speech information
    """
    # STTS works better than standard tagset. The target words are NN and NE
    return parse(text, tagset="STTS")
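
A usage sketch pulling out the target NN/NE tokens mentioned above, via pattern.de's tag() helper (it accepts the same tagset argument, as Example #11 shows):

from pattern.de import tag

for word, pos in tag(u"Angela Merkel besuchte gestern Berlin.", tagset="STTS"):
    if pos in ("NN", "NE"):
        print word  # proper nouns (NE) and common nouns (NN)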
Example #16
            continue

# F. Get the text, clean leading chevrons, and print the line
        try:
            text = re.sub('^[>,\ ]{0,6}', '', field[3])
        except IndexError:
            print line.encode('utf8')
            continue
        print line.encode('utf8'),
        snt = ""

        # G. Clean ups
        text = re.sub('Mind\.', 'Mindestens', text)

        # H. Pattern 2.6 parts of speech -- split the text if needed
        try:
            pos = parse(text, lemmata=True, relations=True, encoding='utf-8')
            for pos in pos.splitlines():
                pos = re.sub('\ ', '|', pos)
                print u"".join([field[0], "|", field[1], "|POS_03|",
                                pos]).encode('utf-8').strip()
        except (UnicodeDecodeError, UnicodeEncodeError, IndexError,
                AssertionError):
            # Tag failed lines NA to enable repair
            print "".join([field[0], "|", field[1], "|POS_03", "|NA"])

# I. Close the file
fp.close()

# EOF
Example #17
            raise

# tt_en = TreeTagger(encoding='utf-8', language='english')
# pprint(tt_en.tag('Does this thing work?'))

tagger = ttw.TreeTagger(TAGLANG='de', TAGDIR='/home/niklas/treetagger/')
# satz = u'Dies ist ein Testsatz.'
# print type(satz)
# satzu = satz.decode('utf-8')
# tags = tagger.tag_text(satz)
# pprint.pprint(tags)

datei = open('196.txt', 'r')
dat = datei.read()

s = parse(dat, tagset='STTS')
s = split(s)
print s.sentences[0]
print predicative('neugierige')

with open('196.txt', 'r') as openfile:
    for line in openfile:
        nltk.tag.brill.BrillTagger(line)

# datu = dat.decode('utf-8')
# print tagger.tag_text(dat)
# print datu
# tags = tagger.TagText(datu)
# # for tag in tags:
# #     print tag
datei.close()