Example #1
# The universal tagset maps Penn Treebank tags to a simplified set of
# cross-language constants:
# NOUN = "NN" (noun)
# VERB = "VB" (verb)
#  ADJ = "JJ" (adjective)
#  ADV = "RB" (adverb)
# PRON = "PR" (pronoun)
#  DET = "DT" (determiner)
# PREP = "PP" (preposition)
#  NUM = "NO" (number)
# CONJ = "CJ" (conjunction)
# INTJ = "UH" (interjection)
#  PRT = "PT" (particle)
# PUNC = "."  (punctuation)
#    X = "X"  (foreign word, abbreviation)

# We can combine this with the multilingual pattern.text.parse() function
# when we write code that handles many languages at once:

from pattern.text import parse, UNIVERSAL

print(parse("die schwarzen Katzen", chunks=False, language="de", tagset=UNIVERSAL))
print(parse("the black cats"      , chunks=False, language="en", tagset=UNIVERSAL))
print(parse("los gatos negros"    , chunks=False, language="es", tagset=UNIVERSAL))
print(parse("les chats noirs"     , chunks=False, language="fr", tagset=UNIVERSAL))
print(parse("i gatti neri"        , chunks=False, language="it", tagset=UNIVERSAL))
print(parse("de zwarte katten"    , chunks=False, language="nl", tagset=UNIVERSAL))
print()

# This comes at the expense of some information: in this example, the plural
# noun is lost (NNS => NN). But it may be more comfortable to build
# multilingual apps with the universal constants (e.g., PRON, PREP, CONJ)
# than to learn the Penn Treebank tagset by heart, or to wonder why the
# Italian "che" is tagged "PRP", "IN" or "CC"
# (in the universal tagset it is a PRON or a CONJ).
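
# For example, a minimal sketch (assuming pattern.text also exports the
# PRON and CONJ constants listed above) that tests tags against the
# constants instead of raw Penn strings:

from pattern.text import PRON, CONJ

for sentence in parse("il gatto che dorme", chunks=False, language="it",
                      tagset=UNIVERSAL).split():
    for token in sentence:
        word, tag = token[0], token[1]
        # "che" should map to PRON (relative pronoun) here.
        print(word, tag == PRON, tag == CONJ)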

from pattern.text import parsetree
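
# A minimal sketch of what parsetree() yields (assuming it accepts the same
# language and tagset keywords as parse() above): each Sentence holds Word
# objects with .string and .type attributes.

for sentence in parsetree("de zwarte katten", language="nl", tagset=UNIVERSAL):
    for word in sentence.words:
        print(word.string, word.type)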
Example #2
    def extract_candidates(self):
        # Module-level imports assumed here: os, sys, copy, nltk,
        # sent_tokenize (nltk.tokenize), ChunkRule and RegexpChunkParser
        # (nltk.chunk.regexp), and parse and pprint from pattern.en --
        # pprint is pattern's table printer, not the stdlib module.
        if not os.path.exists('SubjectObject.txt'):
            sentences = sent_tokenize(self.complete_processed_corpus)
            orig_stdout = sys.stdout
            f = open('SubjectObject.txt', 'a')
            sys.stdout = f
            for sent in sentences:
                datas = parse(sent, relations=True, lemmata=True)
                # pattern's pprint writes the tagged sentence as a table to
                # stdout, which is redirected into the file here.
                pprint(datas)
            sys.stdout = orig_stdout
            f.close()
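
        # Each sentence in SubjectObject.txt is one pprint table; column 0
        # holds the word and column 3 the grammatical role, e.g. (illustrative):
        #
        #      WORD   TAG    CHUNK   ROLE   ID     PNP    LEMMA
        #       the   DT     NP      SBJ    1      -      the
        #       cat   NN     NP      SBJ    1      -      cat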

        rows = []

        with open('SubjectObject.txt', 'r') as f:
            for line in f:
                line = line.replace("^", "")
                # Skip blank lines and the wide-padded filler lines of the
                # pprint tables.
                if line != '' and line.find("                    ") == -1:
                    rows.append(line.split())
            # Drop empty rows and stray "None" rows before parsing.
            rows = [row for row in rows if row and row != ['None']]
            Subjects = []
            Objects = []
            sent_dict = {}
            subterm = ''
            objterm = ''
            sentcounter = 1
            isContiguous = False
            for row in rows:
                if row[0] == "WORD":
                    # A "WORD" header row starts the next sentence's table:
                    # flush the subjects/objects collected for the previous
                    # sentence (deepcopy of an empty list is just []).
                    if Subjects or Objects:
                        sent_dict["S" + str(sentcounter)] = (
                            copy.deepcopy(Subjects), copy.deepcopy(Objects))
                        sentcounter += 1
                        Subjects = []
                        Objects = []
                    continue
                elif row[3] == "SBJ":  # ROLE column
                    if isContiguous:
                        subterm += row[0] + ' '
                    else:
                        subterm = row[0] + ' '
                        isContiguous = True
                elif row[3] == "OBJ":
                    if isContiguous:
                        objterm += row[0] + ' '
                    else:
                        objterm = row[0] + ' '
                        isContiguous = True
                else:
                    isContiguous = False
                    if objterm:
                        Objects.append(objterm.rstrip())
                    if subterm:
                        Subjects.append(subterm.rstrip())
                    subterm = ''
                    objterm = ''
            self.subject_object_dict = copy.deepcopy(sent_dict)
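
            # Side note, as a hedged sketch (not this method's approach):
            # pattern's Sentence objects expose subjects and objects
            # directly, so the SBJ/OBJ table round-trip could be done
            # in memory, e.g.:
            #
            #   from pattern.en import parsetree
            #   for s in parsetree(text, relations=True, lemmata=True):
            #       subjects = [chunk.string for chunk in s.subjects]
            #       objects = [chunk.string for chunk in s.objects]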
            # Chunk grammar: an optional adjective followed by noun,
            # participle and verb tags -- the shape of a candidate term.
            grammar = ChunkRule("<JJ>?<NN>*<NNS>?<NNP>*<VBN>*<VB>*",
                                "Feature Term")
            cp = RegexpChunkParser([grammar], chunk_label='FeatureTerm')
            sentences = sent_tokenize(self.complete_processed_corpus)
            for sent in sentences:
                fwords = nltk.word_tokenize(sent)
                tagged = nltk.pos_tag(fwords)
                tree = cp.parse(tagged)  # nltk.Tree with FeatureTerm subtrees
                featureterms = tree.subtrees(
                    filter=lambda x: x.label() == 'FeatureTerm')
                for term in featureterms:
                    featureterm = ' '.join(str(leaf[0])
                                           for leaf in term.leaves())
                    self.CandidateTerms.append(featureterm)
            dt_tags = ('DT', 'PRP', 'PRP$', 'PDT')
            for i in range(len(sent_dict)):
                # Each entry is a (subjects, objects) pair of term lists.
                for terms in sent_dict["S" + str(i + 1)]:
                    for term in terms:
                        words = term.split()
                        tagged = nltk.pos_tag(words)  # expects a token list
                        if any(tag in dt_tags for _, tag in tagged):
                            # Replace each determiner/pronoun with the
                            # previous noun, then keep the rebuilt term.
                            lastnoun = self.GetLastNoun()
                            rebuilt = ' '.join(
                                lastnoun if tag in dt_tags else word
                                for word, tag in tagged)
                            self.CandidateTerms.append(rebuilt)
                        else:
                            self.CandidateTerms.append(term)
            with open("CandidateTerms.txt", "w") as ct:
                ct.write(str(self.CandidateTerms).strip('[]'))
        self.FeatureGroups = self.processSimilarity()