# PRON = "PR" (pronoun) # DET = "DT" (determiner) # PREP = "PP" (preposition) # NUM = "NO" (number) # CONJ = "CJ" (conjunction) # INTJ = "UH" (interjection) # PRT = "PT" (particle) # PUNC = "." (punctuation) # X = "X" (foreign word, abbreviation) # We can combine this with the multilingual pattern.text.parse() function, # when we need to deal with code that handles many languages at once: from pattern.text import parse print(parse("die schwarzen Katzen", chunks=False, language="de", tagset=UNIVERSAL)) print(parse("the black cats" , chunks=False, language="en", tagset=UNIVERSAL)) print(parse("los gatos negros" , chunks=False, language="es", tagset=UNIVERSAL)) print(parse("les chats noirs" , chunks=False, language="fr", tagset=UNIVERSAL)) print(parse("i gatti neri" , chunks=False, language="it", tagset=UNIVERSAL)) print(parse("de zwarte katten" , chunks=False, language="nl", tagset=UNIVERSAL)) print() # This comes at the expense of (in this example) losing information about plural nouns (NNS => NN). # But it may be more comfortable for you to build multilingual apps # using the universal constants (e.g., PRON, PREP, CONJ), # instead of learning the Penn Treebank tagset by heart, # or wonder why the Italian "che" is tagged "PRP", "IN" or "CC" # (in the universal tagset it is a PRON or a CONJ). from pattern.text import parsetree
# PREP = "PP" (preposition) # NUM = "NO" (number) # CONJ = "CJ" (conjunction) # INTJ = "UH" (interjection) # PRT = "PT" (particle) # PUNC = "." (punctuation) # X = "X" (foreign word, abbreviation) # We can combine this with the multilingual pattern.text.parse() function, # when we need to deal with code that handles many languages at once: from pattern.text import parse print( parse("die schwarzen Katzen", chunks=False, language="de", tagset=UNIVERSAL)) print(parse("the black cats", chunks=False, language="en", tagset=UNIVERSAL)) print(parse("los gatos negros", chunks=False, language="es", tagset=UNIVERSAL)) print(parse("les chats noirs", chunks=False, language="fr", tagset=UNIVERSAL)) print(parse("i gatti neri", chunks=False, language="it", tagset=UNIVERSAL)) print(parse("de zwarte katten", chunks=False, language="nl", tagset=UNIVERSAL)) print("") # This comes at the expense of (in this example) losing information about plural nouns (NNS => NN). # But it may be more comfortable for you to build multilingual apps # using the universal constants (e.g., PRON, PREP, CONJ), # instead of learning the Penn Treebank tagset by heart, # or wonder why the Italian "che" is tagged "PRP", "IN" or "CC" # (in the universal tagset it is a PRON or a CONJ).
# Imports required by this method (module-level in the original file).
# Note: parse() and pprint() are pattern.en's; pprint() prints the
# WORD / TAG / CHUNK / ROLE / ID / PNP / LEMMA table that is parsed back below.
import copy
import os
import sys

import nltk
from nltk.tokenize import sent_tokenize
from nltk.chunk.regexp import ChunkRule, RegexpChunkParser
from pattern.en import parse, pprint


def extract_candidates(self):
    # Step 1: parse every sentence and dump pattern's tabular pprint() output
    # (one WORD/TAG/CHUNK/ROLE/... table per sentence) to a file.
    if not os.path.exists('SubjectObject.txt'):
        sentences = sent_tokenize(self.complete_processed_corpus)
        orig_stdout = sys.stdout
        with open('SubjectObject.txt', 'a') as f:
            sys.stdout = f
            for sent in sentences:
                datas = parse(sent, relations=True, lemmata=True)
                pprint(datas)  # writes the table to the redirected stdout
            sys.stdout = orig_stdout

    # Step 2: read the tables back; "^" marks chunk continuations in
    # pattern's pprint() output, so strip it before splitting into columns.
    My_Tupples = []
    with open('SubjectObject.txt', 'r') as f:
        for line in f:
            line = line.replace("^", "").strip()
            if line:
                My_Tupples.append(line.split())

    # Step 3: collect contiguous runs of SBJ/OBJ words per sentence.
    # A "WORD" header row marks the start of a new sentence's table.
    Subjects = []
    Objects = []
    sentence_dict = {}
    subterm = ''
    objterm = ''
    sentcounter = 1
    isContiguous = False
    for mylst in My_Tupples:
        if not mylst or mylst == ['None']:
            continue
        if mylst[0] == "WORD":
            # New table: store any pending term, then file the finished
            # sentence's subjects/objects under a key like "S1", "S2", ...
            if objterm:
                Objects.append(objterm.rstrip())
            if subterm:
                Subjects.append(subterm.rstrip())
            subterm = objterm = ''
            isContiguous = False
            if Subjects or Objects:
                sentence_dict["S" + str(sentcounter)] = (
                    copy.deepcopy(Subjects), copy.deepcopy(Objects))
                sentcounter += 1
                Subjects, Objects = [], []
            continue
        if len(mylst) > 3 and mylst[3] == "SBJ":
            subterm = (subterm if isContiguous else '') + mylst[0] + ' '
            isContiguous = True
        elif len(mylst) > 3 and mylst[3] == "OBJ":
            objterm = (objterm if isContiguous else '') + mylst[0] + ' '
            isContiguous = True
        else:
            # A run of role-tagged words ended: store the collected terms.
            isContiguous = False
            if objterm:
                Objects.append(objterm.rstrip())
            if subterm:
                Subjects.append(subterm.rstrip())
            subterm = ''
            objterm = ''
    # Flush the last sentence (there is no trailing "WORD" header after it).
    if objterm:
        Objects.append(objterm.rstrip())
    if subterm:
        Subjects.append(subterm.rstrip())
    if Subjects or Objects:
        sentence_dict["S" + str(sentcounter)] = (
            copy.deepcopy(Subjects), copy.deepcopy(Objects))
    self.subject_object_dict = copy.deepcopy(sentence_dict)

    # Step 4: chunk adjective/noun/verb sequences as candidate feature terms.
    # (The original all-optional pattern <JJ>?<NN>*<NNS>?<NNP>*<VBN>*<VB>*
    # can match an empty span; require at least one noun to keep chunks
    # non-empty.)
    grammar = ChunkRule("<JJ>?<NN.*>+<VBN>*<VB>*", "Feature Term")
    cp = RegexpChunkParser([grammar], chunk_label='FeatureTerm')
    for sent in sent_tokenize(self.complete_processed_corpus):
        tagged = nltk.pos_tag(nltk.word_tokenize(sent))
        chunk_tree = cp.parse(tagged)
        for term in chunk_tree.subtrees(filter=lambda x: x.label() == 'FeatureTerm'):
            featureterm = ' '.join(str(leaf[0]) for leaf in term.leaves()).strip()
            self.CandidateTerms.append(featureterm)

    # Step 5: add the subject/object phrases as candidate terms. If a subject
    # contains a determiner or pronoun (DT/PRP/PRP$/PDT), substitute the most
    # recently seen noun for it (a simple form of anaphora resolution).
    for i in range(len(sentence_dict)):
        subjects, objects = sentence_dict["S" + str(i + 1)]
        for phrase in subjects:
            tagged = nltk.pos_tag(nltk.word_tokenize(phrase))
            if any(tag in ('DT', 'PRP', 'PRP$', 'PDT') for _, tag in tagged):
                lastnoun = self.GetLastNoun()
                resolved = ' '.join(
                    lastnoun if tag in ('DT', 'PRP', 'PRP$', 'PDT') else word
                    for word, tag in tagged)
                self.CandidateTerms.append(resolved)
            else:
                self.CandidateTerms.append(phrase)
        self.CandidateTerms.extend(objects)

    with open("CandidateTerms.txt", "w") as ct:
        ct.write(str(self.CandidateTerms).strip('[]'))
    self.FeatureGroups = self.processSimilarity()
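# A self-contained sketch of the chunking step above (not from the original
# class), runnable with just NLTK after nltk.download('punkt') and
# nltk.download('averaged_perceptron_tagger'). It uses the same tightened
# grammar requiring at least one noun:

import nltk
from nltk.chunk.regexp import ChunkRule, RegexpChunkParser

rule = ChunkRule("<JJ>?<NN.*>+<VBN>*<VB>*", "Feature Term")
chunker = RegexpChunkParser([rule], chunk_label='FeatureTerm')

tagged = nltk.pos_tag(nltk.word_tokenize("The battery life of this camera is excellent."))
tree = chunker.parse(tagged)
for subtree in tree.subtrees(filter=lambda t: t.label() == 'FeatureTerm'):
    print(' '.join(word for word, tag in subtree.leaves()))
# Expected with NLTK's default tagger: "battery life" and "camera".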