def extract_feature_using_senna(line):
    '''
    Annotate an English sentence with SENNA and build a per-word record.

    :param line: an english sentence
    :return: dict mapping 1-based word index -> {wid, pid, word, chk, ner, pos},
             plus top-level keys 'syntax_tree', 'tree', 'verbs', 'srl';
             returns -2 when SENNA produces no dependency parse.
    '''
    annotator = Annotator()
    feature = annotator.getAnnotations(line, dep_parse=True)
    # No dependency parse means the sentence could not be processed.
    if feature['dep_parse'] == '':
        return -2
    a = feature['dep_parse'].split('\n')
    words_data = {}
    d = []  # collected (word_id, parent_id) pairs from the dependency lines
    for i in a:
        # Strip the relation name and closing paren: "rel(a-1, b-2)" -> "a-1, b-2"
        dep = re.sub(r'^[^\(]+\(|\)$', '', i)
        try:
            p, c = dep.split(', ')
        except ValueError:
            # NOTE(review): on a malformed line p/c keep their previous
            # values and the stale pair is re-processed below — confirm.
            pass
        try:
            # ids are the trailing "-N" suffix of each token
            t1 = p.split('-')
            pid = int(t1[len(t1) - 1])
            t2 = c.split('-')
            wid = int(t2[len(t2) - 1])
        except ValueError:
            # Stanford-style copied nodes carry a trailing apostrophe ("12'");
            # encode them as N + 0.1 so they stay distinct from node N.
            if re.match('[\d]+\'', t1[len(t1) - 1]):
                pid = int(re.sub(r'\'', '', t1[len(t1) - 1])) + 0.1
                t2 = c.split('-')
                wid = int(t2[len(t2) - 1])
            elif re.match('[\d]+\'', t2[len(t2) - 1]):
                pass
            # NOTE(review): this continue also skips appending the pair
            # computed in the first branch above — confirm intended.
            continue
        d.append((wid, pid))
    t1 = [id for id in d]
    # remove_dep / add_ids are project helpers defined elsewhere.
    d, _ = remove_dep(t1)
    for wid, pid in d:
        add_ids(words_data, wid, pid)
    # Merge SENNA's token-level layers into the per-word records;
    # words with no dependency edge get pid = -1.
    for i in range(len(feature['words'])):
        if i + 1 not in words_data:
            words_data[i + 1] = {
                'wid': i + 1,
                'pid': -1,
                'word': feature['words'][i],
                'chk': feature['chunk'][i][1],
                'ner': feature['ner'][i][1],
                'pos': feature['pos'][i][1]
            }
        elif i + 1 in words_data:
            words_data[i + 1]['word'] = feature['words'][i]
            words_data[i + 1]['chk'] = feature['chunk'][i][1]
            words_data[i + 1]['ner'] = feature['ner'][i][1]
            words_data[i + 1]['pos'] = feature['pos'][i][1]
    words_data['syntax_tree'] = feature['syntax_tree']
    words_data['tree'] = feature['tree']
    words_data['verbs'] = feature['verbs']
    words_data['srl'] = feature['srl']
    # Global.accepted += 1
    return words_data
def test_tree4(): annotator = Annotator() sent = "There are people dying make this world a better place for you and for me." sent = "Biplab is a good boy." sent = "He created the robot and broke it after making it." sent = "Bachelor 's degree in computer science , design or related field." sent = "B.S. in Computer Science , a related degree or its equivalent" sent = "BS , MS , or PhD in Computer Science or a similar field preferred" sent = "Computer Science or related technical degree from an accredited four year university " sent = "Degree in Computer Science or Engineering with a high GPA ." sent = "A Master's degree in Computer Science or Engineering is mandatory ." sent = "A Computer Science or related degree " sent = "I love science and SciFi book" sent = "I love music and SciFi book" result = annotator.getAnnotations(sent) tree_str = result['syntax_tree'] print print tree_str tree = Tree.fromstring(tree_str)[0] print print "Root label=", tree.label() tree.draw()
def getword2vec(self, raw_text): sentences = nltk.sent_tokenize(raw_text) annotator = Annotator() counter = 0 doc_vec = [] for sentence in sentences: # get semantic role labelling data for each sentence srl = list(annotator.getAnnotations(sentence)['srl']) word2vec = [] # get the event structure for each sentence for s in srl: if 'V' in s: # print s['V'] word2vec = self.getvector(s['V']) # print word2vec else: print 'No verb found in sentence' return if 'A0' in s: # print s['A0'] word2vec = self.addVectors(word2vec, self.getvector(s['A0'])) if 'A1' in s: # print s['A1'] word2vec = self.addVectors(word2vec, self.getvector(s['A1'])) if counter == 0: doc_vec = word2vec else: doc_vec = self.addVectors(doc_vec, word2vec) counter = counter + 1
def get_shortest_path(a, b):
    """Print dependency-graph shortest-path lengths between all word pairs.

    Concatenates *a* and *b*, dependency-parses the result, builds an
    undirected graph of "word-pos" nodes, then prints the shortest path
    length from every word of *a* to every word of *b*.

    Fix: parser output contains blank/non-matching lines; `pattern.search`
    returns None for those and the original crashed on `m.group(...)` —
    such lines are now skipped.
    """
    text = a + b  # NOTE(review): no separator inserted — confirm inputs end with whitespace
    annotator = Annotator()
    dep_parse = annotator.getAnnotations(text, dep_parse=True)['dep_parse']
    dp_list = dep_parse.split('\n')
    pattern = re.compile(r'.+?\((.+?), (.+?)\)')
    edges = []
    for dep in dp_list:
        m = pattern.search(dep)
        if m is None:
            continue  # blank or malformed parser line
        edges.append((m.group(1), m.group(2)))
    graph = nx.Graph(edges)
    shortest_paths = []
    a = a.strip()
    b = b.strip()
    a = a.split()
    b = b.split()
    for i in a:
        for j in b:
            shortest_paths.append(nx.shortest_path_length(graph, source=i, target=j))
    print(shortest_paths)
def test_tree4(): annotator=Annotator() sent = "There are people dying make this world a better place for you and for me." sent = "Biplab is a good boy." sent = "He created the robot and broke it after making it." sent = "Bachelor 's degree in computer science , design or related field." sent = "B.S. in Computer Science , a related degree or its equivalent" sent = "BS , MS , or PhD in Computer Science or a similar field preferred" sent = "Computer Science or related technical degree from an accredited four year university " sent = "Degree in Computer Science or Engineering with a high GPA ." sent = "A Master's degree in Computer Science or Engineering is mandatory ." sent = "A Computer Science or related degree " sent = "I love science and SciFi book" sent = "I love music and SciFi book" result = annotator.getAnnotations(sent) tree_str = result['syntax_tree'] print print tree_str tree = Tree.fromstring(tree_str)[0] print print "Root label=",tree.label() tree.draw()
def compute_POS(line):
    """Return the POS tag sequence (tags only) for one sentence."""
    tagged = Annotator().getAnnotations(line)['pos']
    # each entry is a (word, tag) pair; keep only the tag
    return [entry[1] for entry in tagged]
def test1(): annotator=Annotator() sent = "There are people dying make this world a better place for you and for me." result = annotator.getAnnotations(sent) #print result print type (result) print result.keys()
def test_deep(): annotator=Annotator() sent = "There are people dying make this world a better place for you and for me." sent = "Biplab is a good boy." sent = "He created the robot and broke it after making it." result = annotator.getAnnotations(sent,dep_parse=True) print result['dep_parse']
def srl(self, sen_dest):
    """Return the SENNA syntax tree for *sen_dest*.

    Commands starting with "UP" are rewritten to start with "RAISE"
    before parsing.

    Fix: the original did `sen_dest = str(v)`, which turns the token list
    into its repr ("['RAISE', 'X', ...]") and corrupts the parser input;
    the tokens are now joined back into a real sentence.
    """
    ant = Annotator()
    if sen_dest.upper().split()[0] == "UP":
        v = sen_dest.upper().split()
        v[0] = "RAISE"
        sen_dest = " ".join(v)
    # NOTE(review): despite the function name this returns the syntax
    # tree, not the 'srl' frames — confirm callers expect that.
    sen_srl = ant.getAnnotations(sen_dest)['syntax_tree']
    return sen_srl
def get_annotations(question):
    """Annotate *question* with SENNA and return (srl, verbs, ner, chunk)."""
    result = Annotator().getAnnotations(question)
    return (result['srl'], result['verbs'], result['ner'], result['chunk'])
def test1(): annotator = Annotator() sent = "There are people dying make this world a better place for you and for me." result = annotator.getAnnotations(sent) #print result print type(result) print result.keys()
def annotation(self, n):
    """Annotate the n-th parsed heading; best-effort, returns None on failure.

    Fix: the bare `except:` also swallowed KeyboardInterrupt/SystemExit;
    it is narrowed to `Exception` while keeping the deliberate
    best-effort behaviour (bad index or annotator failure -> None).
    """
    parsed_heading = self.get_parsed_heading()
    annotator = Annotator()
    try:
        annotation = annotator.getAnnotations(parsed_heading[n])
        return annotation
    except Exception:
        return None
def test_deep(): annotator = Annotator() sent = "There are people dying make this world a better place for you and for me." sent = "Biplab is a good boy." sent = "He created the robot and broke it after making it." result = annotator.getAnnotations(sent, dep_parse=True) print result['dep_parse']
def semantic_role_label():
    """Extract attack tools and targets from a report via SRL.

    Reads a text file (path in sys.argv[1]), splits it into rough
    sentences on '.', runs SENNA SRL on each, and for frames whose verb
    lemma is in `verbs_target` collects A0 as a tool and A1 as a target
    (for `verbs_tool` verbs only A0, as a tool).  Results are printed
    joined by ":-----:".  getTools/getTargets are project helpers.
    """
    lemmatizer = WordNetLemmatizer()
    # verbs whose frames yield both a tool (A0) and a target (A1)
    verbs_target = ["ensnare", "infect", "use", "target"]
    # verbs whose frames yield only a tool (A0)
    verbs_tool = ["attack"]
    # sent = "Mirai-Based Masuta Botnet Weaponizes Old Router Vulnerability. By Ionut Arghire on January 24, 2018. inShare. A new piece of malware based on Mirai's publicly released source code has been observed at large, ensnaring devices into a botnet, targeted Internet of Things. Known as Masuta, the botnet has at least two variants at large, and is believed to be the work of a well-known IoT threat actor, NewSky Security says. What?s also unique to the botnet is that it exploits an old router vulnerability, being the first threat known to weaponize it in a botnet campaign. Masuta (Japanese for master) botnet?s source code was found on an invite only dark forum. The malware?s configuration file, the researchers discovered, uses a different seed of the cipher key compared to Mirai, having the strings in the configuration files XORed by 0x45. Thus, the researchers discovered that it uses the domain nexusiotsolutions(dot)net, the command and control (C&C) server that Nexus Zeta, the individual involved in the recent Satori attacks, uses. The domain was registered using the nexuszeta1337@gmail(.)com email address. Thus, NewSky Security suggests that Nexus Zeta has been involved in the creation of the Masuta botnet, in addition to building Satori, the Mirai variant that has been wreaking havoc over the past couple of months. In fact, Masuta isn?t new either, and attacks involving it have been steadily increasing since September, and the botnet?s standard variant has been observed using several known/weak/default credentials to compromise IoT devices. An evolved variant of Masuta, called PureMasuta, contains the most typical of Mirai style code, and a list of weak credentials to use. What makes this malware variant stand out, however, is its usage of EDB 38722 D-Link exploit. The exploit PureMasuta uses resides in the HNAP (Home Network Administration Protocol), which is based on the SOAP protocol. It is possible to craft a SOAP query to bypass authentication by using hxxp://purenetworks.com/HNAP1/GetDeviceSettings, and improper string handling can lead to arbitrary code execution, and an attacker can abuse this combination of issues to run code on targeted devices. What the botnet does is to download a shell script from the C&C server and run it. Thus, the malware author first bypasses authentication and then executes code on the targeted devices. The PureMasuta variant uses the same C&C server (93.174.93.63) as the original Masuta variant, which led the researchers to believe it is the evolved creation of the same threat actor. Nexus Zeta is no stranger when it comes to implementing SOAP related exploits. The threat actor has already been observed in implementing two other known SOAP related exploits, CVE-2014-8361 and CVE-2017-17215 in his Satori botnet project, NewSky Security notes. Thus, the TR-069 bug and EDB 38722 are the third and fourth SOAP related exploits abused by IoT botnets. Protocol exploits are more desirable for threat actors as they usually have a wider scope. A protocol can be implemented by various vendors/models and a bug in the protocol itself can get carried on to a wider range of devices, the researchers conclude."
    # sent = "Mirai, the infamous botnet used in the recent massive distributed denial of service (DDoS) attacks against Brian Krebs' blog and Dyn's DNS infrastructure, has ensnared Internet of Things (IoT) devices in 164 countries, researchers say."
    if len(sys.argv) != 2:
        print("NOPE")
        exit()
    fh = open(sys.argv[1], "r")
    sent = fh.read()
    # sent = sys.argv[1]
    target = ""
    tools = ""
    # crude sentence split on '.' — each piece is annotated separately
    for s in [i.strip() for i in sent.split(".")]:
        a = Annotator()
        b = a.getAnnotations(s.encode('utf-8'))
        dictlist = b['srl']
        # NOTE: `dict` shadows the builtin here (kept as in original)
        for dict in dictlist:
            if 'V' in dict:
                if lemmatizer.lemmatize(dict["V"].lower(), 'v') in verbs_target:
                    temp1 = temp2 = ""
                    if "A0" in dict and not dict['A0'] == "":
                        temp1 += dict['A0']
                        # temp += " :|: "
                    if "A1" in dict and not dict['A1'] == "":
                        temp2 += dict['A1']
                    if not temp1 == "":
                        temp1 = getTools(temp1)
                        tools += temp1 + ":-----:"
                    if not temp2 == "":
                        temp2 = getTargets(temp2)
                        target += temp2 + ":-----:"
                if lemmatizer.lemmatize(dict["V"].lower(), 'v') in verbs_tool:
                    temp = ""
                    if "A0" in dict and not dict['A0'] == "":
                        temp += dict['A0']
                        # temp += "|"
                    # if "A1" in dict:
                    #     temp += dict['A1']
                    if not temp == "":
                        temp = getTools(temp)
                        tools += temp + ":-----:"
    # print("SemanticRoleLabel :::: {}".format(b['srl']))
    # print("2nd:\n{}".format([x, y] for x,y in [b['ner']]))
    print("Target :::: " + target)
    print("Tools :::: " + tools)
def complexQuery(term):
    """Turn a natural-language query into Freesound/Jamendo API searches.

    Dependency-parses *term* for prepositional relations, resolves
    entities via DBpedia Spotlight, expands them through DBPedia, then
    queries both audio APIs for each expansion.  Returns the (Freesound,
    Jamendo) JSON of the last expansion queried.
    """
    #test = json.load(urllib2.urlopen("http://www.freesound.org/apiv2/search/text/?query="+term+"&token=06mS7W2OiXidVC2tQ4ikMfe3nomU7rBptaJgBCvp"))
    #test2 = json.load(urllib2.urlopen("https://api.jamendo.com/v3.0/tracks/?client_id=4cb8fab9&format=jsonpretty&name="+term))
    annotator = Annotator()
    dep_parse = annotator.getAnnotations(term, dep_parse=True)['dep_parse']
    dp_list = dep_parse.split('\n')
    #spotlightTerms = WordNet.spotlightSearch(term)
    #print "spotlight terms %s" %spotlightTerms
    #print "dp list %s" %dp_list
    # Entity resolution via the public Spotlight endpoint.
    spotlightTerms = spotlight.annotate(
        'http://spotlight.sztaki.hu:2222/rest/annotate',
        term,
        confidence=0.3,
        support=20,
        spotter='Default')
    #print term, '\t', spotlightTerms[1].get('URI')
    #print spotlightTerms[0].get('URI')
    secondDep = ""
    query = []
    for prep in dp_list:
        elementPrep = "prep"
        if elementPrep in prep:
            # "prep_with(head-1, dep-2)" -> type "with", dependent "dep"
            print("We found preposition1: %s" % prep[prep.find("_") + 1:prep.find("(")])
            prepType = prep[prep.find("_") + 1:prep.find("(")]
            print("We found preposition2: %s" % prep[prep.find(" ") + 1:prep.find(")")])
            secondDep = prep[prep.find(" ") + 1:prep.find(")")].split("-")
            print secondDep[0]
            query.append(prepType)
            query.append(secondDep[0])
            # "like X" queries use the literal word; otherwise use the
            # Spotlight-resolved URI.
            if prepType == "like":
                results = DBPedia.dpbediaQuery(prepType, secondDep[0])
            else:
                results = DBPedia.dpbediaQuery(prepType, spotlightTerms[1].get('URI'))
            print results
    # NOTE(review): the loop variable below shadows the `query` list built
    # above, and `results`/`test`/`test2` are unbound when no preposition
    # was found — confirm this is the intended behaviour.
    for query in results:
        test = json.load(
            urllib2.urlopen(
                "http://www.freesound.org/apiv2/search/text/?query=" + query + "&token=06mS7W2OiXidVC2tQ4ikMfe3nomU7rBptaJgBCvp"))
        test2 = json.load(
            urllib2.urlopen(
                "https://api.jamendo.com/v3.0/tracks/?client_id=4cb8fab9&format=jsonpretty&name=" + query))
        print(test)
        #print(test2)
    return test, test2
def __init__(self):
    # One shared annotator/tokenizer for sequential use, plus one of each
    # per pool worker so parallel tasks never share a SENNA instance.
    self.practNLP_annotator = Annotator()
    self.tokenizer_sent = nltk.tokenize.sent_tokenize
    self.pool = Pool(self.nThreads)
    self.sent_tokenizers = [nltk.tokenize.sent_tokenize for _ in xrange(self.nThreads)]
    self.annotators = [Annotator() for _ in xrange(self.nThreads)]
def compute_Chunks(corpus):
    """Return one space-terminated chunk-tag string per sentence in *corpus*."""
    annotator = Annotator()
    chunk_strings = []
    for sentence in corpus:
        tags = annotator.getAnnotations(sentence)['chunk']
        # each tag gets a trailing space, matching the feature-file format
        chunk_strings.append("".join(entry[1] + " " for entry in tags))
    return chunk_strings
def compute_Chunks(corpus):
    """Build the space-terminated chunk-tag string for every sentence."""
    annotator = Annotator()
    result = []
    for sentence in corpus:
        pieces = []
        for entry in annotator.getAnnotations(sentence)['chunk']:
            pieces.append(entry[1])
            pieces.append(" ")
        result.append("".join(pieces))
    return result
def SRLAnnotation( self, sentence ):
    """
    Run SENNA semantic role labelling on one sentence.

    :param sentence: the sentence to label
    :type sentence: str
    :return: the list of SRL frame dicts produced by SENNA
    :rtype: list({})
    """
    return Annotator().getAnnotations(sentence)["srl"]
def compute_Chunks(corpus):
    """Chunk-tag the whole input at once.

    Here *corpus* is a single string (not a list of sentences); the
    result is a one-element list holding its space-terminated chunk tags.
    """
    tags = Annotator().getAnnotations(corpus)['chunk']
    chunk = "".join(entry[1] + " " for entry in tags)
    return [chunk]
def test3(): annotator=Annotator() sent = "There are people dying make this world a better place for you and for me." sent = "Biplab is a good boy." sent = "He created the robot and broke it after making it." result = annotator.getAnnotations(sent) print result["pos"] print result['ner'] print result['chunk'] print result['verbs'] print result['srl']
def test3(): annotator = Annotator() sent = "There are people dying make this world a better place for you and for me." sent = "Biplab is a good boy." sent = "He created the robot and broke it after making it." result = annotator.getAnnotations(sent) print result["pos"] print result['ner'] print result['chunk'] print result['verbs'] print result['srl']
def compute_NER(corpus): NER = [] fi = open(read_property('NER_features_test_coarse_path'), "w") annotator = Annotator() for sentence in corpus: ners = annotator.getAnnotations(sentence)['ner'] ner = "" for elem in ners: ner = ner + elem[1] + " " print ner fi.write(ner + "\n") NER.append(ner) return NER
def draw_tree(): annotator = Annotator() result = annotator.getAnnotations(sent11) tree_str = result['syntax_tree'] print # print tree_str tree = Tree.fromstring(tree_str)[0] print tree.pprint() print print "Root label=", tree.label() tree.draw()
def draw_tree(): annotator=Annotator() result = annotator.getAnnotations(sent11) tree_str = result['syntax_tree'] print # print tree_str tree = Tree.fromstring(tree_str)[0] print tree.pprint() print print "Root label=",tree.label() tree.draw()
def compute_Chunks(corpus): #Chunk_Tags=[] fi = open(read_property('Chunk_features_test_path'), "w") annotator = Annotator() for sentence in corpus: chunks = annotator.getAnnotations(sentence)['chunk'] chunk = "" for elem in chunks: chunk = chunk + elem[1] + " " print chunk fi.write(chunk + "\n") #Chunk_Tags.append(chunk) #print "The bag of words for Chunks is ",Chunk_Tags fi.close()
def compute_Chunks(corpus):
    """Write one chunk-tag line per training sentence; the file is the
    only output of this function."""
    handle = open(read_property('Chunk_features_train_path'), "w")
    annotator = Annotator()
    for line in corpus:
        tags = annotator.getAnnotations(line)['chunk']
        handle.write("".join(e[1] + " " for e in tags) + "\n")
    handle.close()
def compute_POS_Tags(corpus): #POS=[] fi = open(read_property('POS_features_test_coarse_path'), "w") annotator = Annotator() for sentence in corpus: pos_seq = annotator.getAnnotations(sentence)['pos'] #print pos_seq pos_tags = "" for pos in pos_seq: pos_tags = pos_tags + pos[1] + " " fi.write(pos_tags + "\n") print pos_tags ############### #POS.append(pos_tags) #print "The bag of words of POS is ",POS fi.close()
def rawfile_to_sentencefile_dir():
    """Batch-convert .summary files into sentence files.

    argv: [1] input dir, [2] output dir, [3]/[4] start/end indices of the
    directory-listing slice to process (lets several processes split the
    work).  Files whose '.new' output already exists are skipped.
    """
    indir = sys.argv[1]
    outdir = sys.argv[2]
    counter = 1
    try:
        os.makedirs(outdir)
    except:
        # output directory already exists — not an error
        print('dir existed')
        pass
    time_start = time.time()
    annotator = Annotator()
    a = os.listdir(indir)
    # devide data into several parts,we need to set the start-end
    part_data = os.listdir(indir)[int(sys.argv[3]):int(sys.argv[4])]
    #part_data = os.listdir(indir)[0:8]
    for fname in part_data:
        if os.path.splitext(fname)[1] == '.summary':
            # determine whether the file has been processed
            if not os.path.exists(os.path.join(outdir, fname.split('.')[0]+'.summary'+'.new')):
                print(fname)
                #time_start = time.time()
                # rawfile_to_sentencefile is a sibling helper in this module
                rawfile_to_sentencefile(annotator,os.path.join(indir, fname), os.path.join(outdir, fname+'.new'))
                counter = counter+1
                #time_end = time.time()
                #print('totally cost', time_end - time_start, 'Number', counter)
            else:
                print('skip', fname )
    time_end = time.time()
    print('totally cost: ', time_end - time_start, 'file number: ', counter-1)
def __init__(self, sentence, question, answer, nlp, srl=None):
    """Build answer features for a (sentence, question, answer) triple.

    :param nlp: a spaCy pipeline used for POS/NER/dependency features
    :param srl: optional pre-computed SRL frames; when None, the
        sentence is ASCII-normalised and SRL is computed with SENNA here
    """
    if srl == None:
        # No pre-computed SRL: strip the inputs down to ASCII first
        # (NFKD-decompose, drop non-ASCII) before handing them to SENNA.
        self.ascii_sentence = unicodedata.normalize('NFKD', sentence).encode(
            'ascii', 'ignore')
        self.ascii_question = unicodedata.normalize('NFKD', question).encode(
            'ascii', 'ignore')
        self.ascii_answer = unicodedata.normalize('NFKD', answer).encode(
            'ascii', 'ignore')
        self.annotator = Annotator()
        self.srl = self.annotator.getAnnotations(
            self.ascii_sentence)['srl']
        self.answer_srl_label = self.set_answer_srl_label()
    else:
        self.srl = srl
    self.nlp = nlp
    self.raw_sentence = sentence
    self.raw_question = question
    self.raw_answer = answer
    # spaCy docs for the raw (un-normalised) sentence and question
    self.spacy_sent = self.nlp(self.raw_sentence)
    self.spacy_ques = self.nlp(self.raw_question)
    # derived answer features; each set_* helper is defined on this class
    self.answer_length = self.set_answer_length()
    self.spacy_answer = self.set_spacy_answer()
    self.answer_pos = self.set_answer_pos()
    self.answer_ner = self.set_answer_ner()
    self.answer_ner_iob = self.set_answer_ner_iob()
    self.answer_depth = self.set_answer_depth()
    self.answer_word_count = self.set_answer_word_count()
    # closed vocabularies used to one-hot/index the features above
    self.all_pos_tags = [
        'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS',
        'MD', 'NN', 'NNS', 'NNP', 'NNPS', 'PDT', 'POS', 'PRP', 'PRP$',
        'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG',
        'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB', 'PUNCT'
    ]
    self.all_ner_tags = [
        'PERSON', 'NORP', 'FACILITY', 'ORG', 'GPE', 'LOC', 'PRODUCT',
        'EVENT', 'WORK_OF_ART', 'LANGUAGE', 'DATE', 'TIME', 'PERCENT',
        'MONEY', 'QUANTITY', 'ORDINAL', 'CARDINAL'
    ]
    self.all_srl_labels = [
        'V', 'A0', 'A1', 'A2', 'C-arg', 'R-arg', 'AM-ADV', 'AM-DIR',
        'AM-DIS', 'AM-EXT', 'AM-LOC', 'AM-MNR', 'AM-MOD', 'AM-NEG',
        'AM-PNC', 'AM-PRD', 'AM-PRP', 'AM-REC', 'AM-TMP'
    ]
def detector(mail,outputfileName):
    """Detect action-item sentences in *mail* and record their owners.

    Candidate sentences contain a base-form verb (VB).  For each, the
    owner is the A0 of the SRL frame whose verb matches the dependency
    root, else any frame whose agent is you/we/us.

    NOTE(review): output is written to a global `outputfile` handle —
    the `outputfileName` parameter is never used; confirm the caller
    opens that global.  `srLabels` below is also unused.
    """
    annotator=Annotator()
    sentences = nltk.sent_tokenize(mail)
    probableActionItemSentences = []
    # keep only sentences containing at least one base-form verb
    for sentence in sentences:
        text = nltk.word_tokenize(sentence)
        posTags = nltk.pos_tag(text)
        for tags in posTags:
            if tags[1]=="VB":
                probableActionItemSentences.append(sentence)
                break
    for sentence in probableActionItemSentences:
        srLabels = annotator.getAnnotations(sentence)['srl']
        #print(srLabels)
        depParsedContent = annotator.getAnnotations(sentence,dep_parse=True)
        #print(depParsedContent)
        # pull the root verb word out of the "root(ROOT-0, word-N)" line
        root = depParsedContent['dep_parse']
        root = root[root.find('root('):]
        root = root[:root.find('\n')]
        root = root[root.find(',')+2:root.rfind('-')]
        parsedList = depParsedContent['srl']
        owner = None
        ownerFound = False
        # first choice: A0 of the frame whose verb is the root
        # (default to 'You' for imperative sentences with no agent)
        for parsedMap in parsedList:
            if 'V' in parsedMap and parsedMap['V'] == root:
                if 'A0' in parsedMap:
                    owner = parsedMap['A0']
                    ownerFound = True
                else:
                    owner = 'You'
                    ownerFound = True
                break
        if not ownerFound:
            # fallback: any frame whose agent is you/we/us
            for parsedMap in parsedList:
                if 'A0' in parsedMap:
                    ownerFound = True
                    if parsedMap['A0'].lower() == 'you' or parsedMap['A0'].lower() == 'we' or parsedMap['A0'].lower() == 'us':
                        owner = parsedMap['A0']
                        ownerFound = True
                        break
        # NOTE(review): when ownerFound is False, owner is None and the
        # write below would raise — confirm intended.
        if ownerFound and owner==None:
            print("")
        else:
            outputfile.write("OWNER : "+owner+" SENTENCE : "+sentence+"\n")
def get_filtered_events(self, doc_id, raw_text, entity):
    """Collect entity-relevant events from every sentence of *raw_text*."""
    annotator = Annotator()
    filtered_events = []
    # Sentence by sentence: run SRL, then keep only the events that the
    # filtering helper judges relevant to *entity*.
    for sentence in nltk.sent_tokenize(raw_text):
        srl_frames = list(annotator.getAnnotations(sentence)['srl'])
        events = self.get_filtered_verbs_with_vecs(doc_id, srl_frames, entity)
        if len(events) > 0:
            filtered_events.extend(events)
    return filtered_events
def readfile(self, num_of_lines):
    """Annotate the heading of the first *num_of_lines* CSV rows.

    Each row's first cell is tab-separated: heading, true passage, then
    any number of false passages.  The SENNA annotation of each parsed
    heading is pretty-printed.
    """
    annotator = Annotator()
    processed = 0
    with open(self.path + self.file_name) as handle:
        for row in csv.reader(handle):
            if processed >= num_of_lines:
                break
            fields = row[0].split("\t")
            heading = fields[0]
            true_passage = fields[1]
            false_passage = fields[2:]
            document = Document(heading, true_passage, false_passage)
            anno = annotator.getAnnotations(
                document.get_parsed_heading()[0])
            pprint(anno)
            processed += 1
def test_tree2(): annotator = Annotator() sent = "There are people dying make this world a better place for you and for me." sent = "Biplab is a good boy." sent = "He created the robot and broke it after making it." sent = "Bachelor 's degree in computer science , design or related field." result = annotator.getAnnotations(sent) tree_str = result['syntax_tree'] print result['syntax_tree'] print "--------------------" tree2 = Tree(tree_str) print len(tree2) print "--------------------" for item in tree2[0]: print type(item) print item
def extract_wpcn_feature_using_senna(line):
    '''
    Annotate an English sentence with SENNA (no dependency parse).

    :param line: an english sentence
    :return: dict mapping 1-based word index -> {wid, word, chk, ner, pos}
    '''
    feats = Annotator().getAnnotations(line, dep_parse=False)
    return {
        idx + 1: {
            'wid': idx + 1,
            'word': feats['words'][idx],
            'chk': feats['chunk'][idx][1],
            'ner': feats['ner'][idx][1],
            'pos': feats['pos'][idx][1],
        }
        for idx in range(len(feats['words']))
    }
def check_neg(text, keyword):
    """Return 1 if *keyword* is directly dependency-linked to a negation
    word (module-level `neg_words`), else 0.

    Fix: `pattern.search` returns None for blank/malformed parser lines
    and the original crashed on `m.group(...)`; such lines are skipped.
    """
    annotator = Annotator()
    dep_parse = annotator.getAnnotations(text, dep_parse=True)['dep_parse']
    dp_list = dep_parse.split('\n')
    pattern = re.compile(r'.+?\((.+?), (.+?)\)')
    for dep in dp_list:
        m = pattern.search(dep)
        if m is None:
            continue
        # strip the "-N" position suffix from each node label
        word1 = m.group(1).split('-')[0]
        word2 = m.group(2).split('-')[0]
        # print word1, word2
        if (word1 == keyword and word2 in neg_words) or (word1 in neg_words and word2 == keyword):
            return 1
    return 0
def test_tree2(): annotator=Annotator() sent = "There are people dying make this world a better place for you and for me." sent = "Biplab is a good boy." sent = "He created the robot and broke it after making it." sent = "Bachelor 's degree in computer science , design or related field." result = annotator.getAnnotations(sent) tree_str = result['syntax_tree'] print result['syntax_tree'] print "--------------------" tree2 = Tree(tree_str) print len(tree2) print "--------------------" for item in tree2[0]: print type(item) print item
def process(self, text): annotator = Annotator() tokens = annotator.getAnnotations(text) chunks = tokens['chunk'] ner = tokens['ner'] pos = tokens['pos'] srl = tokens['srl'] words = tokens['words'] for part in srl: self.assign_objects(part) returnCode = "" print self.objects for object in self.objects: print self.get_item(object) if self.get_item(object) is not None: returnCode += str(self.get_item(object)) return returnCode
def preprocess(infile, outfile, posfile, index):
    """Normalise, lemmatise and POS-tag a tab-separated corpus file.

    Each input line keeps its first *index* tab-separated fields; the
    field at *index* is lower-cased, stripped of punctuation, has its
    contractions expanded, then is lemmatised (verbs as 'v', everything
    else as 'n').  Lemmas go to *outfile*, POS tags to *posfile*.

    Fix: the original closed `o` twice and never closed `p`; the POS
    file handle is now closed properly.
    """
    annotator = Annotator()
    wnl = WordNetLemmatizer()
    o = open(outfile, 'w')
    p = open(posfile, 'w')
    f = open(infile)
    text = f.readlines()
    for s in text:
        s = s.strip().split("\t", index)
        # make it lower
        sent = s[index].lower()
        # remove special characters
        sent = sent.strip(string.punctuation)
        # extend contractions
        sent = re.sub(r"n't", " not", sent)
        sent = re.sub(r"'ve", " have", sent)
        sent = re.sub(r"'d", " would", sent)
        sent = re.sub(r"'ll", " will", sent)
        sent = re.sub(r"'m", " am", sent)
        sent = re.sub(r"'s", " is", sent)
        sent = re.sub(r"'re", " are", sent)
        # lematize and get POS tags
        pos = annotator.getAnnotations(sent)["pos"]
        lemmas = [wnl.lemmatize(w, 'v') if t.startswith('V')
                  else wnl.lemmatize(w, 'n') for (w, t) in pos]
        sent = " ".join(lemmas)
        pos = " ".join([x[1] for x in pos])
        # carry the leading key fields through unchanged
        out_string = ""
        pos_string = ""
        for j in range(0, index):
            out_string += s[j] + "\t"
            pos_string += s[j] + "\t"
        out_string += sent + "\n"
        pos_string += pos + "\n"
        o.write(out_string)
        p.write(pos_string)
    f.close()
    o.close()
    p.close()
class srlGraph(Base):
    """SRL-based concept extraction over a document corpus.

    Relies on Base to provide self.corpus — TODO confirm its shape
    (appears to be an iterable of document strings).
    """
    def __init__(self, docpath):
        super(srlGraph, self).__init__(docpath)
        # English stop-word list and a shared SENNA annotator
        self.stopwords = stopwords.words("english")
        self.annotator = Annotator()

    def srl_corpus_extraction(self, stopwords=None):
        """Return the SRL frames of every document in the corpus.

        NOTE(review): `sem_rl` is computed but unused and the
        `stopwords` parameter is ignored — confirm before relying on it.
        """
        sem_rl = self.annotator.getAnnotations(self.corpus, dep_parse=True)['srl']
        srl_corpus = [
            self.annotator.getAnnotations(doc)['srl'] for doc in self.corpus
        ]
        return srl_corpus

    def srl_document_extraction(self, document_id=0, stopwords=None):
        """SRL frames for one document (the `stopwords` parameter is unused)."""
        return self.annotator.getAnnotations(self.corpus[document_id],
                                             dep_parse=True)['srl']

    def get_doc_canon(self, document_id):
        """Singularised words of the first frame's C-A1 argument.

        NOTE(review): the bare `stopwords` in the comprehension refers
        to the module-level object, not self.stopwords; `sr_verb_concept`
        and `predicates` are computed but never used — confirm intended.
        """
        sem_rl = self.annotator.getAnnotations(self.corpus[document_id],
                                               dep_parse=True)['srl']
        canon = [
            en.singularize(word)
            for word in str(TextBlob(sem_rl[0]['C-A1'])).split()
            if word not in stopwords
        ]
        blob = TextBlob(sem_rl[0]['A1'])
        nounPhrases = blob.noun_phrases.singularize()
        sr_verb_concept = self.annotator.getAnnotations(sem_rl[0]['A1'])['srl']
        # flatten the noun phrases into a set of single words
        concat_noun_concepts = set(
            sum([word.split() for word in nounPhrases], []))
        predicates = list(set(canon) - concat_noun_concepts)
        return canon
class Worker_NLP:
    '''
    Worker class for NLP: wraps PracNLPTools' SENNA annotator and a
    multiprocessing pool so several raw texts can be SRL-tagged in
    parallel, each worker with its own annotator instance.
    '''
    nThreads = 4             # size of the worker pool
    pool = None              # multiprocessing Pool, created in __init__
    ith = 0                  # round-robin index into per-worker resources
    tokenizer_sent = None    # shared sentence tokenizer
    practNLP_annotator = None  # shared annotator for sequential use
    sent_tokenizers = None   # one tokenizer per worker
    annotators = None        # one annotator per worker

    def __init__(self):
        # Shared resources plus one tokenizer/annotator per worker so
        # parallel tasks never share a SENNA instance.
        self.practNLP_annotator = Annotator()
        self.tokenizer_sent = nltk.tokenize.sent_tokenize
        self.pool = Pool(self.nThreads)
        self.sent_tokenizers = []
        self.annotators = []
        for i in xrange(self.nThreads):
            self.sent_tokenizers.append(nltk.tokenize.sent_tokenize)
            self.annotators.append(Annotator())

    def getSRL(self, rawTxt):
        """Sequential SRL: one frame list per sentence of rawTxt."""
        sents = sent_tokenize(rawTxt)
        srls = []
        for sent in sents:
            srl = self.practNLP_annotator.getAnnotations(sent)['srl']
            srls.append(srl)
        return srls

    def getSRL_parallel(self, rawTxts):
        """Parallel SRL over several texts via the process pool.

        Resources are handed out round-robin (self.ith); the actual work
        is done by the module-level `compute_task_atom`.
        """
        tasks = []
        #[rawTxt, tokenizer, annotator]
        for i, rawTxt in enumerate(rawTxts):
            tokenizer_tmp = self.sent_tokenizers[self.ith]
            annotator_tmp = self.annotators[self.ith]
            self.ith = (self.ith + 1) % self.nThreads
            tasks.append((rawTxt, tokenizer_tmp, annotator_tmp))
        results = []
        for task in tasks:
            result = self.pool.apply_async(compute_task_atom, args=task)
            results.append(result)
        # block until every async task has finished
        srls_results = [item.get() for item in results]
        return srls_results
#!usr/bin/python ''' Python Script for SENNA functionality Uses PracNLPTools for Sementic Role Labeling NOT USED ''' import csv import re from practnlptools.tools import Annotator annotator=Annotator() print("Running Shallow Semantic Parser") patternForSymbol = re.compile(r'(\ufeff)', re.U) comments=[] #reads in CSV file with open('Dataset/dataset.csv','rb') as dataFile: reader = csv.reader(dataFile, delimiter=',') for row in reader: #row[0] = row[0].decode('utf-8') rowEdited = re.sub(patternForSymbol, '', row[0]) comment = rowEdited if rowEdited != "" else row[0] sentiment = row[1] comments.append((comment, sentiment)) for index,comment in enumerate(comments): if(index<100): print comment[0] print(annotator.getAnnotations(comment[0])['srl']) print("==========================")
__author__ = 'hz'
from practnlptools.tools import Annotator
# Sample patent abstract — note this value is overwritten below before use.
text = "Disclosed is an organic light-emitting diode (OLED) display panel. An OLED display panel includes a plurality of signal lines and a thin film transistor formed on a substrate, an interlayer insulating layer, a first electrode, a bank, an organic light-emitting layer, a second electrode, a first passivation layer, an organic layer, a second passivation layer and a barrier film, wherein the bank is formed to completely cover the interlayer insulating layer, and an inclination formed by side surfaces of the bank and the interlayer insulating layer is made to be gradual."
# text = "Disclosed is an organic light-emitting diode (OLED) display panel."
# semantic role labelling
text = 'Unlike the classic PPP technique, in our new approach, the ionospheric-free linear combination is not used.'
annotator = Annotator()
result = annotator.getAnnotations( text )["srl"]
print( type(result) )
print( result )
def _fit(self, sent_list_of_str, dep_parse):
    '''Batch-annotate a list of sentence strings; returns a list of dicts.

    dep_parse toggles the optional (slow) dependency-parsing pass.'''
    return Annotator().getBatchAnnotations(sent_list_of_str, dep_parse)
def test2(): sent = "There are people dying make this world a better place for you and for me." annotator=Annotator() result = annotator.getAnnotations(sent,dep_parse=True) print result
#Pre-defining a set of desired verbs path = "C:/Users/hp/Desktop/bigdata/txt/" data = [] verb_file = open("./set3.txt", "rw+") desired_verbs = verb_file.readlines() #Performing Stemming stemmed_desired_verbs=[] stemmer=stem.snowball.EnglishStemmer() for word in desired_verbs: stemmed_desired_verbs.append(stemmer.stem(word)) annotator=Annotator() #Implementation of Semantic Role Labelling f = open('out.csv', 'wt') csv.register_dialect('lineterminator',lineterminator='\n') writer = csv.writer(f, dialect = csv.get_dialect('lineterminator')) writer.writerow( ('A0', 'A1', 'V', 'fileName')) for filename in os.listdir(path): print 'reading', filename text_file = open(path + filename,"r") file_content = text_file.readlines() data.append((filename,file_content)) text_file.close() #Annotating the striped text
def generate_shortestpath (sent,left_term,left_start,right_term,right_start):
    """Shortest dependency-path length between two terms of *sent*.

    The terms are re-located with update_loc() (SENNA re-tokenises the
    sentence), then the dependency parse is turned into an undirected
    graph of "word-position" nodes and the shortest path length between
    the two terms is returned.  Returns the string "left" / "right" when
    the corresponding node is missing from the graph.
    """
    annotator = Annotator()
    #print "========before:",left_term,left_start,right_term,right_start
    '''
    left_sent=" ".join(sent.split()[:left_start])
    right_sent=" ".join(sent.split()[:right_start])
    if re.search("[A-Za-z]-[A-Za-z]",left_term):
        info=left_term.split("-")
        left_term=info[-1]
        #left_start=left_start-1
    if re.search("[A-Za-z]-[A-Za-z]",right_term):
        info=right_term.split("-")
        right_term=info[-1]
       # right_start=right_start-1
    '''
    # literal parentheses break the parser — encode them as LRB/RRB
    right_term=re.sub('\(','LRB',right_term)
    right_term=re.sub('\)','RRB',right_term)
    left_term=re.sub('\(','LRB',left_term)
    left_term=re.sub('\)','RRB',left_term)
    # re-locate both terms in SENNA's tokenisation of the sentence
    new_left_start,new_left_term,new_right_start,new_right_term=update_loc(sent,left_start,right_start)
    '''
    # adjust start coordination
    # denpendency parser wil split " - " and "'s"
    # print "left: ", left_sent
    # print "right: ",right_sent
    poss=re.compile('\w+\'s')
    conj=re.compile('[A-Za-z]+\-[A-Za-z]+')
    comma=re.compile('\w,')
    result_poss_left=poss.findall(left_sent)
    result_conj_left=conj.findall(left_sent)
    result_comma_left=comma.findall(left_sent)
    left_start=left_start+len(result_conj_left)*2+len(result_poss_left)+len(result_comma_left)
    #print "left:",len(result_conj_left),len(result_poss_left)
    result_poss_right=poss.findall(right_sent)
    result_conj_right=conj.findall(right_sent)
    result_comma_right=comma.findall(right_sent)
    right_start=right_start+len(result_conj_right)*2+len(result_poss_right)+len(result_comma_right)
    # print "right:", len(result_conj_right),len(result_poss_right),len(result_comma_right)
    print "after:",left_term,left_start,right_term,right_start
    '''
    # graph node labels follow the parser's "word-position" convention
    left=new_left_term+"-"+str(new_left_start)
    right=new_right_term+"-"+str(new_right_start)
    #print "=====",left,right
    #sent=re.sub('\-',' - ',sent)
    #print sent
    sent=re.sub('\(','LRB',sent)
    sent=re.sub('\)','RRB',sent)
    #print sent
    dep_parse=annotator.getAnnotations(sent, dep_parse=True)['dep_parse']
    # NOTE(review): `tree` is fetched but never used — confirm.
    tree=annotator.getAnnotations(sent, dep_parse=True)['syntax_tree']
    #print dep_parse
    dp_list = dep_parse.split('\n')
    pattern = re.compile(r'.+?\((.+?), (.+?)\)')
    edges = []
    for dep in dp_list:
        #print dep
        m = pattern.search(dep)
        if m:
            edges.append((m.group(1), m.group(2)))
    graph = nx.Graph(edges)
    #print right
    if right not in graph.nodes():
        print "right",left_term, right_term
        return "right"
    if left not in graph.nodes():
        print "left", left_term, right_term
        return "left"
    shorttest_path=nx.shortest_path_length(graph, source=left, target=right)
    # print
    return shorttest_path
def build_d(cat_d):
    """Populate *cat_d* with word -> category mappings.

    Each file in ./lists is one category (the file name without its
    extension); every line of the file is a word in that category.
    """
    dir_path='lists'
    all_files=os.listdir(dir_path)
    for filename in all_files:
        filepath=dir_path+'/'+filename
        if os.path.isfile(filepath):
            # category name = file name without extension
            cat=filename.strip().split('.')[0]
            with open(filepath,'r') as fp:
                for line in fp:
                    word=line.strip()
                    cat_d[word]=cat

# Module-level setup: category dictionary, annotator, lemmatizer, AlchemyAPI
# client, and the SRL-label co-occurrence template.
cat_d={}
build_d(cat_d)
annotator=Annotator()
wordnet_lemmatizer = WordNetLemmatizer()
alchemyapi = AlchemyAPI()
template={}
full_template=['AM-MOD','A0','AM-ADV', 'AM-NEG','V','C-V','AM-DIR','A1','A2','A3','A4','AM-PNC','AM-MNR','AM-LOC','AM-TMP','C-A1']
# For every SRL label, record the other labels it may pair with —
# excluding itself and the location/time/manner modifiers.
for item in full_template:
    copy_template=full_template[:]
    copy_template.remove(item)
    if 'AM-LOC' in copy_template:
        copy_template.remove('AM-LOC')
    if 'AM-TMP' in copy_template:
        copy_template.remove('AM-TMP')
    if 'AM-MNR' in copy_template:
        copy_template.remove('AM-MNR')
    template[item]=copy_template
#!/usr/bin/env python from practnlptools.tools import Annotator import sys annotator=Annotator() if __name__=='__main__': for line in sys.stdin: if line[0]=="#": continue line=line.strip() annotations=annotator.getAnnotations(line) print annotations['srl']
def update_loc(sent,left_start,right_start):
    """Re-locate two target tokens after SENNA re-tokenises *sent*.

    Appends unique markers ('aaaaa' / 'bbbbb') to the tokens at
    left_start / right_start, re-runs SENNA chunking, and finds where
    the marked tokens ended up in the new tokenisation.

    :return: (left_start, left_term, right_start, right_term), where the
        positions are 1-based indices into SENNA's token stream.
    """
    annotator = Annotator()
    # literal parentheses break the parser — encode as LRB/RRB
    sent=re.sub('\(','LRB',sent)
    sent=re.sub('\)','RRB',sent)
    words=sent.split()
    #print words[left_start],words[right_start]
    # tag the two target tokens with markers that survive re-tokenisation
    words[left_start]=words[left_start]+'aaaaa'
    words[right_start]=words[right_start]+'bbbbb'
    # print words
    sent=' '.join(words)
    tags=annotator.getAnnotations(sent)
    # print "===", tags
    # print "chunks: ", tags['chunk']
    i=0                # 1-based position of the current SENNA token
    pre_word=''        # previous token (for split-token recovery)
    pre_pre_word=''    # token before that (for hyphen splits)
    j=0                # which term (1=left, 2=right) fell on a split token
    left_term = ''
    right_term = ''
    for word in tags['chunk']:
        i+=1
        left_pattern='^(.*)aaaaa$'
        right_pattern='^(.*)bbbbb$'
        left=re.search(left_pattern,word[0])
        right=re.search(right_pattern,word[0])
        if left:
            #print "ttleft"
            left_term=left.group(1)
            left_start=i
            # marker ended up on its own token: the real term is the
            # previous token (two back when a '-' was split out)
            if left_term=='':
                left_term=pre_word
                left_start=left_start-1
                j=1
                if pre_word=='-':
                    left_term=pre_pre_word
                    left_start=left_start-1
        if right:
            #print "rightright"
            right_term=right.group(1)
            right_start=i
            if right_term=='':
                right_term=pre_word
                right_start=right_start-1
                j=2
                if pre_word=='-':
                    right_term=pre_pre_word
                    right_start=right_start-1
        pre_pre_word=pre_word
        pre_word=word[0]
    # when one term's marker was split onto an extra token, the other
    # term's position (if later in the sentence) shifts back by one
    if j==1:
        if right_start>left_start:
            right_start=right_start-1
    if j==2:
        if left_start>right_start:
            left_start=left_start-1
    #print j
    #print "=++++", left_start,left_term,right_start,right_term
    return (left_start,left_term,right_start,right_term)