Example #1
def get_shortest_path(a, b):
	text = a + " " + b  # join the two phrases with a space so the parser sees separate tokens

	annotator = Annotator()
	dep_parse = annotator.getAnnotations(text, dep_parse=True)['dep_parse']

	dp_list = dep_parse.split('\n')
	pattern = re.compile(r'.+?\((.+?), (.+?)\)')
	edges = []
	
	for dep in dp_list:
		m = pattern.search(dep)
		if m:  # skip lines that do not look like rel(head, dependent)
			edges.append((m.group(1), m.group(2)))
	
	graph = nx.Graph(edges)  
	
	shortest_paths = [] 
	
	a = a.strip()
	b = b.strip()
	
	a = a.split()
	b = b.split()
	
	for i in a: 
		for j in b: 
			shortest_paths.append(nx.shortest_path_length(graph, source=i, target=j))
	
	print(shortest_paths)
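A minimal, self-contained sketch of the edge-extraction idea above, run on a hard-coded dependency string (hypothetical parser output) so it does not need the SENNA backend:

import re
import networkx as nx

dep_parse = "nsubj(boy-5, Biplab-1)\ncop(boy-5, is-2)\ndet(boy-5, a-3)\namod(boy-5, good-4)"
pattern = re.compile(r'.+?\((.+?), (.+?)\)')
edges = [(m.group(1), m.group(2))
         for m in (pattern.search(d) for d in dep_parse.split('\n')) if m]
graph = nx.Graph(edges)
print(nx.shortest_path_length(graph, source='Biplab-1', target='good-4'))  # 2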
Example #2
def test_tree4():

    annotator = Annotator()
    sent = "There are people dying make this world a better place for you and for me."
    sent = "Biplab is a good boy."
    sent = "He created the robot and broke it after making it."
    sent = "Bachelor 's degree in computer science , design or related field."
    sent = "B.S. in Computer Science , a related degree or its equivalent"
    sent = "BS , MS , or PhD in Computer Science or a similar field preferred"
    sent = "Computer Science or related technical degree from an accredited four year university "
    sent = "Degree in Computer Science or Engineering with a high GPA ."
    sent = "A Master's degree in Computer Science or Engineering is mandatory ."

    sent = "A Computer Science or related degree "
    sent = "I love science and SciFi book"
    sent = "I love music and SciFi book"

    result = annotator.getAnnotations(sent)
    tree_str = result['syntax_tree']
    print
    print tree_str

    tree = Tree.fromstring(tree_str)[0]
    print
    print "Root label=", tree.label()
    tree.draw()
def compute_POS(line):
    annotator = Annotator()
    pos = annotator.getAnnotations(line)['pos']
    pos_tag = []
    for p in pos:
        pos_tag.append(p[1])
    return pos_tag
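A hedged call sketch for compute_POS (hypothetical sentence; the exact tag sequence depends on the SENNA model behind practnlptools):

tags = compute_POS("Biplab is a good boy .")
print(tags)  # e.g. ['NNP', 'VBZ', 'DT', 'JJ', 'NN', '.']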
Example #4
def test_tree4():   
    
    annotator=Annotator()
    sent = "There are people dying make this world a better place for you and for me."
    sent = "Biplab is a good boy." 
    sent = "He created the robot and broke it after making it."
    sent = "Bachelor 's degree in computer science , design or related field."    
    sent = "B.S. in Computer Science , a related degree or its equivalent"    
    sent = "BS , MS , or PhD in Computer Science or a similar field preferred"
    sent = "Computer Science or related technical degree from an accredited four year university "
    sent = "Degree in Computer Science or Engineering with a high GPA ."    
    sent = "A Master's degree in Computer Science or Engineering is mandatory ."
    
    sent = "A Computer Science or related degree "
    sent = "I love science and SciFi book"
    sent = "I love music and SciFi book"
   
    result = annotator.getAnnotations(sent)
    tree_str = result['syntax_tree']
    print     
    print tree_str
    
    tree = Tree.fromstring(tree_str)[0]
    print
    print "Root label=",tree.label()
    tree.draw()
def extract_feature_using_senna(line):
    '''
    Annotate a sentence with SENNA and return per-word features.
    :param line: an English sentence
    :return:
        words_data : {word_id: {wid, pid, word, pos, chunk_info, ner_info}}
        plus sentence-level 'syntax_tree', 'tree', 'verbs' and 'srl' entries,
        or -2 if the dependency parse is empty
    '''
    annotator = Annotator()
    feature = annotator.getAnnotations(line, dep_parse=True)

    if feature['dep_parse'] == '':
        return -2
    a = feature['dep_parse'].split('\n')
    words_data = {}
    d = []
    for i in a:
        dep = re.sub(r'^[^\(]+\(|\)$', '', i)
        try:
            p, c = dep.split(', ')
        except ValueError:
            pass
        try:
            t1 = p.split('-')
            pid = int(t1[len(t1) - 1])
            t2 = c.split('-')
            wid = int(t2[len(t2) - 1])
        except ValueError:
            if re.match('[\d]+\'', t1[len(t1) - 1]):
                pid = int(re.sub(r'\'', '', t1[len(t1) - 1])) + 0.1
                t2 = c.split('-')
                wid = int(t2[len(t2) - 1])
            elif re.match('[\d]+\'', t2[len(t2) - 1]):
                pass
            continue
        d.append((wid, pid))
    t1 = [id for id in d]
    d, _ = remove_dep(t1)
    for wid, pid in d:
        add_ids(words_data, wid, pid)
    for i in range(len(feature['words'])):
        if i + 1 not in words_data:
            words_data[i + 1] = {
                'wid': i + 1,
                'pid': -1,
                'word': feature['words'][i],
                'chk': feature['chunk'][i][1],
                'ner': feature['ner'][i][1],
                'pos': feature['pos'][i][1]
            }
        elif i + 1 in words_data:
            words_data[i + 1]['word'] = feature['words'][i]
            words_data[i + 1]['chk'] = feature['chunk'][i][1]
            words_data[i + 1]['ner'] = feature['ner'][i][1]
            words_data[i + 1]['pos'] = feature['pos'][i][1]
    words_data['syntax_tree'] = feature['syntax_tree']
    words_data['tree'] = feature['tree']
    words_data['verbs'] = feature['verbs']
    words_data['srl'] = feature['srl']
    # Global.accepted += 1
    return words_data
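A hedged usage sketch for the function above (hypothetical sentence; assumes SENNA plus the external remove_dep/add_ids helpers are available):

data = extract_feature_using_senna("He created the robot .")
if data == -2:
    print("dependency parse failed")
else:
    for wid in sorted(k for k in data if isinstance(k, int)):
        row = data[wid]
        print("%s %s %s %s %s" % (row['word'], row['pos'], row['chk'], row['ner'], row.get('pid', -1)))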
Example #6
    def getword2vec(self, raw_text):
        sentences = nltk.sent_tokenize(raw_text)
        annotator = Annotator()
        counter = 0
        doc_vec = []
        for sentence in sentences:
            # get semantic role labelling data for each sentence
            srl = list(annotator.getAnnotations(sentence)['srl'])
            word2vec = []
            # get the event structure for each sentence
            for s in srl:
                if 'V' in s:
                    # print s['V']
                    word2vec = self.getvector(s['V'])
                    # print word2vec
                else:
                    print 'No verb found in sentence'
                    return
                if 'A0' in s:
                    # print s['A0']
                    word2vec = self.addVectors(word2vec,
                                               self.getvector(s['A0']))

                if 'A1' in s:
                    # print s['A1']
                    word2vec = self.addVectors(word2vec,
                                               self.getvector(s['A1']))
            if counter == 0:
                doc_vec = word2vec
            else:
                doc_vec = self.addVectors(doc_vec, word2vec)
            counter = counter + 1
Example #7
 def annotation(self, n):
     parsed_heading = self.get_parsed_heading()
     annotator = Annotator()
     try:
         annotation = annotator.getAnnotations(parsed_heading[n])
         return annotation
     except:
         pass
def get_annotations(question):
    annotator = Annotator()
    annotations = annotator.getAnnotations(question)
    srl = annotations['srl']
    verbs = annotations['verbs']
    ner = annotations['ner']
    chunk = annotations['chunk']
    return srl, verbs, ner, chunk
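A hedged call sketch (hypothetical question) that unpacks the four annotation layers returned above:

srl, verbs, ner, chunk = get_annotations("Who created the robot ?")
print(verbs)   # e.g. ['created']
print(chunk)   # list of (token, chunk-tag) pairs such as ('robot', 'I-NP')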
Example #9
def test_deep():
    annotator = Annotator()
    sent = "There are people dying make this world a better place for you and for me."
    sent = "Biplab is a good boy."
    sent = "He created the robot and broke it after making it."
    result = annotator.getAnnotations(sent, dep_parse=True)

    print result['dep_parse']
Example #10
def test1():
    annotator = Annotator()
    sent = "There are people dying make this world a better place for you and for me."
    result = annotator.getAnnotations(sent)

    #print result
    print type(result)
    print result.keys()
Example #11
 def srl(self, sen_dest):
     ant = Annotator()
     if sen_dest.upper().split()[0] == "UP":
         v = sen_dest.upper().split()
         v[0] = "RAISE"
         sen_dest = " ".join(v)  # rebuild the sentence instead of stringifying the list
     sen_srl = ant.getAnnotations(sen_dest)['syntax_tree']
     return sen_srl
def detector(mail,outputfileName):

	annotator=Annotator()
	outputfile = open(outputfileName, "a")  # file that detected action items are written to
	sentences = nltk.sent_tokenize(mail)
	probableActionItemSentences = []
	for sentence in sentences:
		text = nltk.word_tokenize(sentence)
		posTags = nltk.pos_tag(text)
		for tags in posTags:
			if tags[1]=="VB":
				probableActionItemSentences.append(sentence)
				break
	for sentence in probableActionItemSentences:
		srLabels = annotator.getAnnotations(sentence)['srl']
		#print(srLabels)
		depParsedContent = annotator.getAnnotations(sentence,dep_parse=True)
		#print(depParsedContent)
		root = depParsedContent['dep_parse']
		root = root[root.find('root('):]
		root = root[:root.find('\n')]
		root = root[root.find(',')+2:root.rfind('-')]
		parsedList = depParsedContent['srl']
		owner = None
		ownerFound = False
		for parsedMap in parsedList:
			if 'V' in parsedMap and parsedMap['V'] == root:
				if 'A0' in parsedMap:
					owner = parsedMap['A0']
					ownerFound = True
				else:
					owner = 'You'
					ownerFound = True
				break
		if not ownerFound:
			for parsedMap in parsedList:
				if 'A0' in parsedMap:
					ownerFound = True
					if parsedMap['A0'].lower() == 'you' or parsedMap['A0'].lower() == 'we' or parsedMap['A0'].lower() == 'us':
						owner = parsedMap['A0']
						ownerFound = True
						break
		if owner is None:
			print("")
		else:
			outputfile.write("OWNER : "+owner+" SENTENCE : "+sentence+"\n")
	outputfile.close()
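A hedged call sketch for detector (hypothetical e-mail text and output file name; requires nltk's tokenizers and SENNA):

mail = "Please send the quarterly report by Friday. Thanks for your help."
detector(mail, "action_items.txt")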
Example #13
def test1():
    annotator=Annotator()
    sent = "There are people dying make this world a better place for you and for me."
    result = annotator.getAnnotations(sent)

    #print result
    print type(result)
    print result.keys()
Example #14
def test_deep():
    annotator=Annotator()
    sent = "There are people dying make this world a better place for you and for me."
    sent = "Biplab is a good boy." 
    sent = "He created the robot and broke it after making it."
    result = annotator.getAnnotations(sent,dep_parse=True)

    print result['dep_parse']
Example #15
def complexQuery(term):
    #test = json.load(urllib2.urlopen("http://www.freesound.org/apiv2/search/text/?query="+term+"&token=06mS7W2OiXidVC2tQ4ikMfe3nomU7rBptaJgBCvp"))
    #test2 = json.load(urllib2.urlopen("https://api.jamendo.com/v3.0/tracks/?client_id=4cb8fab9&format=jsonpretty&name="+term))

    annotator = Annotator()
    dep_parse = annotator.getAnnotations(term, dep_parse=True)['dep_parse']
    dp_list = dep_parse.split('\n')

    #spotlightTerms = WordNet.spotlightSearch(term)
    #print "spotlight terms %s" %spotlightTerms
    #print "dp list  %s" %dp_list

    spotlightTerms = spotlight.annotate(
        'http://spotlight.sztaki.hu:2222/rest/annotate',
        term,
        confidence=0.3,
        support=20,
        spotter='Default')
    #print term, '\t', spotlightTerms[1].get('URI')
    #print spotlightTerms[0].get('URI')
    secondDep = ""
    query = []
    results = []  # stays empty when no preposition is found below
    test = test2 = None

    for prep in dp_list:
        elementPrep = "prep"
        if elementPrep in prep:
            print("We found preposition1: %s" %
                  prep[prep.find("_") + 1:prep.find("(")])
            prepType = prep[prep.find("_") + 1:prep.find("(")]
            print("We found preposition2: %s" %
                  prep[prep.find(" ") + 1:prep.find(")")])
            secondDep = prep[prep.find(" ") + 1:prep.find(")")].split("-")
            print secondDep[0]
            query.append(prepType)
            query.append(secondDep[0])
            if prepType == "like":
                results = DBPedia.dpbediaQuery(prepType, secondDep[0])
            else:
                results = DBPedia.dpbediaQuery(prepType,
                                               spotlightTerms[1].get('URI'))
            print results

    for query in results:

        test = json.load(
            urllib2.urlopen(
                "http://www.freesound.org/apiv2/search/text/?query=" + query +
                "&token=06mS7W2OiXidVC2tQ4ikMfe3nomU7rBptaJgBCvp"))
        test2 = json.load(
            urllib2.urlopen(
                "https://api.jamendo.com/v3.0/tracks/?client_id=4cb8fab9&format=jsonpretty&name="
                + query))

    print(test)
    #print(test2)

    return test, test2
Example #16
def semantic_role_label():
    lemmatizer = WordNetLemmatizer()
    verbs_target = ["ensnare", "infect", "use", "target"]
    verbs_tool = ["attack"]

    # sent = "Mirai-Based Masuta Botnet Weaponizes Old Router Vulnerability. By Ionut Arghire on January 24, 2018. inShare. A new piece of malware based on Mirai's publicly released source code has been observed at large, ensnaring devices into a botnet, targeted Internet of Things. Known as Masuta, the botnet has at least two variants at large, and is believed to be the work of a well-known IoT threat actor, NewSky Security says. What?s also unique to the botnet is that it exploits an old router vulnerability, being the first threat known to weaponize it in a botnet campaign. Masuta (Japanese for master) botnet?s source code was found on an invite only dark forum. The malware?s configuration file, the researchers discovered, uses a different seed of the cipher key compared to Mirai, having the strings in the configuration files XORed by 0x45. Thus, the researchers discovered that it uses the domain nexusiotsolutions(dot)net, the command and control (C&C) server that Nexus Zeta, the individual involved in the recent Satori attacks, uses. The domain was registered using the nexuszeta1337@gmail(.)com email address. Thus, NewSky Security suggests that Nexus Zeta has been involved in the creation of the Masuta botnet, in addition to building Satori, the Mirai variant that has been wreaking havoc over the past couple of months. In fact, Masuta isn?t new either, and attacks involving it have been steadily increasing since September, and the botnet?s standard variant has been observed using several known/weak/default credentials to compromise IoT devices. An evolved variant of Masuta, called PureMasuta, contains the most typical of Mirai style code, and a list of weak credentials to use. What makes this malware variant stand out, however, is its usage of EDB 38722 D-Link exploit. The exploit PureMasuta uses resides in the HNAP (Home Network Administration Protocol), which is based on the SOAP protocol. It is possible to craft a SOAP query to bypass authentication by using hxxp://purenetworks.com/HNAP1/GetDeviceSettings, and improper string handling can lead to arbitrary code execution, and an attacker can abuse this combination of issues to run code on targeted devices. What the botnet does is to download a shell script from the C&C server and run it. Thus, the malware author first bypasses authentication and then executes code on the targeted devices. The PureMasuta variant uses the same C&C server (93.174.93.63) as the original Masuta variant, which led the researchers to believe it is the evolved creation of the same threat actor. Nexus Zeta is no stranger when it comes to implementing SOAP related exploits. The threat actor has already been observed in implementing two other known SOAP related exploits, CVE-2014-8361 and CVE-2017-17215 in his Satori botnet project, NewSky Security notes. Thus, the TR-069 bug and EDB 38722 are the third and fourth SOAP related exploits abused by IoT botnets. Protocol exploits are more desirable for threat actors as they usually have a wider scope. A protocol can be implemented by various vendors/models and a bug in the protocol itself can get carried on to a wider range of devices, the researchers conclude."

    # sent = "Mirai, the infamous botnet used in the recent massive distributed denial of service (DDoS) attacks against Brian Krebs' blog and Dyn's DNS infrastructure, has ensnared Internet of Things (IoT) devices in 164 countries, researchers say."

    if len(sys.argv) != 2:
        print("NOPE")
        exit()

    fh = open(sys.argv[1], "r")
    sent = fh.read()
    # sent = sys.argv[1]

    target = ""
    tools = ""

    for s in [i.strip() for i in sent.split(".")]:
        a = Annotator()
        b = a.getAnnotations(s.encode('utf-8'))

        dictlist = b['srl']
        for dict in dictlist:
            if 'V' in dict:
                if lemmatizer.lemmatize(dict["V"].lower(),
                                        'v') in verbs_target:
                    temp1 = temp2 = ""
                    if "A0" in dict and not dict['A0'] == "":
                        temp1 += dict['A0']
                    # temp += " :|: "
                    if "A1" in dict and not dict['A1'] == "":
                        temp2 += dict['A1']
                    if not temp1 == "":
                        temp1 = getTools(temp1)
                        tools += temp1 + ":-----:"
                    if not temp2 == "":
                        temp2 = getTargets(temp2)
                        target += temp2 + ":-----:"
                if lemmatizer.lemmatize(dict["V"].lower(), 'v') in verbs_tool:
                    temp = ""
                    if "A0" in dict and not dict['A0'] == "":
                        temp += dict['A0']
                    # temp += "|"
                    # if "A1" in dict:
                    #     temp += dict['A1']
                    if not temp == "":
                        temp = getTools(temp)
                        tools += temp + ":-----:"

        # print("SemanticRoleLabel :::: {}".format(b['srl']))
        # print("2nd:\n{}".format([x, y] for x,y in [b['ner']]))

    print("Target :::: " + target)
    print("Tools :::: " + tools)
def compute_Chunks(corpus):
    Chunk_Tags = []
    annotator = Annotator()
    for sentence in corpus:
        chunks = annotator.getAnnotations(sentence)['chunk']
        chunk = ""
        for elem in chunks:
            chunk = chunk + elem[1] + " "
        Chunk_Tags.append(chunk)
    return Chunk_Tags
def compute_Chunks(corpus):
    Chunk_Tags = []
    annotator = Annotator()
    for sentence in corpus:
        chunks = annotator.getAnnotations(sentence)['chunk']
        chunk = ""
        for elem in chunks:
            chunk = chunk + elem[1] + " "
        Chunk_Tags.append(chunk)
    return Chunk_Tags
Example #19
    def SRLAnnotation( self, sentence ):
        """
        Use SENNA library to perform SRL(semantic role labelling) on specific sentence.

        :param sentence: the specific sentence to be handled
        :type sentence: str
        :return:
        :rtype: list({})
        """
        annotator = Annotator()
        return annotator.getAnnotations( sentence )["srl"]
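A hedged usage sketch; `parser` is only a placeholder for whatever object exposes this method, and SENNA must be installed:

roles = parser.SRLAnnotation("He created the robot and broke it after making it.")
for frame in roles:
    print("%s | %s | %s" % (frame.get('V'), frame.get('A0'), frame.get('A1')))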
Example #20
def compute_Chunks(corpus):
    Chunk_Tags = []
    annotator = Annotator()
    #for sentence in corpus:
    chunks = annotator.getAnnotations(corpus)['chunk']
    chunk = ""
    for elem in chunks:
        chunk = chunk + elem[1] + " "
    # print chunk  # uncomment to see what these chunks look like
    Chunk_Tags.append(chunk)
    return Chunk_Tags
Example #21
def test3():
    annotator = Annotator()
    sent = "There are people dying make this world a better place for you and for me."
    sent = "Biplab is a good boy."
    sent = "He created the robot and broke it after making it."
    result = annotator.getAnnotations(sent)

    print result["pos"]
    print result['ner']
    print result['chunk']
    print result['verbs']
    print result['srl']
Example #22
def test3():
    annotator=Annotator()
    sent = "There are people dying make this world a better place for you and for me."
    sent = "Biplab is a good boy." 
    sent = "He created the robot and broke it after making it."
    result = annotator.getAnnotations(sent)

    print result["pos"]
    print result['ner']
    print result['chunk']
    print result['verbs']
    print result['srl']
Example #23
def draw_tree():

    annotator = Annotator()
    result = annotator.getAnnotations(sent11)
    tree_str = result['syntax_tree']
    print
    # print tree_str

    tree = Tree.fromstring(tree_str)[0]
    print tree.pprint()
    print
    print "Root label=", tree.label()
    tree.draw()
Example #24
def compute_NER(corpus):
    NER = []
    fi = open(read_property('NER_features_test_coarse_path'), "w")
    annotator = Annotator()
    for sentence in corpus:
        ners = annotator.getAnnotations(sentence)['ner']
        ner = ""
        for elem in ners:
            ner = ner + elem[1] + " "
        print ner
        fi.write(ner + "\n")
        NER.append(ner)
    return NER
def compute_Chunks(corpus):
    #Chunk_Tags=[]
    fi = open(read_property('Chunk_features_train_path'), "w")
    annotator = Annotator()
    for sentence in corpus:
        chunks = annotator.getAnnotations(sentence)['chunk']
        chunk = ""
        for elem in chunks:
            chunk = chunk + elem[1] + " "
        #print chunk
        fi.write(chunk + "\n")
        #Chunk_Tags.append(chunk)
    #print "The bag of words for Chunks is ",Chunk_Tags
    fi.close()
Example #26
def compute_Chunks(corpus):
    #Chunk_Tags=[]
    fi = open(read_property('Chunk_features_test_path'), "w")
    annotator = Annotator()
    for sentence in corpus:
        chunks = annotator.getAnnotations(sentence)['chunk']
        chunk = ""
        for elem in chunks:
            chunk = chunk + elem[1] + " "
        print chunk
        fi.write(chunk + "\n")
        #Chunk_Tags.append(chunk)
    #print "The bag of words for Chunks is ",Chunk_Tags
    fi.close()
Example #27
def draw_tree():   
    
    
    annotator=Annotator()
    result = annotator.getAnnotations(sent11)
    tree_str = result['syntax_tree']
    print     
   # print tree_str
    
    tree = Tree.fromstring(tree_str)[0]
    print tree.pprint()
    print
    print "Root label=",tree.label()
    tree.draw()
Example #28
class srlGraph(Base):
    def __init__(self, docpath):
        super(srlGraph, self).__init__(docpath)
        self.stopwords = stopwords.words("english")
        self.annotator = Annotator()

    def srl_corpus_extraction(self, stopwords=None):

        sem_rl = self.annotator.getAnnotations(self.corpus,
                                               dep_parse=True)['srl']
        srl_corpus = [
            self.annotator.getAnnotations(doc)['srl'] for doc in self.corpus
        ]
        return srl_corpus

    def srl_document_extraction(self, document_id=0, stopwords=None):
        return self.annotator.getAnnotations(self.corpus[document_id],
                                             dep_parse=True)['srl']

    def get_doc_canon(self, document_id):
        sem_rl = self.annotator.getAnnotations(self.corpus[document_id],
                                               dep_parse=True)['srl']
        canon = [
            en.singularize(word)
            for word in str(TextBlob(sem_rl[0]['C-A1'])).split()
            if word not in stopwords
        ]

        blob = TextBlob(sem_rl[0]['A1'])
        nounPhrases = blob.noun_phrases.singularize()
        sr_verb_concept = self.annotator.getAnnotations(sem_rl[0]['A1'])['srl']

        concat_noun_concepts = set(
            sum([word.split() for word in nounPhrases], []))
        predicates = list(set(canon) - concat_noun_concepts)

        return canon
Example #29
def compute_POS_Tags(corpus):
    #POS=[]
    fi = open(read_property('POS_features_test_coarse_path'), "w")
    annotator = Annotator()
    for sentence in corpus:
        pos_seq = annotator.getAnnotations(sentence)['pos']
        #print pos_seq
        pos_tags = ""
        for pos in pos_seq:
            pos_tags = pos_tags + pos[1] + " "
        fi.write(pos_tags + "\n")
        print pos_tags  ###############
        #POS.append(pos_tags)
    #print "The bag of words of POS is ",POS
    fi.close()
Example #30
class Worker_NLP:
    '''
    worker class for NLP
    '''
    nThreads = 4
    pool = None
    ith = 0
    tokenizer_sent = None
    practNLP_annotator = None
    sent_tokenizers = None
    annotators = None

    def __init__(self):

        self.practNLP_annotator = Annotator()
        self.tokenizer_sent = nltk.tokenize.sent_tokenize
        self.pool = Pool(self.nThreads)
        self.sent_tokenizers = []
        self.annotators = []
        for i in xrange(self.nThreads):
            self.sent_tokenizers.append(nltk.tokenize.sent_tokenize)
            self.annotators.append(Annotator())

    def getSRL(self, rawTxt):
        sents = sent_tokenize(rawTxt)
        srls = []
        for sent in sents:
            srl = self.practNLP_annotator.getAnnotations(sent)['srl']
            srls.append(srl)
        return srls

    def getSRL_parallel(self, rawTxts):
        tasks = []  #[rawTxt, tokenizer, annotator]
        for i, rawTxt in enumerate(rawTxts):
            tokenizer_tmp = self.sent_tokenizers[self.ith]
            annotator_tmp = self.annotators[self.ith]
            self.ith = (self.ith + 1) % self.nThreads
            tasks.append((rawTxt, tokenizer_tmp, annotator_tmp))

        results = []
        for task in tasks:
            result = self.pool.apply_async(compute_task_atom, args=task)
            results.append(result)

        srls_results = [item.get() for item in results]
        return srls_results
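compute_task_atom is referenced in getSRL_parallel but not included in this snippet; the following is only an illustrative stand-in for the (rawTxt, tokenizer, annotator) task tuple it receives, mirroring getSRL above:

def compute_task_atom(rawTxt, tokenizer, annotator):
    # Sentence-split the raw text and collect one SRL frame list per sentence,
    # using the worker-local tokenizer and annotator passed in the task tuple.
    srls = []
    for sent in tokenizer(rawTxt):
        srls.append(annotator.getAnnotations(sent)['srl'])
    return srls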
Example #31
 def readfile(self, num_of_lines):
     n = 0
     annotator = Annotator()
     with open(self.path + self.file_name) as f:
         reader = csv.reader(f)
         for l in reader:
             if n < num_of_lines:
                 line = l[0].split("\t")
                 heading = line[0]
                 true_passage = line[1]
                 false_passage = line[2:]
                 document = Document(heading, true_passage, false_passage)
                 anno = annotator.getAnnotations(
                     document.get_parsed_heading()[0])
                 pprint(anno)
                 n += 1
             else:
                 break
Example #32
    def get_filtered_events(self, doc_id, raw_text, entity):

        filtered_events = []
        sentences = nltk.sent_tokenize(raw_text)
        annotator = Annotator()

        # for each sentence determine if it pertains to the entity
        for sentence in sentences:
            # get semantic role labelling data for each sentence
            events = self.get_filtered_verbs_with_vecs(
                doc_id, list(annotator.getAnnotations(sentence)['srl']),
                entity)
            # print events.__len__()
            if events.__len__() > 0:
                filtered_events.extend(events)

        # return list of events
        return filtered_events
Example #33
def test_tree2():

    annotator = Annotator()
    sent = "There are people dying make this world a better place for you and for me."
    sent = "Biplab is a good boy."
    sent = "He created the robot and broke it after making it."
    sent = "Bachelor 's degree in computer science , design or related field."
    result = annotator.getAnnotations(sent)
    tree_str = result['syntax_tree']
    print result['syntax_tree']
    print "--------------------"
    tree2 = Tree(tree_str)
    print len(tree2)
    print "--------------------"

    for item in tree2[0]:
        print type(item)
        print item
Example #34
def test_tree2():
    
    
    annotator=Annotator()
    sent = "There are people dying make this world a better place for you and for me."
    sent = "Biplab is a good boy." 
    sent = "He created the robot and broke it after making it."
    sent = "Bachelor 's degree in computer science , design or related field."    
    result = annotator.getAnnotations(sent)
    tree_str = result['syntax_tree']
    print result['syntax_tree']
    print "--------------------"
    tree2 = Tree(tree_str)
    print len(tree2)   
    print "--------------------"
    
    for item in tree2[0]:
        print type(item)
        print item
Example #35
def extract_wpcn_feature_using_senna(line):
    '''
    Annotate a sentence with SENNA and return per-word features.
    :param line: an English sentence
    :return:
        words_data : {word_id: {wid, word, pos, chunk_info, ner_info}}
    '''
    annotator = Annotator()
    feature = annotator.getAnnotations(line, dep_parse=False)
    words_data = {}
    for i in range(len(feature['words'])):
        words_data[i + 1] = {
            'wid': i + 1,
            'word': feature['words'][i],
            'chk': feature['chunk'][i][1],
            'ner': feature['ner'][i][1],
            'pos': feature['pos'][i][1]
        }
    return words_data
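A hedged usage sketch (hypothetical sentence; requires a working SENNA installation behind practnlptools):

data = extract_wpcn_feature_using_senna("Biplab is a good boy .")
for wid in sorted(data):
    row = data[wid]
    print("%d %s %s %s %s" % (row['wid'], row['word'], row['pos'], row['chk'], row['ner']))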
Example #36
def check_neg(text, keyword):

    annotator = Annotator()
    dep_parse = annotator.getAnnotations(text, dep_parse=True)['dep_parse']

    dp_list = dep_parse.split('\n')
    pattern = re.compile(r'.+?\((.+?), (.+?)\)')

    edges = []
    for dep in dp_list:
        m = pattern.search(dep)
        if m is None:  # skip lines that are not rel(head, dependent) entries
            continue
        word1 = m.group(1).split('-')[0]
        word2 = m.group(2).split('-')[0]
        # print word1, word2
        if (word1 == keyword and word2 in neg_words) or (word1 in neg_words
                                                         and word2 == keyword):
            return 1

    return 0
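A hedged usage sketch; neg_words is assumed here to be a module-level set (it is not shown in the snippet), and SENNA must be available:

neg_words = {"not", "no", "never", "n't"}
print(check_neg("The movie was not good", "good"))  # expected 1 when a negation word attaches to "good"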
Example #37
    def process(self, text):
        annotator = Annotator()
        tokens = annotator.getAnnotations(text)
        chunks = tokens['chunk']
        ner = tokens['ner']
        pos = tokens['pos']
        srl = tokens['srl']
        words = tokens['words']

        for part in srl:
            self.assign_objects(part)
        
        returnCode = ""
        print self.objects
        for object in self.objects:
            print self.get_item(object)
            if self.get_item(object) is not None:
                returnCode += str(self.get_item(object))

        return returnCode
Example #38
def preprocess(infile, outfile, posfile, index):
    annotator = Annotator()
    wnl = WordNetLemmatizer()
    o = open(outfile, 'w')
    p = open(posfile, 'w')
    f = open(infile)
    text = f.readlines()
    for s in text:
        s = s.strip().split("\t", index)
        # make it lower
        sent = s[index].lower()
        # remove special characters
        sent = sent.strip(string.punctuation)
        # extend contractions
        sent = re.sub(r"n't", " not", sent)
        sent = re.sub(r"'ve", " have", sent)
        sent = re.sub(r"'d", " would", sent)
        sent = re.sub(r"'ll", " will", sent)
        sent = re.sub(r"'m", " am", sent)
        sent = re.sub(r"'s", " is", sent)
        sent = re.sub(r"'re", " are", sent)

        # lematize and get POS tags
        pos = annotator.getAnnotations(sent)["pos"]
        lemmas = [wnl.lemmatize(w,'v') if t.startswith('V') else wnl.lemmatize(w, 'n') for (w,t) in pos]
        sent = " ".join(lemmas)
        pos = " ".join([x[1] for x in pos])

        out_string = ""
        pos_string = ""
        for j in range(0,index):
            out_string += s[j] + "\t"
            pos_string += s[j] + "\t"

        out_string += sent + "\n"
        pos_string += pos + "\n"
        o.write(out_string)
        p.write(pos_string)

    f.close()
    o.close()
    p.close()
Example #39
def generate_shortestpath(sent, left_term, left_start, right_term, right_start):
    annotator = Annotator()
    #print "========before:",left_term,left_start,right_term,right_start

    '''
    left_sent=" ".join(sent.split()[:left_start])
    right_sent=" ".join(sent.split()[:right_start])
    if re.search("[A-Za-z]-[A-Za-z]",left_term):
        info=left_term.split("-")
        left_term=info[-1]
        #left_start=left_start-1
    if re.search("[A-Za-z]-[A-Za-z]",right_term):
        info=right_term.split("-")
        right_term=info[-1]
       # right_start=right_start-1
    '''
    right_term=re.sub('\(','LRB',right_term)
    right_term=re.sub('\)','RRB',right_term)
    left_term=re.sub('\(','LRB',left_term)
    left_term=re.sub('\)','RRB',left_term)
    new_left_start,new_left_term,new_right_start,new_right_term=update_loc(sent,left_start,right_start)



    '''
    # adjust start coordination # denpendency parser wil split " - " and "'s"

   # print "left: ", left_sent
   # print "right: ",right_sent

    poss=re.compile('\w+\'s')
    conj=re.compile('[A-Za-z]+\-[A-Za-z]+')
    comma=re.compile('\w,')

    result_poss_left=poss.findall(left_sent)
    result_conj_left=conj.findall(left_sent)
    result_comma_left=comma.findall(left_sent)
    left_start=left_start+len(result_conj_left)*2+len(result_poss_left)+len(result_comma_left)
    #print "left:",len(result_conj_left),len(result_poss_left)

    result_poss_right=poss.findall(right_sent)
    result_conj_right=conj.findall(right_sent)
    result_comma_right=comma.findall(right_sent)
    right_start=right_start+len(result_conj_right)*2+len(result_poss_right)+len(result_comma_right)
   # print "right:", len(result_conj_right),len(result_poss_right),len(result_comma_right)

    print "after:",left_term,left_start,right_term,right_start
    '''

    left=new_left_term+"-"+str(new_left_start)
    right=new_right_term+"-"+str(new_right_start)
    #print "=====",left,right
    #sent=re.sub('\-',' - ',sent)
    #print sent

    sent=re.sub('\(','LRB',sent)
    sent=re.sub('\)','RRB',sent)

    #print sent
    annotations = annotator.getAnnotations(sent, dep_parse=True)  # annotate once, reuse twice
    dep_parse = annotations['dep_parse']
    tree = annotations['syntax_tree']
    #print dep_parse
    dp_list = dep_parse.split('\n')
    pattern = re.compile(r'.+?\((.+?), (.+?)\)')
    edges = []
    for dep in dp_list:

        #print dep

        m = pattern.search(dep)
        if m:
            edges.append((m.group(1), m.group(2)))
    graph = nx.Graph(edges)
    #print right
    if right not in graph.nodes():
        print "right",left_term, right_term
        return "right"
    if left not in graph.nodes():
        print "left", left_term, right_term
        return "left"
    shortest_path = nx.shortest_path_length(graph, source=left, target=right)
    # print
    return shortest_path
Example #40
from practnlptools.tools import Annotator

annotator = Annotator()

fp = open("input_captions.txt", "r")
fw = open("output_captions.txt", "w")
data = fp.read().strip().split("\n")

for caption in data:
    out = annotator.getAnnotations(caption, dep_parse=True)
    pos = out['pos']
    ner = out['ner']
    srl = out['srl']

    replace_phrase = ''
    for i in srl:
        if 'A0' in i:
            replace_phrase = i['A0']
        elif 'A1' in i:
            replace_phrase = i['A1']
        break

    if len(replace_phrase) == 0:
        continue

    if not caption.startswith(replace_phrase):
        continue

    filler1 = 'What'
    filler2 = 'What is'
    filler3 = 'Who'
Example #41
writer.writerow(('A0', 'A1', 'V', 'fileName'))
for filename in os.listdir(path):
    print 'reading', filename
    text_file = open(path + filename, "r")
    file_content = text_file.readlines()
    data.append((filename, file_content))
    text_file.close()

    #Annotating the stripped text

    for t1 in file_content:
        for text in t1.split("."):
            text = text.strip()
            if text:
                try:
                    x = annotator.getAnnotations(text)['srl']
                    for x_tuple in x:
                        a0 = None
                        a1 = None
                        v = None
                        for item in x_tuple:
                            if item == 'A0':
                                a0 = x_tuple['A0']
                            if item == 'A1':
                                a1 = x_tuple['A1']
                            if item == 'V':
                                if stemmer.stem(
                                        x_tuple['V']) in stemmed_desired_verbs:
                                    v = x_tuple['V']
                        if (a0 is not None and a1 is not None
                                and v is not None):
Example #42
import bllipparser
from bllipparser import RerankingParser
from nltk import Tree
from practnlptools.tools import Annotator

score = 0


annotator = Annotator()

rrp = RerankingParser.fetch_and_load('WSJ-PTB3', verbose=True)

query = "Describe steps taken and worldwide reaction prior to introduction of the Euro on January 1, 1999."
candidate = "Europe's new currency, the euro, will rival the U.S. dollar as an international currency over the long term, Der Speigel magazine reported Sunday."

qListOfDict = annotator.getAnnotations(query)['srl']
cListOfDict = annotator.getAnnotations(candidate)['srl']


qParsed = ['(S1 ']
cParsed = ['(S1 ']

for list in qListOfDict:
	






Example #43
writer.writerow( ('A0', 'A1', 'V', 'fileName'))
for filename in os.listdir(path):
    print 'reading', filename
    text_file = open(path + filename,"r")
    file_content = text_file.readlines()
    data.append((filename,file_content))
    text_file.close()

    #Annotating the stripped text

    for t1 in file_content:
        for text in t1.split("."):
            text = text.strip()
            if text:
                try:
                    x = annotator.getAnnotations(text)['srl']
                    for x_tuple in x:
                        a0 = None
                        a1 = None
                        v = None 
                        for item in x_tuple:
                            if item == 'A0':
                                a0 = x_tuple['A0']
                            if item == 'A1':
                                a1 = x_tuple['A1']
                            if item == 'V':
                                if stemmer.stem(x_tuple['V']) in stemmed_desired_verbs:
                                    v = x_tuple['V']
                        if (a0 is not None and a1 is not None and v is not None):
                            writer.writerow( (x_tuple['A0'], x_tuple['A1'], x_tuple['V'], filename))
                except Exception as e:
Example #44
#!/usr/bin/env python
from practnlptools.tools import Annotator
import sys
annotator=Annotator()

if __name__=='__main__':
    for line in sys.stdin:
        if line[0]=="#":
            continue
        line=line.strip()
        annotations=annotator.getAnnotations(line)
        print annotations['srl']
Example #45
def printQ(questions,line,i):
    for Qtype in questions:
        for question_answer in questions[Qtype]:
            print line[0]
            print question_answer[0].lower() + " " +"?"
            print question_answer[1].lower()
            print Qtype
            print

if __name__=='__main__':
    for (i,line) in enumerate(sys.stdin,1):
        if line[0]=="#":
            continue
        line=line.strip().rstrip('.').split('\t')
        if len(line)>1:
            annotations=annotator.getAnnotations(line[1])
        else:
            annotations=annotator.getAnnotations(line[0])
        srl=annotations['srl']
        pos=annotations['pos']
        ner=annotations['ner']
        if not srl:
            #print 'semantic role labeling failed'
            continue
        #print srl
        pos_d=dict(pos)
        questions=defaultdict(list)
        for item in srl:
            generate(item,questions)
        printQ(questions,line,i)
    log.close()
Example #46
__author__ = 'hz'
from practnlptools.tools import Annotator

text = "Disclosed is an organic light-emitting diode (OLED) display panel. An OLED display panel includes a plurality of signal lines and a thin film transistor formed on a substrate, an interlayer insulating layer, a first electrode, a bank, an organic light-emitting layer, a second electrode, a first passivation layer, an organic layer, a second passivation layer and a barrier film, wherein the bank is formed to completely cover the interlayer insulating layer, and an inclination formed by side surfaces of the bank and the interlayer insulating layer is made to be gradual."

# text = "Disclosed is an organic light-emitting diode (OLED) display panel."
# semantic role labelling
text = 'Unlike the classic PPP technique, in our new approach, the ionospheric-free linear combination is not used.'
annotator = Annotator()
result = annotator.getAnnotations( text )["srl"]

print( type(result) )
print( result )
Example #47
'''
Python Script for SENNA functionality
Uses PracNLPTools for Semantic Role Labeling
NOT USED
'''

import csv
import re
from practnlptools.tools import Annotator
annotator=Annotator()

print("Running Shallow Semantic Parser")
patternForSymbol = re.compile(r'(\ufeff)', re.U)
comments=[]
#reads in CSV file
with open('Dataset/dataset.csv','rb') as dataFile:
    reader = csv.reader(dataFile, delimiter=',')
    for row in reader:
        #row[0] = row[0].decode('utf-8')
        rowEdited = re.sub(patternForSymbol, '', row[0])
        comment = rowEdited if rowEdited != "" else row[0]
        sentiment = row[1]
        comments.append((comment, sentiment))


for index,comment in enumerate(comments):
	if(index<100):
		print comment[0]
		print(annotator.getAnnotations(comment[0])['srl'])
		print("==========================")
Example #48
def update_loc(sent,left_start,right_start):
    annotator = Annotator()
    sent=re.sub('\(','LRB',sent)
    sent=re.sub('\)','RRB',sent)
    words=sent.split()
    #print words[left_start],words[right_start]
    words[left_start]=words[left_start]+'aaaaa'
    words[right_start]=words[right_start]+'bbbbb'
  #  print words

    sent=' '.join(words)
    tags=annotator.getAnnotations(sent)
   # print "===", tags

   # print "chunks:      ", tags['chunk']
    i=0
    pre_word=''
    pre_pre_word=''
    j=0
    left_term = ''
    right_term = ''


    for word in tags['chunk']:
        i+=1
        left_pattern='^(.*)aaaaa$'
        right_pattern='^(.*)bbbbb$'
        left=re.search(left_pattern,word[0])
        right=re.search(right_pattern,word[0])


        if left:
            #print "ttleft"
            left_term=left.group(1)
            left_start=i
            if left_term=='':
                left_term=pre_word
                left_start=left_start-1
                j=1
                if pre_word=='-':
                    left_term=pre_pre_word
                    left_start=left_start-1


        if right:
            #print "rightright"
            right_term=right.group(1)
            right_start=i
            if right_term=='':
                right_term=pre_word
                right_start=right_start-1
                j=2
                if pre_word=='-':
                    right_term=pre_pre_word
                    right_start=right_start-1

        pre_pre_word=pre_word
        pre_word=word[0]
    if j==1:
        if right_start>left_start:
            right_start=right_start-1
    if j==2:
        if left_start>right_start:
            left_start=left_start-1
    #print j
    #print "=++++", left_start,left_term,right_start,right_term
    return (left_start,left_term,right_start,right_term)
Example #49
def test2(): 
    sent = "There are people dying make this world a better place for you and for me."
    
    annotator=Annotator()    
    result = annotator.getAnnotations(sent,dep_parse=True)
    print result