def extract_feature_using_senna(line):
    '''
    Takes an English sentence in and returns per-word feature data
    :param line: an English sentence
    :return:
        data : [word, number, pos, chunk_info, ner_info, parent_number]
    '''
    annotator = Annotator()
    feature = annotator.getAnnotations(line, dep_parse=True)

    if feature['dep_parse'] == '':
        return -2
    a = feature['dep_parse'].split('\n')
    words_data = {}
    d = []
    for i in a:
        # strip the relation name and trailing ')', leaving "parent-id, child-id"
        dep = re.sub(r'^[^\(]+\(|\)$', '', i)
        try:
            p, c = dep.split(', ')
        except ValueError:
            continue  # skip malformed dependency lines
        try:
            t1 = p.split('-')
            pid = int(t1[-1])
            t2 = c.split('-')
            wid = int(t2[-1])
        except ValueError:
            # collapsed dependencies mark copied nodes with a prime, e.g. "word-3'"
            if re.match(r"[\d]+'", t1[-1]):
                pid = int(re.sub(r"'", '', t1[-1])) + 0.1
                t2 = c.split('-')
                wid = int(t2[-1])
            elif re.match(r"[\d]+'", t2[-1]):
                pass
            continue
        d.append((wid, pid))
    d, _ = remove_dep(list(d))
    for wid, pid in d:
        add_ids(words_data, wid, pid)
    for i in range(len(feature['words'])):
        if i + 1 not in words_data:
            words_data[i + 1] = {
                'wid': i + 1,
                'pid': -1,
                'word': feature['words'][i],
                'chk': feature['chunk'][i][1],
                'ner': feature['ner'][i][1],
                'pos': feature['pos'][i][1]
            }
        else:
            words_data[i + 1]['word'] = feature['words'][i]
            words_data[i + 1]['chk'] = feature['chunk'][i][1]
            words_data[i + 1]['ner'] = feature['ner'][i][1]
            words_data[i + 1]['pos'] = feature['pos'][i][1]
    words_data['syntax_tree'] = feature['syntax_tree']
    words_data['tree'] = feature['tree']
    words_data['verbs'] = feature['verbs']
    words_data['srl'] = feature['srl']
    # Global.accepted += 1
    return words_data
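A short usage sketch follows; it assumes SENNA and practnlptools are installed and that remove_dep and add_ids (used above but not shown here) are available, and it only prints the shape of the returned dictionary.

data = extract_feature_using_senna("Biplab is a good boy.")
if data == -2:
    print("dependency parse unavailable for this sentence")
else:
    # per-word entries are keyed by 1-based token index
    for idx in sorted(k for k in data if isinstance(k, int)):
        w = data[idx]
        print((w.get('wid'), w.get('word'), w.get('pos'), w.get('chk'), w.get('ner'), w.get('pid')))
    # sentence-level annotations sit under string keys
    print(data['verbs'])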
Example #2
def test_tree4():

    annotator = Annotator()
    sent = "There are people dying make this world a better place for you and for me."
    sent = "Biplab is a good boy."
    sent = "He created the robot and broke it after making it."
    sent = "Bachelor 's degree in computer science , design or related field."
    sent = "B.S. in Computer Science , a related degree or its equivalent"
    sent = "BS , MS , or PhD in Computer Science or a similar field preferred"
    sent = "Computer Science or related technical degree from an accredited four year university "
    sent = "Degree in Computer Science or Engineering with a high GPA ."
    sent = "A Master's degree in Computer Science or Engineering is mandatory ."

    sent = "A Computer Science or related degree "
    sent = "I love science and SciFi book"
    sent = "I love music and SciFi book"

    result = annotator.getAnnotations(sent)
    tree_str = result['syntax_tree']
    print
    print tree_str

    tree = Tree.fromstring(tree_str)[0]
    print
    print "Root label=", tree.label()
    tree.draw()
Example #3
    def getword2vec(self, raw_text):
        sentences = nltk.sent_tokenize(raw_text)
        annotator = Annotator()
        counter = 0
        doc_vec = []
        for sentence in sentences:
            # get semantic role labelling data for each sentence
            srl = list(annotator.getAnnotations(sentence)['srl'])
            word2vec = []
            # get the event structure for each sentence
            for s in srl:
                if 'V' in s:
                    # print s['V']
                    word2vec = self.getvector(s['V'])
                    # print word2vec
                else:
                    print 'No verb found in sentence'
                    return
                if 'A0' in s:
                    # print s['A0']
                    word2vec = self.addVectors(word2vec,
                                               self.getvector(s['A0']))

                if 'A1' in s:
                    # print s['A1']
                    word2vec = self.addVectors(word2vec,
                                               self.getvector(s['A1']))
            if counter == 0:
                doc_vec = word2vec
            else:
                doc_vec = self.addVectors(doc_vec, word2vec)
            counter = counter + 1
        return doc_vec  # note: the original snippet built doc_vec but never returned it
Example #4
def get_shortest_path(a, b):
	text = a + " " + b  # keep the last word of a and the first word of b from merging

	annotator = Annotator()
	dep_parse = annotator.getAnnotations(text, dep_parse=True)['dep_parse']

	dp_list = dep_parse.split('\n')
	pattern = re.compile(r'.+?\((.+?), (.+?)\)')
	edges = []
	
	for dep in dp_list:
		m = pattern.search(dep)
		if m:  # skip lines the pattern does not match (e.g. a trailing empty line)
			edges.append((m.group(1), m.group(2)))
	
	graph = nx.Graph(edges)  
	
	shortest_paths = [] 
	
	a = a.strip()
	b = b.strip()
	
	a = a.split()
	b = b.split()
	
	for i in a: 
		for j in b: 
			shortest_paths.append(nx.shortest_path_length(graph, source=i, target=j))
	
	print(shortest_paths)
Example #5
def test_tree4():   
    
    annotator=Annotator()
    sent = "There are people dying make this world a better place for you and for me."
    sent = "Biplab is a good boy." 
    sent = "He created the robot and broke it after making it."
    sent = "Bachelor 's degree in computer science , design or related field."    
    sent = "B.S. in Computer Science , a related degree or its equivalent"    
    sent = "BS , MS , or PhD in Computer Science or a similar field preferred"
    sent = "Computer Science or related technical degree from an accredited four year university "
    sent = "Degree in Computer Science or Engineering with a high GPA ."    
    sent = "A Master's degree in Computer Science or Engineering is mandatory ."
    
    sent = "A Computer Science or related degree "
    sent = "I love science and SciFi book"
    sent = "I love music and SciFi book"
   
    result = annotator.getAnnotations(sent)
    tree_str = result['syntax_tree']
    print     
    print tree_str
    
    tree = Tree.fromstring(tree_str)[0]
    print
    print "Root label=",tree.label()
    tree.draw()
def compute_POS(line):
    annotator = Annotator()
    pos = annotator.getAnnotations(line)['pos']
    pos_tag = []
    for p in pos:
        pos_tag.append(p[1])
    return pos_tag
Example #7
def test1():
    annotator=Annotator()
    sent = "There are people dying make this world a better place for you and for me."
    result = annotator.getAnnotations(sent)

    #print result
    print type(result)
    print result.keys()
Example #8
def test_deep():
    annotator=Annotator()
    sent = "There are people dying make this world a better place for you and for me."
    sent = "Biplab is a good boy." 
    sent = "He created the robot and broke it after making it."
    result = annotator.getAnnotations(sent,dep_parse=True)

    print result['dep_parse']
Example #9
 def srl(self, sen_dest):
     ant = Annotator()
     if sen_dest.upper().split()[0] == "UP":
         v = sen_dest.upper().split()
         v[0] = "RAISE"
         sen_dest = " ".join(v)  # rejoin the words; str(v) would embed the list's brackets
     sen_srl = ant.getAnnotations(sen_dest)['syntax_tree']
     return sen_srl
def get_annotations(question):
    annotator = Annotator()
    annotations = annotator.getAnnotations(question)
    srl = annotations['srl']
    verbs = annotations['verbs']
    ner = annotations['ner']
    chunk = annotations['chunk']
    return srl, verbs, ner, chunk
Example #11
def test1():
    annotator = Annotator()
    sent = "There are people dying make this world a better place for you and for me."
    result = annotator.getAnnotations(sent)

    #print result
    print type(result)
    print result.keys()
Example #12
 def annotation(self, n):
     parsed_heading = self.get_parsed_heading()
     annotator = Annotator()
     try:
         annotation = annotator.getAnnotations(parsed_heading[n])
         return annotation
     except:
         pass
Example #13
def test_deep():
    annotator = Annotator()
    sent = "There are people dying make this world a better place for you and for me."
    sent = "Biplab is a good boy."
    sent = "He created the robot and broke it after making it."
    result = annotator.getAnnotations(sent, dep_parse=True)

    print result['dep_parse']
Example #14
def semantic_role_label():
    lemmatizer = WordNetLemmatizer()
    verbs_target = ["ensnare", "infect", "use", "target"]
    verbs_tool = ["attack"]

    # sent = "Mirai-Based Masuta Botnet Weaponizes Old Router Vulnerability. By Ionut Arghire on January 24, 2018. inShare. A new piece of malware based on Mirai's publicly released source code has been observed at large, ensnaring devices into a botnet, targeted Internet of Things. Known as Masuta, the botnet has at least two variants at large, and is believed to be the work of a well-known IoT threat actor, NewSky Security says. What?s also unique to the botnet is that it exploits an old router vulnerability, being the first threat known to weaponize it in a botnet campaign. Masuta (Japanese for master) botnet?s source code was found on an invite only dark forum. The malware?s configuration file, the researchers discovered, uses a different seed of the cipher key compared to Mirai, having the strings in the configuration files XORed by 0x45. Thus, the researchers discovered that it uses the domain nexusiotsolutions(dot)net, the command and control (C&C) server that Nexus Zeta, the individual involved in the recent Satori attacks, uses. The domain was registered using the nexuszeta1337@gmail(.)com email address. Thus, NewSky Security suggests that Nexus Zeta has been involved in the creation of the Masuta botnet, in addition to building Satori, the Mirai variant that has been wreaking havoc over the past couple of months. In fact, Masuta isn?t new either, and attacks involving it have been steadily increasing since September, and the botnet?s standard variant has been observed using several known/weak/default credentials to compromise IoT devices. An evolved variant of Masuta, called PureMasuta, contains the most typical of Mirai style code, and a list of weak credentials to use. What makes this malware variant stand out, however, is its usage of EDB 38722 D-Link exploit. The exploit PureMasuta uses resides in the HNAP (Home Network Administration Protocol), which is based on the SOAP protocol. It is possible to craft a SOAP query to bypass authentication by using hxxp://purenetworks.com/HNAP1/GetDeviceSettings, and improper string handling can lead to arbitrary code execution, and an attacker can abuse this combination of issues to run code on targeted devices. What the botnet does is to download a shell script from the C&C server and run it. Thus, the malware author first bypasses authentication and then executes code on the targeted devices. The PureMasuta variant uses the same C&C server (93.174.93.63) as the original Masuta variant, which led the researchers to believe it is the evolved creation of the same threat actor. Nexus Zeta is no stranger when it comes to implementing SOAP related exploits. The threat actor has already been observed in implementing two other known SOAP related exploits, CVE-2014-8361 and CVE-2017-17215 in his Satori botnet project, NewSky Security notes. Thus, the TR-069 bug and EDB 38722 are the third and fourth SOAP related exploits abused by IoT botnets. Protocol exploits are more desirable for threat actors as they usually have a wider scope. A protocol can be implemented by various vendors/models and a bug in the protocol itself can get carried on to a wider range of devices, the researchers conclude."

    # sent = "Mirai, the infamous botnet used in the recent massive distributed denial of service (DDoS) attacks against Brian Krebs' blog and Dyn's DNS infrastructure, has ensnared Internet of Things (IoT) devices in 164 countries, researchers say."

    if len(sys.argv) != 2:
        print("NOPE")
        exit()

    fh = open(sys.argv[1], "r")
    sent = fh.read()
    # sent = sys.argv[1]

    target = ""
    tools = ""

    for s in [i.strip() for i in sent.split(".")]:
        a = Annotator()
        b = a.getAnnotations(s.encode('utf-8'))

        dictlist = b['srl']
        for dict in dictlist:
            if 'V' in dict:
                if lemmatizer.lemmatize(dict["V"].lower(),
                                        'v') in verbs_target:
                    temp1 = temp2 = ""
                    if "A0" in dict and not dict['A0'] == "":
                        temp1 += dict['A0']
                    # temp += " :|: "
                    if "A1" in dict and not dict['A1'] == "":
                        temp2 += dict['A1']
                    if not temp1 == "":
                        temp1 = getTools(temp1)
                        tools += temp1 + ":-----:"
                    if not temp2 == "":
                        temp2 = getTargets(temp2)
                        target += temp2 + ":-----:"
                if lemmatizer.lemmatize(dict["V"].lower(), 'v') in verbs_tool:
                    temp = ""
                    if "A0" in dict and not dict['A0'] == "":
                        temp += dict['A0']
                    # temp += "|"
                    # if "A1" in dict:
                    #     temp += dict['A1']
                    if not temp == "":
                        temp = getTools(temp)
                        tools += temp + ":-----:"

        # print("SemanticRoleLabel :::: {}".format(b['srl']))
        # print("2nd:\n{}".format([x, y] for x,y in [b['ner']]))

    print("Target :::: " + target)
    print("Tools :::: " + tools)
Example #15
def complexQuery(term):
    #test = json.load(urllib2.urlopen("http://www.freesound.org/apiv2/search/text/?query="+term+"&token=06mS7W2OiXidVC2tQ4ikMfe3nomU7rBptaJgBCvp"))
    #test2 = json.load(urllib2.urlopen("https://api.jamendo.com/v3.0/tracks/?client_id=4cb8fab9&format=jsonpretty&name="+term))

    annotator = Annotator()
    dep_parse = annotator.getAnnotations(term, dep_parse=True)['dep_parse']
    dp_list = dep_parse.split('\n')

    #spotlightTerms = WordNet.spotlightSearch(term)
    #print "spotlight terms %s" %spotlightTerms
    #print "dp list  %s" %dp_list

    spotlightTerms = spotlight.annotate(
        'http://spotlight.sztaki.hu:2222/rest/annotate',
        term,
        confidence=0.3,
        support=20,
        spotter='Default')
    #print term, '\t', spotlightTerms[1].get('URI')
    #print spotlightTerms[0].get('URI')
    secondDep = ""
    query = []

    for prep in dp_list:
        elementPrep = "prep"
        if elementPrep in prep:
            print("We found preposition1: %s" %
                  prep[prep.find("_") + 1:prep.find("(")])
            prepType = prep[prep.find("_") + 1:prep.find("(")]
            print("We found preposition2: %s" %
                  prep[prep.find(" ") + 1:prep.find(")")])
            secondDep = prep[prep.find(" ") + 1:prep.find(")")].split("-")
            print secondDep[0]
            query.append(prepType)
            query.append(secondDep[0])
            if prepType == "like":
                results = DBPedia.dpbediaQuery(prepType, secondDep[0])
            else:
                results = DBPedia.dpbediaQuery(prepType,
                                               spotlightTerms[1].get('URI'))
            print results

    for query in results:

        test = json.load(
            urllib2.urlopen(
                "http://www.freesound.org/apiv2/search/text/?query=" + query +
                "&token=06mS7W2OiXidVC2tQ4ikMfe3nomU7rBptaJgBCvp"))
        test2 = json.load(
            urllib2.urlopen(
                "https://api.jamendo.com/v3.0/tracks/?client_id=4cb8fab9&format=jsonpretty&name="
                + query))

    print(test)
    #print(test2)

    return test, test2
Example #16
    def __init__(self):

        self.practNLP_annotator = Annotator()
        self.tokenizer_sent = nltk.tokenize.sent_tokenize
        self.pool = Pool(self.nThreads)
        self.sent_tokenizers = []
        self.annotators = []
        for i in xrange(self.nThreads):
            self.sent_tokenizers.append(nltk.tokenize.sent_tokenize)
            self.annotators.append(Annotator())
def compute_Chunks(corpus):
    Chunk_Tags = []
    annotator = Annotator()
    for sentence in corpus:
        chunks = annotator.getAnnotations(sentence)['chunk']
        chunk = ""
        for elem in chunks:
            chunk = chunk + elem[1] + " "
        Chunk_Tags.append(chunk)
    return Chunk_Tags
def compute_Chunks(corpus):
    Chunk_Tags = []
    annotator = Annotator()
    for sentence in corpus:
        chunks = annotator.getAnnotations(sentence)['chunk']
        chunk = ""
        for elem in chunks:
            chunk = chunk + elem[1] + " "
        Chunk_Tags.append(chunk)
    return Chunk_Tags
Example #19
    def SRLAnnotation( self, sentence ):
        """
        Use SENNA library to perform SRL(semantic role labelling) on specific sentence.

        :param sentence: the specific sentence to be handled
        :type sentence: str
        :return: one dict per predicate, mapping SRL labels (e.g. 'V', 'A0', 'A1') to phrases
        :rtype: list(dict)
        """
        annotator = Annotator()
        return annotator.getAnnotations( sentence )["srl"]
Example #20
def compute_Chunks(corpus):
    Chunk_Tags = []
    annotator = Annotator()
    #for sentence in corpus:
    chunks = annotator.getAnnotations(corpus)['chunk']
    chunk = ""
    for elem in chunks:
        chunk = chunk + elem[1] + " "
    # print chunk  # uncomment to see what these chunks look like
    Chunk_Tags.append(chunk)
    return Chunk_Tags
Example #21
def test3():
    annotator=Annotator()
    sent = "There are people dying make this world a better place for you and for me."
    sent = "Biplab is a good boy." 
    sent = "He created the robot and broke it after making it."
    result = annotator.getAnnotations(sent)

    print result["pos"]
    print result['ner']
    print result['chunk']
    print result['verbs']
    print result['srl']
Example #22
def test3():
    annotator = Annotator()
    sent = "There are people dying make this world a better place for you and for me."
    sent = "Biplab is a good boy."
    sent = "He created the robot and broke it after making it."
    result = annotator.getAnnotations(sent)

    print result["pos"]
    print result['ner']
    print result['chunk']
    print result['verbs']
    print result['srl']
Example #23
def compute_NER(corpus):
    NER = []
    fi = open(read_property('NER_features_test_coarse_path'), "w")
    annotator = Annotator()
    for sentence in corpus:
        ners = annotator.getAnnotations(sentence)['ner']
        ner = ""
        for elem in ners:
            ner = ner + elem[1] + " "
        print ner
        fi.write(ner + "\n")
        NER.append(ner)
    return NER
Example #24
def draw_tree():

    annotator = Annotator()
    result = annotator.getAnnotations(sent11)
    tree_str = result['syntax_tree']
    print
    # print tree_str

    tree = Tree.fromstring(tree_str)[0]
    print tree.pprint()
    print
    print "Root label=", tree.label()
    tree.draw()
Example #25
def draw_tree():   
    
    
    annotator=Annotator()
    result = annotator.getAnnotations(sent11)
    tree_str = result['syntax_tree']
    print     
   # print tree_str
    
    tree = Tree.fromstring(tree_str)[0]
    print tree.pprint()
    print
    print "Root label=",tree.label()
    tree.draw()
Example #26
def compute_Chunks(corpus):
    #Chunk_Tags=[]
    fi = open(read_property('Chunk_features_test_path'), "w")
    annotator = Annotator()
    for sentence in corpus:
        chunks = annotator.getAnnotations(sentence)['chunk']
        chunk = ""
        for elem in chunks:
            chunk = chunk + elem[1] + " "
        print chunk
        fi.write(chunk + "\n")
        #Chunk_Tags.append(chunk)
    #print "The bag of words for Chunks is ",Chunk_Tags
    fi.close()
def compute_Chunks(corpus):
    #Chunk_Tags=[]
    fi = open(read_property('Chunk_features_train_path'), "w")
    annotator = Annotator()
    for sentence in corpus:
        chunks = annotator.getAnnotations(sentence)['chunk']
        chunk = ""
        for elem in chunks:
            chunk = chunk + elem[1] + " "
        #print chunk
        fi.write(chunk + "\n")
        #Chunk_Tags.append(chunk)
    #print "The bag of words for Chunks is ",Chunk_Tags
    fi.close()
Example #28
def compute_POS_Tags(corpus):
    #POS=[]
    fi = open(read_property('POS_features_test_coarse_path'), "w")
    annotator = Annotator()
    for sentence in corpus:
        pos_seq = annotator.getAnnotations(sentence)['pos']
        #print pos_seq
        pos_tags = ""
        for pos in pos_seq:
            pos_tags = pos_tags + pos[1] + " "
        fi.write(pos_tags + "\n")
        print pos_tags  ###############
        #POS.append(pos_tags)
    #print "The bag of words of POS is ",POS
    fi.close()
Example #29
def rawfile_to_sentencefile_dir():

    indir = sys.argv[1]
    outdir = sys.argv[2]
    counter = 1
    try:
        os.makedirs(outdir)
    except:
        print('dir existed')
        pass
    time_start = time.time()
    annotator = Annotator()
    a = os.listdir(indir)
    part_data = os.listdir(indir)[int(sys.argv[3]):int(sys.argv[4])]  # divide the data into several parts; set the start and end indices here
    #part_data = os.listdir(indir)[0:8]
    for fname in part_data:
        if os.path.splitext(fname)[1] == '.summary':
           if not os.path.exists(os.path.join(outdir, fname.split('.')[0]+'.summary'+'.new')): #determine whether the file has been processed
                print(fname)
                #time_start = time.time()
                rawfile_to_sentencefile(annotator,os.path.join(indir, fname), os.path.join(outdir, fname+'.new'))
                counter = counter+1
                #time_end = time.time()
                #print('totally cost', time_end - time_start, 'Number', counter)
           else:
               print('skip', fname )

    time_end = time.time()
    print('totally cost: ', time_end - time_start, 'file number: ', counter-1)
Example #30
    def __init__(self, sentence, question, answer, nlp, srl=None):
        if srl == None:
            self.ascii_sentence = unicodedata.normalize('NFKD',
                                                        sentence).encode(
                                                            'ascii', 'ignore')
            self.ascii_question = unicodedata.normalize('NFKD',
                                                        question).encode(
                                                            'ascii', 'ignore')
            self.ascii_answer = unicodedata.normalize('NFKD', answer).encode(
                'ascii', 'ignore')
            self.annotator = Annotator()
            self.srl = self.annotator.getAnnotations(
                self.ascii_sentence)['srl']
            self.answer_srl_label = self.set_answer_srl_label()
        else:
            self.srl = srl

        self.nlp = nlp
        self.raw_sentence = sentence
        self.raw_question = question
        self.raw_answer = answer
        self.spacy_sent = self.nlp(self.raw_sentence)
        self.spacy_ques = self.nlp(self.raw_question)
        self.answer_length = self.set_answer_length()
        self.spacy_answer = self.set_spacy_answer()
        self.answer_pos = self.set_answer_pos()
        self.answer_ner = self.set_answer_ner()
        self.answer_ner_iob = self.set_answer_ner_iob()
        self.answer_depth = self.set_answer_depth()
        self.answer_word_count = self.set_answer_word_count()
        self.all_pos_tags = [
            'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD',
            'NN', 'NNS', 'NNP', 'NNPS', 'PDT', 'POS', 'PRP', 'PRP$', 'RB',
            'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN',
            'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB', 'PUNCT'
        ]
        self.all_ner_tags = [
            'PERSON', 'NORP', 'FACILITY', 'ORG', 'GPE', 'LOC', 'PRODUCT',
            'EVENT', 'WORK_OF_ART', 'LANGUAGE', 'DATE', 'TIME', 'PERCENT',
            'MONEY', 'QUANTITY', 'ORDINAL', 'CARDINAL'
        ]
        self.all_srl_labels = [
            'V', 'A0', 'A1', 'A2', 'C-arg', 'R-arg', 'AM-ADV', 'AM-DIR',
            'AM-DIS', 'AM-EXT', 'AM-LOC', 'AM-MNR', 'AM-MOD', 'AM-NEG',
            'AM-PNC', 'AM-PRD', 'AM-PRP', 'AM-REC', 'AM-TMP'
        ]
def detector(mail,outputfileName):

	annotator = Annotator()
	outputfile = open(outputfileName, 'w')  # the original wrote to an undefined 'outputfile'; open it from the parameter
	sentences = nltk.sent_tokenize(mail)
	probableActionItemSentences = []
	for sentence in sentences:
		text = nltk.word_tokenize(sentence)
		posTags = nltk.pos_tag(text)
		for tags in posTags:
			if tags[1]=="VB":
				probableActionItemSentences.append(sentence)
				break
	for sentence in probableActionItemSentences:
		srLabels = annotator.getAnnotations(sentence)['srl']
		#print(srLabels)
		depParsedContent = annotator.getAnnotations(sentence,dep_parse=True)
		#print(depParsedContent)
		root = depParsedContent['dep_parse']
		root = root[root.find('root('):]
		root = root[:root.find('\n')]
		root = root[root.find(',')+2:root.rfind('-')]
		parsedList = depParsedContent['srl']
		owner = None
		ownerFound = False
		for parsedMap in parsedList:
			if 'V' in parsedMap and parsedMap['V'] == root:
				if 'A0' in parsedMap:
					owner = parsedMap['A0']
					ownerFound = True
				else:
					owner = 'You'
					ownerFound = True
				break
		if not ownerFound:
			for parsedMap in parsedList:
				if 'A0' in parsedMap:
					ownerFound = True
					if parsedMap['A0'].lower() == 'you' or parsedMap['A0'].lower() == 'we' or parsedMap['A0'].lower() == 'us':
						owner = parsedMap['A0']
						ownerFound = True
						break
		if owner is None:
			print("")
		else:
			outputfile.write("OWNER : " + owner + " SENTENCE : " + sentence + "\n")
	outputfile.close()
Example #32
    def get_filtered_events(self, doc_id, raw_text, entity):

        filtered_events = []
        sentences = nltk.sent_tokenize(raw_text)
        annotator = Annotator()

        # for each sentence determine if it pertains to the entity
        for sentence in sentences:
            # get semantic role labelling data for each sentence
            events = self.get_filtered_verbs_with_vecs(
                doc_id, list(annotator.getAnnotations(sentence)['srl']),
                entity)
            # print events.__len__()
            if len(events) > 0:
                filtered_events.extend(events)

        # return list of events
        return filtered_events
Example #33
 def readfile(self, num_of_lines):
     n = 0
     annotator = Annotator()
     with open(self.path + self.file_name) as f:
         reader = csv.reader(f)
         for l in reader:
             if n < num_of_lines:
                 line = l[0].split("\t")
                 heading = line[0]
                 true_passage = line[1]
                 false_passage = line[2:]
                 document = Document(heading, true_passage, false_passage)
                 anno = annotator.getAnnotations(
                     document.get_parsed_heading()[0])
                 pprint(anno)
                 n += 1
             else:
                 break
Example #34
def test_tree2():

    annotator = Annotator()
    sent = "There are people dying make this world a better place for you and for me."
    sent = "Biplab is a good boy."
    sent = "He created the robot and broke it after making it."
    sent = "Bachelor 's degree in computer science , design or related field."
    result = annotator.getAnnotations(sent)
    tree_str = result['syntax_tree']
    print result['syntax_tree']
    print "--------------------"
    tree2 = Tree.fromstring(tree_str)  # Tree() no longer parses strings directly in recent NLTK
    print len(tree2)
    print "--------------------"

    for item in tree2[0]:
        print type(item)
        print item
Example #35
def extract_wpcn_feature_using_senna(line):
    '''
    Takes an English sentence in and returns per-word feature data
    :param line: an English sentence
    :return:
        data : [word, number, pos, chunk_info, ner_info]
    '''
    annotator = Annotator()
    feature = annotator.getAnnotations(line, dep_parse=False)
    words_data = {}
    for i in range(len(feature['words'])):
        words_data[i + 1] = {
            'wid': i + 1,
            'word': feature['words'][i],
            'chk': feature['chunk'][i][1],
            'ner': feature['ner'][i][1],
            'pos': feature['pos'][i][1]
        }
    return words_data
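A brief usage sketch, again assuming SENNA and practnlptools are installed; it pulls just the word/POS pairs out of the per-word dictionary returned above.

wpcn = extract_wpcn_feature_using_senna("He created the robot and broke it after making it.")
print([(wpcn[i]['word'], wpcn[i]['pos']) for i in sorted(wpcn)])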
Example #36
def check_neg(text, keyword):

    annotator = Annotator()
    dep_parse = annotator.getAnnotations(text, dep_parse=True)['dep_parse']

    dp_list = dep_parse.split('\n')
    pattern = re.compile(r'.+?\((.+?), (.+?)\)')

    edges = []
    for dep in dp_list:
        m = pattern.search(dep)
        if not m:
            continue  # skip lines the pattern does not match
        word1 = m.group(1).split('-')[0]
        word2 = m.group(2).split('-')[0]
        # print word1, word2
        if (word1 == keyword and word2 in neg_words) or (word1 in neg_words
                                                         and word2 == keyword):
            return 1

    return 0
Example #37
def test_tree2():
    
    
    annotator=Annotator()
    sent = "There are people dying make this world a better place for you and for me."
    sent = "Biplab is a good boy." 
    sent = "He created the robot and broke it after making it."
    sent = "Bachelor 's degree in computer science , design or related field."    
    result = annotator.getAnnotations(sent)
    tree_str = result['syntax_tree']
    print result['syntax_tree']
    print "--------------------"
    tree2 = Tree.fromstring(tree_str)  # Tree() no longer parses strings directly in recent NLTK
    print len(tree2)   
    print "--------------------"
    
    for item in tree2[0]:
        print type(item)
        print item
Example #38
    def process(self, text):
        annotator = Annotator()
        tokens = annotator.getAnnotations(text)
        chunks = tokens['chunk']
        ner = tokens['ner']
        pos = tokens['pos']
        srl = tokens['srl']
        words = tokens['words']

        for part in srl:
            self.assign_objects(part)
        
        returnCode = ""
        print self.objects
        for object in self.objects:
            print self.get_item(object)
            if self.get_item(object) is not None:
                returnCode += str(self.get_item(object))

        return returnCode
Example #39
def preprocess(infile, outfile, posfile, index):
    annotator = Annotator()
    wnl = WordNetLemmatizer()
    o = open(outfile, 'w'); p = open(posfile, 'w'); f = open(infile)
    text = f.readlines()
    for s in text:
        s = s.strip().split("\t", index)
        # make it lower
        sent = s[index].lower()
        # remove special characters
        sent = sent.strip(string.punctuation)
        # extend contractions
        sent = re.sub(r"n't", " not", sent)
        sent = re.sub(r"'ve", " have", sent)
        sent = re.sub(r"'d", " would", sent)
        sent = re.sub(r"'ll", " will", sent)
        sent = re.sub(r"'m", " am", sent)
        sent = re.sub(r"'s", " is", sent)
        sent = re.sub(r"'re", " are", sent)

        # lematize and get POS tags
        pos = annotator.getAnnotations(sent)["pos"]
        lemmas = [wnl.lemmatize(w,'v') if t.startswith('V') else wnl.lemmatize(w, 'n') for (w,t) in pos]
        sent = " ".join(lemmas)
        pos = " ".join([x[1] for x in pos])

        out_string = ""
        pos_string = ""
        for j in range(0,index):
            out_string += s[j] + "\t"
            pos_string += s[j] + "\t"

        out_string += sent + "\n"
        pos_string += pos + "\n"
        o.write(out_string)
        p.write(pos_string)

    f.close()
    o.close()
    p.close()
Example #40
class srlGraph(Base):
    def __init__(self, docpath):
        super(srlGraph, self).__init__(docpath)
        self.stopwords = stopwords.words("english")
        self.annotator = Annotator()

    def srl_corpus_extraction(self, stopwords=None):

        sem_rl = self.annotator.getAnnotations(self.corpus,
                                               dep_parse=True)['srl']
        srl_corpus = [
            self.annotator.getAnnotations(doc)['srl'] for doc in self.corpus
        ]
        return srl_corpus

    def srl_document_extraction(self, document_id=0, stopwords=None):
        return self.annotator.getAnnotations(self.corpus[document_id],
                                             dep_parse=True)['srl']

    def get_doc_canon(self, document_id):
        sem_rl = self.annotator.getAnnotations(self.corpus[document_id],
                                               dep_parse=True)['srl']
        canon = [
            en.singularize(word)
            for word in str(TextBlob(sem_rl[0]['C-A1'])).split()
            if word not in stopwords
        ]

        blob = TextBlob(sem_rl[0]['A1'])
        nounPhrases = blob.noun_phrases.singularize()
        sr_verb_concept = self.annotator.getAnnotations(sem_rl[0]['A1'])['srl']

        concat_noun_concepts = set(
            sum([word.split() for word in nounPhrases], []))
        predicates = list(set(canon) - concat_noun_concepts)

        return canon
Example #41
class Worker_NLP:
    '''
    worker class for NLP
    '''
    nThreads = 4
    pool = None
    ith = 0
    tokenizer_sent = None
    practNLP_annotator = None
    sent_tokenizers = None
    annotators = None

    def __init__(self):

        self.practNLP_annotator = Annotator()
        self.tokenizer_sent = nltk.tokenize.sent_tokenize
        self.pool = Pool(self.nThreads)
        self.sent_tokenizers = []
        self.annotators = []
        for i in xrange(self.nThreads):
            self.sent_tokenizers.append(nltk.tokenize.sent_tokenize)
            self.annotators.append(Annotator())

    def getSRL(self, rawTxt):
        sents = sent_tokenize(rawTxt)
        srls = []
        for sent in sents:
            srl = self.practNLP_annotator.getAnnotations(sent)['srl']
            srls.append(srl)
        return srls

    def getSRL_parallel(self, rawTxts):
        tasks = []  #[rawTxt, tokenizer, annotator]
        for i, rawTxt in enumerate(rawTxts):
            tokenizer_tmp = self.sent_tokenizers[self.ith]
            annotator_tmp = self.annotators[self.ith]
            self.ith = (self.ith + 1) % self.nThreads
            tasks.append((rawTxt, tokenizer_tmp, annotator_tmp))

        results = []
        for task in tasks:
            result = self.pool.apply_async(compute_task_atom, args=task)
            results.append(result)

        srls_results = [item.get() for item in results]
        return srls_results
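A minimal usage sketch for the worker class, assuming compute_task_atom (referenced in getSRL_parallel but not shown here) is defined at module level so the pooled processes can reach it.

worker = Worker_NLP()
texts = ["Biplab is a good boy. He created the robot.",
         "There are people dying make this world a better place for you and for me."]
print(worker.getSRL(texts[0]))        # sequential: one SRL list per sentence
print(worker.getSRL_parallel(texts))  # parallel: one result per input text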
Example #42
#!usr/bin/python
'''
Python Script for SENNA functionality
Uses PractNLPTools for Semantic Role Labeling
NOT USED 
'''

import csv
import re
from practnlptools.tools import Annotator
annotator=Annotator()

print("Running Shallow Semantic Parser")
patternForSymbol = re.compile(r'(\ufeff)', re.U)
comments=[]
#reads in CSV file
with open('Dataset/dataset.csv','rb') as dataFile:
    reader = csv.reader(dataFile, delimiter=',')
    for row in reader:
        #row[0] = row[0].decode('utf-8')
        rowEdited = re.sub(patternForSymbol, '', row[0])
        comment = rowEdited if rowEdited != "" else row[0]
        sentiment = row[1]
        comments.append((comment, sentiment))


for index, comment in enumerate(comments):
    if index < 100:
        print comment[0]
        print(annotator.getAnnotations(comment[0])['srl'])
        print("==========================")
Example #43
__author__ = 'hz'
from practnlptools.tools import Annotator

text = "Disclosed is an organic light-emitting diode (OLED) display panel. An OLED display panel includes a plurality of signal lines and a thin film transistor formed on a substrate, an interlayer insulating layer, a first electrode, a bank, an organic light-emitting layer, a second electrode, a first passivation layer, an organic layer, a second passivation layer and a barrier film, wherein the bank is formed to completely cover the interlayer insulating layer, and an inclination formed by side surfaces of the bank and the interlayer insulating layer is made to be gradual."

# text = "Disclosed is an organic light-emitting diode (OLED) display panel."
# semantic role labelling
text = 'Unlike the classic PPP technique, in our new approach, the ionospheric-free linear combination is not used.'
annotator = Annotator()
result = annotator.getAnnotations( text )["srl"]

print( type(result) )
print( result )
 def _fit(self, sent_list_of_str, dep_parse):
     '''Return annotations for a list of sentence strings, as a list of dicts.
     dep_parse toggles the optional dependency-parse step (it is slow).'''
     annotator = Annotator()
     return annotator.getBatchAnnotations(sent_list_of_str, dep_parse)
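A minimal sketch of the batch call this method wraps, assuming practnlptools and SENNA are installed; getBatchAnnotations returns one annotation dict per input sentence, mirroring getAnnotations.

from practnlptools.tools import Annotator

annotator = Annotator()
sents = ["Biplab is a good boy.",
         "He created the robot and broke it after making it."]
for annotation in annotator.getBatchAnnotations(sents, dep_parse=False):
    print(annotation['srl'])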
Example #45
def test2(): 
    sent = "There are people dying make this world a better place for you and for me."
    
    annotator=Annotator()    
    result = annotator.getAnnotations(sent,dep_parse=True)
    print  result
Example #46
#Pre-defining a set of desired verbs

path = "C:/Users/hp/Desktop/bigdata/txt/"
data = []
verb_file = open("./set3.txt", "rw+") 
desired_verbs = verb_file.readlines()

#Performing Stemming

stemmed_desired_verbs=[]
stemmer=stem.snowball.EnglishStemmer()

for word in desired_verbs:
    stemmed_desired_verbs.append(stemmer.stem(word.strip()))  # readlines() keeps the trailing newline
   
annotator=Annotator()

#Implementation of Semantic Role Labelling

f = open('out.csv', 'wt')
csv.register_dialect('lineterminator',lineterminator='\n')
writer = csv.writer(f, dialect = csv.get_dialect('lineterminator'))
writer.writerow( ('A0', 'A1', 'V', 'fileName'))
for filename in os.listdir(path):
    print 'reading', filename
    text_file = open(path + filename,"r")
    file_content = text_file.readlines()
    data.append((filename,file_content))
    text_file.close()

#Annotating the stripped text
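The example is truncated at this point; below is a purely hypothetical sketch of the annotation loop the comment announces, reusing only names defined above (data, annotator, stemmer, stemmed_desired_verbs, writer, f) and assuming the SRL dicts carry 'A0', 'A1' and 'V' keys as in the other examples on this page.

for filename, file_content in data:
    for line in file_content:
        line = line.strip()
        if not line:
            continue
        for frame in annotator.getAnnotations(line)['srl']:
            verb = frame.get('V', '')
            if verb and stemmer.stem(verb) in stemmed_desired_verbs:
                writer.writerow((frame.get('A0', ''), frame.get('A1', ''), verb, filename))
f.close()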
Example #47
def generate_shortestpath (sent,left_term,left_start,right_term,right_start):
    annotator = Annotator()
    #print "========before:",left_term,left_start,right_term,right_start

    '''
    left_sent=" ".join(sent.split()[:left_start])
    right_sent=" ".join(sent.split()[:right_start])
    if re.search("[A-Za-z]-[A-Za-z]",left_term):
        info=left_term.split("-")
        left_term=info[-1]
        #left_start=left_start-1
    if re.search("[A-Za-z]-[A-Za-z]",right_term):
        info=right_term.split("-")
        right_term=info[-1]
       # right_start=right_start-1
    '''
    right_term=re.sub('\(','LRB',right_term)
    right_term=re.sub('\)','RRB',right_term)
    left_term=re.sub('\(','LRB',left_term)
    left_term=re.sub('\)','RRB',left_term)
    new_left_start,new_left_term,new_right_start,new_right_term=update_loc(sent,left_start,right_start)



    '''
    # adjust start coordination # denpendency parser wil split " - " and "'s"

   # print "left: ", left_sent
   # print "right: ",right_sent

    poss=re.compile('\w+\'s')
    conj=re.compile('[A-Za-z]+\-[A-Za-z]+')
    comma=re.compile('\w,')

    result_poss_left=poss.findall(left_sent)
    result_conj_left=conj.findall(left_sent)
    result_comma_left=comma.findall(left_sent)
    left_start=left_start+len(result_conj_left)*2+len(result_poss_left)+len(result_comma_left)
    #print "left:",len(result_conj_left),len(result_poss_left)

    result_poss_right=poss.findall(right_sent)
    result_conj_right=conj.findall(right_sent)
    result_comma_right=comma.findall(right_sent)
    right_start=right_start+len(result_conj_right)*2+len(result_poss_right)+len(result_comma_right)
   # print "right:", len(result_conj_right),len(result_poss_right),len(result_comma_right)

    print "after:",left_term,left_start,right_term,right_start
    '''

    left=new_left_term+"-"+str(new_left_start)
    right=new_right_term+"-"+str(new_right_start)
    #print "=====",left,right
    #sent=re.sub('\-',' - ',sent)
    #print sent

    sent=re.sub('\(','LRB',sent)
    sent=re.sub('\)','RRB',sent)

    #print sent
    annotations = annotator.getAnnotations(sent, dep_parse=True)  # annotate once rather than twice
    dep_parse = annotations['dep_parse']
    tree = annotations['syntax_tree']
    #print dep_parse
    dp_list = dep_parse.split('\n')
    pattern = re.compile(r'.+?\((.+?), (.+?)\)')
    edges = []
    for dep in dp_list:

        #print dep

        m = pattern.search(dep)
        if m:
            edges.append((m.group(1), m.group(2)))
    graph = nx.Graph(edges)
    #print right
    if right not in graph.nodes():
        print "right",left_term, right_term
        return "right"
    if left not in graph.nodes():
        print "left", left_term, right_term
        return "left"
    shortest_path = nx.shortest_path_length(graph, source=left, target=right)
    # print
    return shortest_path
Example #48
def build_d(cat_d):
    dir_path='lists'
    all_files=os.listdir(dir_path)
    for filename in all_files:
        filepath=dir_path+'/'+filename
        if os.path.isfile(filepath):
            cat=filename.strip().split('.')[0]
            with open(filepath,'r') as fp:
                for line in fp:
                    word=line.strip()
                    cat_d[word]=cat  

cat_d={}
build_d(cat_d)
annotator=Annotator()
wordnet_lemmatizer = WordNetLemmatizer()
alchemyapi = AlchemyAPI()
template={}
full_template=['AM-MOD','A0','AM-ADV', 'AM-NEG','V','C-V','AM-DIR','A1','A2','A3','A4','AM-PNC','AM-MNR','AM-LOC','AM-TMP','C-A1']
for item in full_template:
    copy_template=full_template[:]
    copy_template.remove(item)
    if 'AM-LOC' in copy_template:
        copy_template.remove('AM-LOC')
    if 'AM-TMP' in copy_template:
        copy_template.remove('AM-TMP')
    if 'AM-MNR' in copy_template:
        copy_template.remove('AM-MNR')
    template[item]=copy_template
Example #49
#!/usr/bin/env python
from practnlptools.tools import Annotator
import sys
annotator=Annotator()

if __name__=='__main__':
    for line in sys.stdin:
        if line[0]=="#":
            continue
        line=line.strip()
        annotations=annotator.getAnnotations(line)
        print annotations['srl']
Example #50
def update_loc(sent,left_start,right_start):
    annotator = Annotator()
    sent=re.sub('\(','LRB',sent)
    sent=re.sub('\)','RRB',sent)
    words=sent.split()
    #print words[left_start],words[right_start]
    words[left_start]=words[left_start]+'aaaaa'
    words[right_start]=words[right_start]+'bbbbb'
  #  print words

    sent=' '.join(words)
    tags=annotator.getAnnotations(sent)
   # print "===", tags

   # print "chunks:      ", tags['chunk']
    i=0
    pre_word=''
    pre_pre_word=''
    j=0
    left_term = ''
    right_term = ''


    for word in tags['chunk']:
        i+=1
        left_pattern='^(.*)aaaaa$'
        right_pattern='^(.*)bbbbb$'
        left=re.search(left_pattern,word[0])
        right=re.search(right_pattern,word[0])


        if left:
            #print "ttleft"
            left_term=left.group(1)
            left_start=i
            if left_term=='':
                left_term=pre_word
                left_start=left_start-1
                j=1
                if pre_word=='-':
                    left_term=pre_pre_word
                    left_start=left_start-1


        if right:
            #print "rightright"
            right_term=right.group(1)
            right_start=i
            if right_term=='':
                right_term=pre_word
                right_start=right_start-1
                j=2
                if pre_word=='-':
                    right_term=pre_pre_word
                    right_start=right_start-1

        pre_pre_word=pre_word
        pre_word=word[0]
    if j==1:
        if right_start>left_start:
            right_start=right_start-1
    if j==2:
        if left_start>right_start:
            left_start=left_start-1
    #print j
    #print "=++++", left_start,left_term,right_start,right_term
    return (left_start,left_term,right_start,right_term)