def extract_feature_using_senna(line):
    '''
    Takes an English sentence in and returns per-word feature data
    :param line: an English sentence
    :return:
        data : [word, number, pos, chunk_info, ner_info, parent_number]
    '''
    annotator = Annotator()
    feature = annotator.getAnnotations(line, dep_parse=True)

    if feature['dep_parse'] == '':
        return -2
    a = feature['dep_parse'].split('\n')
    words_data = {}
    d = []
    for i in a:
        # strip the relation name and trailing ')', leaving "parent-id, child-id"
        dep = re.sub(r'^[^\(]+\(|\)$', '', i)
        try:
            p, c = dep.split(', ')
        except ValueError:
            continue  # skip malformed dependency lines
        try:
            t1 = p.split('-')
            pid = int(t1[-1])
            t2 = c.split('-')
            wid = int(t2[-1])
        except ValueError:
            # collapsed dependencies mark copied nodes with a prime, e.g. "word-3'"
            if re.match(r"[\d]+'", t1[-1]):
                pid = int(re.sub(r"'", '', t1[-1])) + 0.1
                t2 = c.split('-')
                wid = int(t2[-1])
            elif re.match(r"[\d]+'", t2[-1]):
                pass
            continue
        d.append((wid, pid))
    d, _ = remove_dep(list(d))
    for wid, pid in d:
        add_ids(words_data, wid, pid)
    for i in range(len(feature['words'])):
        if i + 1 not in words_data:
            words_data[i + 1] = {
                'wid': i + 1,
                'pid': -1,
                'word': feature['words'][i],
                'chk': feature['chunk'][i][1],
                'ner': feature['ner'][i][1],
                'pos': feature['pos'][i][1]
            }
        else:
            words_data[i + 1]['word'] = feature['words'][i]
            words_data[i + 1]['chk'] = feature['chunk'][i][1]
            words_data[i + 1]['ner'] = feature['ner'][i][1]
            words_data[i + 1]['pos'] = feature['pos'][i][1]
    words_data['syntax_tree'] = feature['syntax_tree']
    words_data['tree'] = feature['tree']
    words_data['verbs'] = feature['verbs']
    words_data['srl'] = feature['srl']
    # Global.accepted += 1
    return words_data
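A short usage sketch follows; it assumes SENNA and practnlptools are installed and that remove_dep and add_ids (used above but not shown here) are available, and it only prints the shape of the returned dictionary.

data = extract_feature_using_senna("Biplab is a good boy.")
if data == -2:
    print("dependency parse unavailable for this sentence")
else:
    # per-word entries are keyed by 1-based token index
    for idx in sorted(k for k in data if isinstance(k, int)):
        w = data[idx]
        print((w.get('wid'), w.get('word'), w.get('pos'), w.get('chk'), w.get('ner'), w.get('pid')))
    # sentence-level annotations sit under string keys
    print(data['verbs'])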
Example #2
def test_tree4():

    annotator = Annotator()
    sent = "There are people dying make this world a better place for you and for me."
    sent = "Biplab is a good boy."
    sent = "He created the robot and broke it after making it."
    sent = "Bachelor 's degree in computer science , design or related field."
    sent = "B.S. in Computer Science , a related degree or its equivalent"
    sent = "BS , MS , or PhD in Computer Science or a similar field preferred"
    sent = "Computer Science or related technical degree from an accredited four year university "
    sent = "Degree in Computer Science or Engineering with a high GPA ."
    sent = "A Master's degree in Computer Science or Engineering is mandatory ."

    sent = "A Computer Science or related degree "
    sent = "I love science and SciFi book"
    sent = "I love music and SciFi book"

    result = annotator.getAnnotations(sent)
    tree_str = result['syntax_tree']
    print
    print tree_str

    tree = Tree.fromstring(tree_str)[0]
    print
    print "Root label=", tree.label()
    tree.draw()
Example #3
    def getword2vec(self, raw_text):
        sentences = nltk.sent_tokenize(raw_text)
        annotator = Annotator()
        counter = 0
        doc_vec = []
        for sentence in sentences:
            # get semantic role labelling data for each sentence
            srl = list(annotator.getAnnotations(sentence)['srl'])
            word2vec = []
            # get the event structure for each sentence
            for s in srl:
                if 'V' in s:
                    # print s['V']
                    word2vec = self.getvector(s['V'])
                    # print word2vec
                else:
                    print 'No verb found in sentence'
                    return
                if 'A0' in s:
                    # print s['A0']
                    word2vec = self.addVectors(word2vec,
                                               self.getvector(s['A0']))

                if 'A1' in s:
                    # print s['A1']
                    word2vec = self.addVectors(word2vec,
                                               self.getvector(s['A1']))
            if counter == 0:
                doc_vec = word2vec
            else:
                doc_vec = self.addVectors(doc_vec, word2vec)
            counter = counter + 1
        return doc_vec  # note: the original snippet built doc_vec but never returned it
Example #4
def get_shortest_path(a, b):
	text = a + " " + b  # keep the last word of a and the first word of b from merging

	annotator = Annotator()
	dep_parse = annotator.getAnnotations(text, dep_parse=True)['dep_parse']

	dp_list = dep_parse.split('\n')
	pattern = re.compile(r'.+?\((.+?), (.+?)\)')
	edges = []
	
	for dep in dp_list:
		m = pattern.search(dep)
		if m:  # skip lines the pattern does not match (e.g. a trailing empty line)
			edges.append((m.group(1), m.group(2)))
	
	graph = nx.Graph(edges)  
	
	shortest_paths = [] 
	
	a = a.strip()
	b = b.strip()
	
	a = a.split()
	b = b.split()
	
	for i in a: 
		for j in b: 
			shortest_paths.append(nx.shortest_path_length(graph, source=i, target=j))
	
	print(shortest_paths)
Example #5
def test_tree4():   
    
    annotator=Annotator()
    sent = "There are people dying make this world a better place for you and for me."
    sent = "Biplab is a good boy." 
    sent = "He created the robot and broke it after making it."
    sent = "Bachelor 's degree in computer science , design or related field."    
    sent = "B.S. in Computer Science , a related degree or its equivalent"    
    sent = "BS , MS , or PhD in Computer Science or a similar field preferred"
    sent = "Computer Science or related technical degree from an accredited four year university "
    sent = "Degree in Computer Science or Engineering with a high GPA ."    
    sent = "A Master's degree in Computer Science or Engineering is mandatory ."
    
    sent = "A Computer Science or related degree "
    sent = "I love science and SciFi book"
    sent = "I love music and SciFi book"
   
    result = annotator.getAnnotations(sent)
    tree_str = result['syntax_tree']
    print     
    print tree_str
    
    tree = Tree.fromstring(tree_str)[0]
    print
    print "Root label=",tree.label()
    tree.draw()
def compute_POS(line):
    annotator = Annotator()
    pos = annotator.getAnnotations(line)['pos']
    pos_tag = []
    for p in pos:
        pos_tag.append(p[1])
    return pos_tag
Example #7
def test1():
    annotator=Annotator()
    sent = "There are people dying make this world a better place for you and for me."
    result = annotator.getAnnotations(sent)

    #print result
    print type(result)
    print result.keys()
Example #8
def test_deep():
    annotator=Annotator()
    sent = "There are people dying make this world a better place for you and for me."
    sent = "Biplab is a good boy." 
    sent = "He created the robot and broke it after making it."
    result = annotator.getAnnotations(sent,dep_parse=True)

    print result['dep_parse']
Example #9
 def srl(self, sen_dest):
     ant = Annotator()
     if sen_dest.upper().split()[0] == "UP":
         v = sen_dest.upper().split()
         v[0] = "RAISE"
         sen_dest = " ".join(v)  # rejoin the words; str(v) would embed the list's brackets
     sen_srl = ant.getAnnotations(sen_dest)['syntax_tree']
     return sen_srl
def get_annotations(question):
    annotator = Annotator()
    annotations = annotator.getAnnotations(question)
    srl = annotations['srl']
    verbs = annotations['verbs']
    ner = annotations['ner']
    chunk = annotations['chunk']
    return srl, verbs, ner, chunk
Example #11
def test1():
    annotator = Annotator()
    sent = "There are people dying make this world a better place for you and for me."
    result = annotator.getAnnotations(sent)

    #print result
    print type(result)
    print result.keys()
Example #12
 def annotation(self, n):
     parsed_heading = self.get_parsed_heading()
     annotator = Annotator()
     try:
         annotation = annotator.getAnnotations(parsed_heading[n])
         return annotation
     except:
         pass
Example #13
def test_deep():
    annotator = Annotator()
    sent = "There are people dying make this world a better place for you and for me."
    sent = "Biplab is a good boy."
    sent = "He created the robot and broke it after making it."
    result = annotator.getAnnotations(sent, dep_parse=True)

    print result['dep_parse']
Example #14
def semantic_role_label():
    lemmatizer = WordNetLemmatizer()
    verbs_target = ["ensnare", "infect", "use", "target"]
    verbs_tool = ["attack"]

    # sent = "Mirai-Based Masuta Botnet Weaponizes Old Router Vulnerability. By Ionut Arghire on January 24, 2018. inShare. A new piece of malware based on Mirai's publicly released source code has been observed at large, ensnaring devices into a botnet, targeted Internet of Things. Known as Masuta, the botnet has at least two variants at large, and is believed to be the work of a well-known IoT threat actor, NewSky Security says. What?s also unique to the botnet is that it exploits an old router vulnerability, being the first threat known to weaponize it in a botnet campaign. Masuta (Japanese for master) botnet?s source code was found on an invite only dark forum. The malware?s configuration file, the researchers discovered, uses a different seed of the cipher key compared to Mirai, having the strings in the configuration files XORed by 0x45. Thus, the researchers discovered that it uses the domain nexusiotsolutions(dot)net, the command and control (C&C) server that Nexus Zeta, the individual involved in the recent Satori attacks, uses. The domain was registered using the nexuszeta1337@gmail(.)com email address. Thus, NewSky Security suggests that Nexus Zeta has been involved in the creation of the Masuta botnet, in addition to building Satori, the Mirai variant that has been wreaking havoc over the past couple of months. In fact, Masuta isn?t new either, and attacks involving it have been steadily increasing since September, and the botnet?s standard variant has been observed using several known/weak/default credentials to compromise IoT devices. An evolved variant of Masuta, called PureMasuta, contains the most typical of Mirai style code, and a list of weak credentials to use. What makes this malware variant stand out, however, is its usage of EDB 38722 D-Link exploit. The exploit PureMasuta uses resides in the HNAP (Home Network Administration Protocol), which is based on the SOAP protocol. It is possible to craft a SOAP query to bypass authentication by using hxxp://purenetworks.com/HNAP1/GetDeviceSettings, and improper string handling can lead to arbitrary code execution, and an attacker can abuse this combination of issues to run code on targeted devices. What the botnet does is to download a shell script from the C&C server and run it. Thus, the malware author first bypasses authentication and then executes code on the targeted devices. The PureMasuta variant uses the same C&C server (93.174.93.63) as the original Masuta variant, which led the researchers to believe it is the evolved creation of the same threat actor. Nexus Zeta is no stranger when it comes to implementing SOAP related exploits. The threat actor has already been observed in implementing two other known SOAP related exploits, CVE-2014-8361 and CVE-2017-17215 in his Satori botnet project, NewSky Security notes. Thus, the TR-069 bug and EDB 38722 are the third and fourth SOAP related exploits abused by IoT botnets. Protocol exploits are more desirable for threat actors as they usually have a wider scope. A protocol can be implemented by various vendors/models and a bug in the protocol itself can get carried on to a wider range of devices, the researchers conclude."

    # sent = "Mirai, the infamous botnet used in the recent massive distributed denial of service (DDoS) attacks against Brian Krebs' blog and Dyn's DNS infrastructure, has ensnared Internet of Things (IoT) devices in 164 countries, researchers say."

    if len(sys.argv) != 2:
        print("NOPE")
        exit()

    fh = open(sys.argv[1], "r")
    sent = fh.read()
    # sent = sys.argv[1]

    target = ""
    tools = ""

    for s in [i.strip() for i in sent.split(".")]:
        a = Annotator()
        b = a.getAnnotations(s.encode('utf-8'))

        dictlist = b['srl']
        for dict in dictlist:
            if 'V' in dict:
                if lemmatizer.lemmatize(dict["V"].lower(),
                                        'v') in verbs_target:
                    temp1 = temp2 = ""
                    if "A0" in dict and not dict['A0'] == "":
                        temp1 += dict['A0']
                    # temp += " :|: "
                    if "A1" in dict and not dict['A1'] == "":
                        temp2 += dict['A1']
                    if not temp1 == "":
                        temp1 = getTools(temp1)
                        tools += temp1 + ":-----:"
                    if not temp2 == "":
                        temp2 = getTargets(temp2)
                        target += temp2 + ":-----:"
                if lemmatizer.lemmatize(dict["V"].lower(), 'v') in verbs_tool:
                    temp = ""
                    if "A0" in dict and not dict['A0'] == "":
                        temp += dict['A0']
                    # temp += "|"
                    # if "A1" in dict:
                    #     temp += dict['A1']
                    if not temp == "":
                        temp = getTools(temp)
                        tools += temp + ":-----:"

        # print("SemanticRoleLabel :::: {}".format(b['srl']))
        # print("2nd:\n{}".format([x, y] for x,y in [b['ner']]))

    print("Target :::: " + target)
    print("Tools :::: " + tools)
Example #15
def complexQuery(term):
    #test = json.load(urllib2.urlopen("http://www.freesound.org/apiv2/search/text/?query="+term+"&token=06mS7W2OiXidVC2tQ4ikMfe3nomU7rBptaJgBCvp"))
    #test2 = json.load(urllib2.urlopen("https://api.jamendo.com/v3.0/tracks/?client_id=4cb8fab9&format=jsonpretty&name="+term))

    annotator = Annotator()
    dep_parse = annotator.getAnnotations(term, dep_parse=True)['dep_parse']
    dp_list = dep_parse.split('\n')

    #spotlightTerms = WordNet.spotlightSearch(term)
    #print "spotlight terms %s" %spotlightTerms
    #print "dp list  %s" %dp_list

    spotlightTerms = spotlight.annotate(
        'http://spotlight.sztaki.hu:2222/rest/annotate',
        term,
        confidence=0.3,
        support=20,
        spotter='Default')
    #print term, '\t', spotlightTerms[1].get('URI')
    #print spotlightTerms[0].get('URI')
    secondDep = ""
    query = []

    for prep in dp_list:
        elementPrep = "prep"
        if elementPrep in prep:
            print("We found preposition1: %s" %
                  prep[prep.find("_") + 1:prep.find("(")])
            prepType = prep[prep.find("_") + 1:prep.find("(")]
            print("We found preposition2: %s" %
                  prep[prep.find(" ") + 1:prep.find(")")])
            secondDep = prep[prep.find(" ") + 1:prep.find(")")].split("-")
            print secondDep[0]
            query.append(prepType)
            query.append(secondDep[0])
            if prepType == "like":
                results = DBPedia.dpbediaQuery(prepType, secondDep[0])
            else:
                results = DBPedia.dpbediaQuery(prepType,
                                               spotlightTerms[1].get('URI'))
            print results

    for query in results:

        test = json.load(
            urllib2.urlopen(
                "http://www.freesound.org/apiv2/search/text/?query=" + query +
                "&token=06mS7W2OiXidVC2tQ4ikMfe3nomU7rBptaJgBCvp"))
        test2 = json.load(
            urllib2.urlopen(
                "https://api.jamendo.com/v3.0/tracks/?client_id=4cb8fab9&format=jsonpretty&name="
                + query))

    print(test)
    #print(test2)

    return test, test2
Example #16
    def __init__(self):

        self.practNLP_annotator = Annotator()
        self.tokenizer_sent = nltk.tokenize.sent_tokenize
        self.pool = Pool(self.nThreads)
        self.sent_tokenizers = []
        self.annotators = []
        for i in xrange(self.nThreads):
            self.sent_tokenizers.append(nltk.tokenize.sent_tokenize)
            self.annotators.append(Annotator())
def compute_Chunks(corpus):
    Chunk_Tags = []
    annotator = Annotator()
    for sentence in corpus:
        chunks = annotator.getAnnotations(sentence)['chunk']
        chunk = ""
        for elem in chunks:
            chunk = chunk + elem[1] + " "
        Chunk_Tags.append(chunk)
    return Chunk_Tags
def compute_Chunks(corpus):
    Chunk_Tags = []
    annotator = Annotator()
    for sentence in corpus:
        chunks = annotator.getAnnotations(sentence)['chunk']
        chunk = ""
        for elem in chunks:
            chunk = chunk + elem[1] + " "
        Chunk_Tags.append(chunk)
    return Chunk_Tags
Example #19
    def SRLAnnotation( self, sentence ):
        """
        Use SENNA library to perform SRL(semantic role labelling) on specific sentence.

        :param sentence: the specific sentence to be handled
        :type sentence: str
        :return: one dict per predicate, mapping SRL labels (e.g. 'V', 'A0', 'A1') to phrases
        :rtype: list(dict)
        """
        annotator = Annotator()
        return annotator.getAnnotations( sentence )["srl"]
Example #20
def compute_Chunks(corpus):
    Chunk_Tags = []
    annotator = Annotator()
    #for sentence in corpus:
    chunks = annotator.getAnnotations(corpus)['chunk']
    chunk = ""
    for elem in chunks:
        chunk = chunk + elem[1] + " "
    # print chunk  # uncomment to see what these chunks look like
    Chunk_Tags.append(chunk)
    return Chunk_Tags
Example #21
def test3():
    annotator=Annotator()
    sent = "There are people dying make this world a better place for you and for me."
    sent = "Biplab is a good boy." 
    sent = "He created the robot and broke it after making it."
    result = annotator.getAnnotations(sent)

    print result["pos"]
    print result['ner']
    print result['chunk']
    print result['verbs']
    print result['srl']
Example #22
def test3():
    annotator = Annotator()
    sent = "There are people dying make this world a better place for you and for me."
    sent = "Biplab is a good boy."
    sent = "He created the robot and broke it after making it."
    result = annotator.getAnnotations(sent)

    print result["pos"]
    print result['ner']
    print result['chunk']
    print result['verbs']
    print result['srl']
Example #23
def compute_NER(corpus):
    NER = []
    fi = open(read_property('NER_features_test_coarse_path'), "w")
    annotator = Annotator()
    for sentence in corpus:
        ners = annotator.getAnnotations(sentence)['ner']
        ner = ""
        for elem in ners:
            ner = ner + elem[1] + " "
        print ner
        fi.write(ner + "\n")
        NER.append(ner)
    return NER
Example #24
def draw_tree():

    annotator = Annotator()
    result = annotator.getAnnotations(sent11)
    tree_str = result['syntax_tree']
    print
    # print tree_str

    tree = Tree.fromstring(tree_str)[0]
    print tree.pprint()
    print
    print "Root label=", tree.label()
    tree.draw()
Example #25
def draw_tree():   
    
    
    annotator=Annotator()
    result = annotator.getAnnotations(sent11)
    tree_str = result['syntax_tree']
    print     
   # print tree_str
    
    tree = Tree.fromstring(tree_str)[0]
    print tree.pprint()
    print
    print "Root label=",tree.label()
    tree.draw()
Example #26
def compute_Chunks(corpus):
    #Chunk_Tags=[]
    fi = open(read_property('Chunk_features_test_path'), "w")
    annotator = Annotator()
    for sentence in corpus:
        chunks = annotator.getAnnotations(sentence)['chunk']
        chunk = ""
        for elem in chunks:
            chunk = chunk + elem[1] + " "
        print chunk
        fi.write(chunk + "\n")
        #Chunk_Tags.append(chunk)
    #print "The bag of words for Chunks is ",Chunk_Tags
    fi.close()
def compute_Chunks(corpus):
    #Chunk_Tags=[]
    fi = open(read_property('Chunk_features_train_path'), "w")
    annotator = Annotator()
    for sentence in corpus:
        chunks = annotator.getAnnotations(sentence)['chunk']
        chunk = ""
        for elem in chunks:
            chunk = chunk + elem[1] + " "
        #print chunk
        fi.write(chunk + "\n")
        #Chunk_Tags.append(chunk)
    #print "The bag of words for Chunks is ",Chunk_Tags
    fi.close()
Example #28
def compute_POS_Tags(corpus):
    #POS=[]
    fi = open(read_property('POS_features_test_coarse_path'), "w")
    annotator = Annotator()
    for sentence in corpus:
        pos_seq = annotator.getAnnotations(sentence)['pos']
        #print pos_seq
        pos_tags = ""
        for pos in pos_seq:
            pos_tags = pos_tags + pos[1] + " "
        fi.write(pos_tags + "\n")
        print pos_tags  ###############
        #POS.append(pos_tags)
    #print "The bag of words of POS is ",POS
    fi.close()
Example #29
def rawfile_to_sentencefile_dir():

    indir = sys.argv[1]
    outdir = sys.argv[2]
    counter = 1
    try:
        os.makedirs(outdir)
    except:
        print('dir existed')
        pass
    time_start = time.time()
    annotator = Annotator()
    a = os.listdir(indir)
    part_data = os.listdir(indir)[int(sys.argv[3]):int(sys.argv[4])]  # divide the data into several parts; set the start and end indices here
    #part_data = os.listdir(indir)[0:8]
    for fname in part_data:
        if os.path.splitext(fname)[1] == '.summary':
           if not os.path.exists(os.path.join(outdir, fname.split('.')[0]+'.summary'+'.new')): #determine whether the file has been processed
                print(fname)
                #time_start = time.time()
                rawfile_to_sentencefile(annotator,os.path.join(indir, fname), os.path.join(outdir, fname+'.new'))
                counter = counter+1
                #time_end = time.time()
                #print('totally cost', time_end - time_start, 'Number', counter)
           else:
               print('skip', fname )

    time_end = time.time()
    print('totally cost: ', time_end - time_start, 'file number: ', counter-1)
Example #30
    def __init__(self, sentence, question, answer, nlp, srl=None):
        if srl == None:
            self.ascii_sentence = unicodedata.normalize('NFKD',
                                                        sentence).encode(
                                                            'ascii', 'ignore')
            self.ascii_question = unicodedata.normalize('NFKD',
                                                        question).encode(
                                                            'ascii', 'ignore')
            self.ascii_answer = unicodedata.normalize('NFKD', answer).encode(
                'ascii', 'ignore')
            self.annotator = Annotator()
            self.srl = self.annotator.getAnnotations(
                self.ascii_sentence)['srl']
            self.answer_srl_label = self.set_answer_srl_label()
        else:
            self.srl = srl

        self.nlp = nlp
        self.raw_sentence = sentence
        self.raw_question = question
        self.raw_answer = answer
        self.spacy_sent = self.nlp(self.raw_sentence)
        self.spacy_ques = self.nlp(self.raw_question)
        self.answer_length = self.set_answer_length()
        self.spacy_answer = self.set_spacy_answer()
        self.answer_pos = self.set_answer_pos()
        self.answer_ner = self.set_answer_ner()
        self.answer_ner_iob = self.set_answer_ner_iob()
        self.answer_depth = self.set_answer_depth()
        self.answer_word_count = self.set_answer_word_count()
        self.all_pos_tags = [
            'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD',
            'NN', 'NNS', 'NNP', 'NNPS', 'PDT', 'POS', 'PRP', 'PRP$', 'RB',
            'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN',
            'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB', 'PUNCT'
        ]
        self.all_ner_tags = [
            'PERSON', 'NORP', 'FACILITY', 'ORG', 'GPE', 'LOC', 'PRODUCT',
            'EVENT', 'WORK_OF_ART', 'LANGUAGE', 'DATE', 'TIME', 'PERCENT',
            'MONEY', 'QUANTITY', 'ORDINAL', 'CARDINAL'
        ]
        self.all_srl_labels = [
            'V', 'A0', 'A1', 'A2', 'C-arg', 'R-arg', 'AM-ADV', 'AM-DIR',
            'AM-DIS', 'AM-EXT', 'AM-LOC', 'AM-MNR', 'AM-MOD', 'AM-NEG',
            'AM-PNC', 'AM-PRD', 'AM-PRP', 'AM-REC', 'AM-TMP'
        ]
def detector(mail,outputfileName):

	annotator = Annotator()
	outputfile = open(outputfileName, 'w')  # the original wrote to an undefined 'outputfile'; open it from the parameter
	sentences = nltk.sent_tokenize(mail)
	probableActionItemSentences = []
	for sentence in sentences:
		text = nltk.word_tokenize(sentence)
		posTags = nltk.pos_tag(text)
		for tags in posTags:
			if tags[1]=="VB":
				probableActionItemSentences.append(sentence)
				break
	for sentence in probableActionItemSentences:
		srLabels = annotator.getAnnotations(sentence)['srl']
		#print(srLabels)
		depParsedContent = annotator.getAnnotations(sentence,dep_parse=True)
		#print(depParsedContent)
		root = depParsedContent['dep_parse']
		root = root[root.find('root('):]
		root = root[:root.find('\n')]
		root = root[root.find(',')+2:root.rfind('-')]
		parsedList = depParsedContent['srl']
		owner = None
		ownerFound = False
		for parsedMap in parsedList:
			if 'V' in parsedMap and parsedMap['V'] == root:
				if 'A0' in parsedMap:
					owner = parsedMap['A0']
					ownerFound = True
				else:
					owner = 'You'
					ownerFound = True
				break
		if not ownerFound:
			for parsedMap in parsedList:
				if 'A0' in parsedMap:
					ownerFound = True
					if parsedMap['A0'].lower() == 'you' or parsedMap['A0'].lower() == 'we' or parsedMap['A0'].lower() == 'us':
						owner = parsedMap['A0']
						ownerFound = True
						break
		if owner is None:
			print("")
		else:
			outputfile.write("OWNER : " + owner + " SENTENCE : " + sentence + "\n")
	outputfile.close()
Example #32
    def get_filtered_events(self, doc_id, raw_text, entity):

        filtered_events = []
        sentences = nltk.sent_tokenize(raw_text)
        annotator = Annotator()

        # for each sentence determine if it pertains to the entity
        for sentence in sentences:
            # get semantic role labelling data for each sentence
            events = self.get_filtered_verbs_with_vecs(
                doc_id, list(annotator.getAnnotations(sentence)['srl']),
                entity)
            # print events.__len__()
            if len(events) > 0:
                filtered_events.extend(events)

        # return list of events
        return filtered_events
Example #33
 def readfile(self, num_of_lines):
     n = 0
     annotator = Annotator()
     with open(self.path + self.file_name) as f:
         reader = csv.reader(f)
         for l in reader:
             if n < num_of_lines:
                 line = l[0].split("\t")
                 heading = line[0]
                 true_passage = line[1]
                 false_passage = line[2:]
                 document = Document(heading, true_passage, false_passage)
                 anno = annotator.getAnnotations(
                     document.get_parsed_heading()[0])
                 pprint(anno)
                 n += 1
             else:
                 break
Example #34
def test_tree2():

    annotator = Annotator()
    sent = "There are people dying make this world a better place for you and for me."
    sent = "Biplab is a good boy."
    sent = "He created the robot and broke it after making it."
    sent = "Bachelor 's degree in computer science , design or related field."
    result = annotator.getAnnotations(sent)
    tree_str = result['syntax_tree']
    print result['syntax_tree']
    print "--------------------"
    tree2 = Tree.fromstring(tree_str)  # Tree() no longer parses strings directly in recent NLTK
    print len(tree2)
    print "--------------------"

    for item in tree2[0]:
        print type(item)
        print item
Example #35
def extract_wpcn_feature_using_senna(line):
    '''
    Takes an English sentence in and returns per-word feature data
    :param line: an English sentence
    :return:
        data : [word, number, pos, chunk_info, ner_info]
    '''
    annotator = Annotator()
    feature = annotator.getAnnotations(line, dep_parse=False)
    words_data = {}
    for i in range(len(feature['words'])):
        words_data[i + 1] = {
            'wid': i + 1,
            'word': feature['words'][i],
            'chk': feature['chunk'][i][1],
            'ner': feature['ner'][i][1],
            'pos': feature['pos'][i][1]
        }
    return words_data
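A brief usage sketch, again assuming SENNA and practnlptools are installed; it pulls just the word/POS pairs out of the per-word dictionary returned above.

wpcn = extract_wpcn_feature_using_senna("He created the robot and broke it after making it.")
print([(wpcn[i]['word'], wpcn[i]['pos']) for i in sorted(wpcn)])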
Example #36
def check_neg(text, keyword):

    annotator = Annotator()
    dep_parse = annotator.getAnnotations(text, dep_parse=True)['dep_parse']

    dp_list = dep_parse.split('\n')
    pattern = re.compile(r'.+?\((.+?), (.+?)\)')

    edges = []
    for dep in dp_list:
        m = pattern.search(dep)
        if not m:
            continue  # skip lines the pattern does not match
        word1 = m.group(1).split('-')[0]
        word2 = m.group(2).split('-')[0]
        # print word1, word2
        if (word1 == keyword and word2 in neg_words) or (word1 in neg_words
                                                         and word2 == keyword):
            return 1

    return 0
Example #37
def test_tree2():
    
    
    annotator=Annotator()
    sent = "There are people dying make this world a better place for you and for me."
    sent = "Biplab is a good boy." 
    sent = "He created the robot and broke it after making it."
    sent = "Bachelor 's degree in computer science , design or related field."    
    result = annotator.getAnnotations(sent)
    tree_str = result['syntax_tree']
    print result['syntax_tree']
    print "--------------------"
    tree2 = Tree.fromstring(tree_str)  # Tree() no longer parses strings directly in recent NLTK
    print len(tree2)   
    print "--------------------"
    
    for item in tree2[0]:
        print type(item)
        print item
Example #38
    def process(self, text):
        annotator = Annotator()
        tokens = annotator.getAnnotations(text)
        chunks = tokens['chunk']
        ner = tokens['ner']
        pos = tokens['pos']
        srl = tokens['srl']
        words = tokens['words']

        for part in srl:
            self.assign_objects(part)
        
        returnCode = ""
        print self.objects
        for object in self.objects:
            print self.get_item(object)
            if self.get_item(object) is not None:
                returnCode += str(self.get_item(object))

        return returnCode
Example #39
def preprocess(infile, outfile, posfile, index):
    annotator = Annotator()
    wnl = WordNetLemmatizer()
    o = open(outfile, 'w'); p = open(posfile, 'w'); f = open(infile)
    text = f.readlines()
    for s in text:
        s = s.strip().split("\t", index)
        # make it lower
        sent = s[index].lower()
        # remove special characters
        sent = sent.strip(string.punctuation)
        # extend contractions
        sent = re.sub(r"n't", " not", sent)
        sent = re.sub(r"'ve", " have", sent)
        sent = re.sub(r"'d", " would", sent)
        sent = re.sub(r"'ll", " will", sent)
        sent = re.sub(r"'m", " am", sent)
        sent = re.sub(r"'s", " is", sent)
        sent = re.sub(r"'re", " are", sent)

        # lematize and get POS tags
        pos = annotator.getAnnotations(sent)["pos"]
        lemmas = [wnl.lemmatize(w,'v') if t.startswith('V') else wnl.lemmatize(w, 'n') for (w,t) in pos]
        sent = " ".join(lemmas)
        pos = " ".join([x[1] for x in pos])

        out_string = ""
        pos_string = ""
        for j in range(0,index):
            out_string += s[j] + "\t"
            pos_string += s[j] + "\t"

        out_string += sent + "\n"
        pos_string += pos + "\n"
        o.write(out_string)
        p.write(pos_string)

    f.close()
    o.close()
    p.close()
Example #40
class srlGraph(Base):
    def __init__(self, docpath):
        super(srlGraph, self).__init__(docpath)
        self.stopwords = stopwords.words("english")
        self.annotator = Annotator()

    def srl_corpus_extraction(self, stopwords=None):

        sem_rl = self.annotator.getAnnotations(self.corpus,
                                               dep_parse=True)['srl']
        srl_corpus = [
            self.annotator.getAnnotations(doc)['srl'] for doc in self.corpus
        ]
        return srl_corpus

    def srl_document_extraction(self, document_id=0, stopwords=None):
        return self.annotator.getAnnotations(self.corpus[document_id],
                                             dep_parse=True)['srl']

    def get_doc_canon(self, document_id):
        sem_rl = self.annotator.getAnnotations(self.corpus[document_id],
                                               dep_parse=True)['srl']
        canon = [
            en.singularize(word)
            for word in str(TextBlob(sem_rl[0]['C-A1'])).split()
            if word not in stopwords
        ]

        blob = TextBlob(sem_rl[0]['A1'])
        nounPhrases = blob.noun_phrases.singularize()
        sr_verb_concept = self.annotator.getAnnotations(sem_rl[0]['A1'])['srl']

        concat_noun_concepts = set(
            sum([word.split() for word in nounPhrases], []))
        predicates = list(set(canon) - concat_noun_concepts)

        return canon
Example #41
class Worker_NLP:
    '''
    worker class for NLP
    '''
    nThreads = 4
    pool = None
    ith = 0
    tokenizer_sent = None
    practNLP_annotator = None
    sent_tokenizers = None
    annotators = None

    def __init__(self):

        self.practNLP_annotator = Annotator()
        self.tokenizer_sent = nltk.tokenize.sent_tokenize
        self.pool = Pool(self.nThreads)
        self.sent_tokenizers = []
        self.annotators = []
        for i in xrange(self.nThreads):
            self.sent_tokenizers.append(nltk.tokenize.sent_tokenize)
            self.annotators.append(Annotator())

    def getSRL(self, rawTxt):
        sents = sent_tokenize(rawTxt)
        srls = []
        for sent in sents:
            srl = self.practNLP_annotator.getAnnotations(sent)['srl']
            srls.append(srl)
        return srls

    def getSRL_parallel(self, rawTxts):
        tasks = []  #[rawTxt, tokenizer, annotator]
        for i, rawTxt in enumerate(rawTxts):
            tokenizer_tmp = self.sent_tokenizers[self.ith]
            annotator_tmp = self.annotators[self.ith]
            self.ith = (self.ith + 1) % self.nThreads
            tasks.append((rawTxt, tokenizer_tmp, annotator_tmp))

        results = []
        for task in tasks:
            result = self.pool.apply_async(compute_task_atom, args=task)
            results.append(result)

        srls_results = [item.get() for item in results]
        return srls_results
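A minimal usage sketch for the worker class, assuming compute_task_atom (referenced in getSRL_parallel but not shown here) is defined at module level so the pooled processes can reach it.

worker = Worker_NLP()
texts = ["Biplab is a good boy. He created the robot.",
         "There are people dying make this world a better place for you and for me."]
print(worker.getSRL(texts[0]))        # sequential: one SRL list per sentence
print(worker.getSRL_parallel(texts))  # parallel: one result per input text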
Example #42
#!usr/bin/python
'''
Python Script for SENNA functionality
Uses PractNLPTools for Semantic Role Labeling
NOT USED 
'''

import csv
import re
from practnlptools.tools import Annotator
annotator=Annotator()

print("Running Shallow Semantic Parser")
patternForSymbol = re.compile(r'(\ufeff)', re.U)
comments=[]
#reads in CSV file
with open('Dataset/dataset.csv','rb') as dataFile:
    reader = csv.reader(dataFile, delimiter=',')
    for row in reader:
        #row[0] = row[0].decode('utf-8')
        rowEdited = re.sub(patternForSymbol, '', row[0])
        comment = rowEdited if rowEdited != "" else row[0]
        sentiment = row[1]
        comments.append((comment, sentiment))


for index, comment in enumerate(comments):
    if index < 100:
        print comment[0]
        print(annotator.getAnnotations(comment[0])['srl'])
        print("==========================")
Example #43
__author__ = 'hz'
from practnlptools.tools import Annotator

text = "Disclosed is an organic light-emitting diode (OLED) display panel. An OLED display panel includes a plurality of signal lines and a thin film transistor formed on a substrate, an interlayer insulating layer, a first electrode, a bank, an organic light-emitting layer, a second electrode, a first passivation layer, an organic layer, a second passivation layer and a barrier film, wherein the bank is formed to completely cover the interlayer insulating layer, and an inclination formed by side surfaces of the bank and the interlayer insulating layer is made to be gradual."

# text = "Disclosed is an organic light-emitting diode (OLED) display panel."
# semantic role labelling
text = 'Unlike the classic PPP technique, in our new approach, the ionospheric-free linear combination is not used.'
annotator = Annotator()
result = annotator.getAnnotations( text )["srl"]

print( type(result) )
print( result )
 def _fit(self, sent_list_of_str, dep_parse):
     '''Return annotations for a list of sentence strings, as a list of dicts.
     dep_parse toggles the optional dependency-parse step (it is slow).'''
     annotator = Annotator()
     return annotator.getBatchAnnotations(sent_list_of_str, dep_parse)
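A minimal sketch of the batch call this method wraps, assuming practnlptools and SENNA are installed; getBatchAnnotations returns one annotation dict per input sentence, mirroring getAnnotations.

from practnlptools.tools import Annotator

annotator = Annotator()
sents = ["Biplab is a good boy.",
         "He created the robot and broke it after making it."]
for annotation in annotator.getBatchAnnotations(sents, dep_parse=False):
    print(annotation['srl'])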
Example #45
def test2(): 
    sent = "There are people dying make this world a better place for you and for me."
    
    annotator=Annotator()    
    result = annotator.getAnnotations(sent,dep_parse=True)
    print  result
Example #46
#Pre-defining a set of desired verbs

path = "C:/Users/hp/Desktop/bigdata/txt/"
data = []
verb_file = open("./set3.txt", "rw+") 
desired_verbs = verb_file.readlines()

#Performing Stemming

stemmed_desired_verbs=[]
stemmer=stem.snowball.EnglishStemmer()

for word in desired_verbs:
    stemmed_desired_verbs.append(stemmer.stem(word.strip()))  # readlines() keeps the trailing newline
   
annotator=Annotator()

#Implementation of Semantic Role Labelling

f = open('out.csv', 'wt')
csv.register_dialect('lineterminator',lineterminator='\n')
writer = csv.writer(f, dialect = csv.get_dialect('lineterminator'))
writer.writerow( ('A0', 'A1', 'V', 'fileName'))
for filename in os.listdir(path):
    print 'reading', filename
    text_file = open(path + filename,"r")
    file_content = text_file.readlines()
    data.append((filename,file_content))
    text_file.close()

#Annotating the stripped text
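The example is truncated at this point; below is a purely hypothetical sketch of the annotation loop the comment announces, reusing only names defined above (data, annotator, stemmer, stemmed_desired_verbs, writer, f) and assuming the SRL dicts carry 'A0', 'A1' and 'V' keys as in the other examples on this page.

for filename, file_content in data:
    for line in file_content:
        line = line.strip()
        if not line:
            continue
        for frame in annotator.getAnnotations(line)['srl']:
            verb = frame.get('V', '')
            if verb and stemmer.stem(verb) in stemmed_desired_verbs:
                writer.writerow((frame.get('A0', ''), frame.get('A1', ''), verb, filename))
f.close()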
Example #47
def generate_shortestpath (sent,left_term,left_start,right_term,right_start):
    annotator = Annotator()
    #print "========before:",left_term,left_start,right_term,right_start

    '''
    left_sent=" ".join(sent.split()[:left_start])
    right_sent=" ".join(sent.split()[:right_start])
    if re.search("[A-Za-z]-[A-Za-z]",left_term):
        info=left_term.split("-")
        left_term=info[-1]
        #left_start=left_start-1
    if re.search("[A-Za-z]-[A-Za-z]",right_term):
        info=right_term.split("-")
        right_term=info[-1]
       # right_start=right_start-1
    '''
    right_term=re.sub('\(','LRB',right_term)
    right_term=re.sub('\)','RRB',right_term)
    left_term=re.sub('\(','LRB',left_term)
    left_term=re.sub('\)','RRB',left_term)
    new_left_start,new_left_term,new_right_start,new_right_term=update_loc(sent,left_start,right_start)



    '''
    # adjust start coordination # denpendency parser wil split " - " and "'s"

   # print "left: ", left_sent
   # print "right: ",right_sent

    poss=re.compile('\w+\'s')
    conj=re.compile('[A-Za-z]+\-[A-Za-z]+')
    comma=re.compile('\w,')

    result_poss_left=poss.findall(left_sent)
    result_conj_left=conj.findall(left_sent)
    result_comma_left=comma.findall(left_sent)
    left_start=left_start+len(result_conj_left)*2+len(result_poss_left)+len(result_comma_left)
    #print "left:",len(result_conj_left),len(result_poss_left)

    result_poss_right=poss.findall(right_sent)
    result_conj_right=conj.findall(right_sent)
    result_comma_right=comma.findall(right_sent)
    right_start=right_start+len(result_conj_right)*2+len(result_poss_right)+len(result_comma_right)
   # print "right:", len(result_conj_right),len(result_poss_right),len(result_comma_right)

    print "after:",left_term,left_start,right_term,right_start
    '''

    left=new_left_term+"-"+str(new_left_start)
    right=new_right_term+"-"+str(new_right_start)
    #print "=====",left,right
    #sent=re.sub('\-',' - ',sent)
    #print sent

    sent=re.sub('\(','LRB',sent)
    sent=re.sub('\)','RRB',sent)

    #print sent
    annotations = annotator.getAnnotations(sent, dep_parse=True)  # annotate once rather than twice
    dep_parse = annotations['dep_parse']
    tree = annotations['syntax_tree']
    #print dep_parse
    dp_list = dep_parse.split('\n')
    pattern = re.compile(r'.+?\((.+?), (.+?)\)')
    edges = []
    for dep in dp_list:

        #print dep

        m = pattern.search(dep)
        if m:
            edges.append((m.group(1), m.group(2)))
    graph = nx.Graph(edges)
    #print right
    if right not in graph.nodes():
        print "right",left_term, right_term
        return "right"
    if left not in graph.nodes():
        print "left", left_term, right_term
        return "left"
    shortest_path = nx.shortest_path_length(graph, source=left, target=right)
    # print
    return shortest_path
Example #48
def build_d(cat_d):
    dir_path='lists'
    all_files=os.listdir(dir_path)
    for filename in all_files:
        filepath=dir_path+'/'+filename
        if os.path.isfile(filepath):
            cat=filename.strip().split('.')[0]
            with open(filepath,'r') as fp:
                for line in fp:
                    word=line.strip()
                    cat_d[word]=cat  

cat_d={}
build_d(cat_d)
annotator=Annotator()
wordnet_lemmatizer = WordNetLemmatizer()
alchemyapi = AlchemyAPI()
template={}
full_template=['AM-MOD','A0','AM-ADV', 'AM-NEG','V','C-V','AM-DIR','A1','A2','A3','A4','AM-PNC','AM-MNR','AM-LOC','AM-TMP','C-A1']
for item in full_template:
    copy_template=full_template[:]
    copy_template.remove(item)
    if 'AM-LOC' in copy_template:
        copy_template.remove('AM-LOC')
    if 'AM-TMP' in copy_template:
        copy_template.remove('AM-TMP')
    if 'AM-MNR' in copy_template:
        copy_template.remove('AM-MNR')
    template[item]=copy_template
Example #49
#!/usr/bin/env python
from practnlptools.tools import Annotator
import sys
annotator=Annotator()

if __name__=='__main__':
    for line in sys.stdin:
        if line[0]=="#":
            continue
        line=line.strip()
        annotations=annotator.getAnnotations(line)
        print annotations['srl']
Example #50
def update_loc(sent,left_start,right_start):
    annotator = Annotator()
    sent=re.sub('\(','LRB',sent)
    sent=re.sub('\)','RRB',sent)
    words=sent.split()
    #print words[left_start],words[right_start]
    words[left_start]=words[left_start]+'aaaaa'
    words[right_start]=words[right_start]+'bbbbb'
  #  print words

    sent=' '.join(words)
    tags=annotator.getAnnotations(sent)
   # print "===", tags

   # print "chunks:      ", tags['chunk']
    i=0
    pre_word=''
    pre_pre_word=''
    j=0
    left_term = ''
    right_term = ''


    for word in tags['chunk']:
        i+=1
        left_pattern='^(.*)aaaaa$'
        right_pattern='^(.*)bbbbb$'
        left=re.search(left_pattern,word[0])
        right=re.search(right_pattern,word[0])


        if left:
            #print "ttleft"
            left_term=left.group(1)
            left_start=i
            if left_term=='':
                left_term=pre_word
                left_start=left_start-1
                j=1
                if pre_word=='-':
                    left_term=pre_pre_word
                    left_start=left_start-1


        if right:
            #print "rightright"
            right_term=right.group(1)
            right_start=i
            if right_term=='':
                right_term=pre_word
                right_start=right_start-1
                j=2
                if pre_word=='-':
                    right_term=pre_pre_word
                    right_start=right_start-1

        pre_pre_word=pre_word
        pre_word=word[0]
    if j==1:
        if right_start>left_start:
            right_start=right_start-1
    if j==2:
        if left_start>right_start:
            left_start=left_start-1
    #print j
    #print "=++++", left_start,left_term,right_start,right_term
    return (left_start,left_term,right_start,right_term)