Example #1
    def __init__(self):

        self.practNLP_annotator = Annotator()
        self.tokenizer_sent = nltk.tokenize.sent_tokenize
        self.pool = Pool(self.nThreads)
        self.sent_tokenizers = []
        self.annotators = []
        for i in xrange(self.nThreads):
            self.sent_tokenizers.append(nltk.tokenize.sent_tokenize)
            self.annotators.append(Annotator())
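
This fragment references self.nThreads, which is set elsewhere in the original class. A minimal self-contained sketch of the same per-thread setup, with the class name and the nThreads argument assumed for illustration (Pool is taken here to be multiprocessing.Pool, which may differ from the original):

from multiprocessing import Pool

import nltk
from practnlptools.tools import Annotator


class ParallelAnnotator(object):  # hypothetical class name
    def __init__(self, nThreads=4):  # nThreads is assumed; the original sets it elsewhere
        self.nThreads = nThreads
        self.pool = Pool(self.nThreads)
        # one sentence tokenizer and one Annotator per worker, as in the fragment above
        self.sent_tokenizers = [nltk.tokenize.sent_tokenize for _ in range(self.nThreads)]
        self.annotators = [Annotator() for _ in range(self.nThreads)]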
Example #2
def test_tree4():

    annotator = Annotator()
    sent = "There are people dying make this world a better place for you and for me."
    sent = "Biplab is a good boy."
    sent = "He created the robot and broke it after making it."
    sent = "Bachelor 's degree in computer science , design or related field."
    sent = "B.S. in Computer Science , a related degree or its equivalent"
    sent = "BS , MS , or PhD in Computer Science or a similar field preferred"
    sent = "Computer Science or related technical degree from an accredited four year university "
    sent = "Degree in Computer Science or Engineering with a high GPA ."
    sent = "A Master's degree in Computer Science or Engineering is mandatory ."

    sent = "A Computer Science or related degree "
    sent = "I love science and SciFi book"
    sent = "I love music and SciFi book"

    result = annotator.getAnnotations(sent)
    tree_str = result['syntax_tree']
    print
    print tree_str

    tree = Tree.fromstring(tree_str)[0]
    print
    print "Root label=", tree.label()
    tree.draw()
Example #3
def get_shortest_path(a, b):
	text = a + b 

	annotator = Annotator()
	dep_parse = annotator.getAnnotations(text, dep_parse=True)['dep_parse']

	dp_list = dep_parse.split('\n')
	pattern = re.compile(r'.+?\((.+?), (.+?)\)')
	edges = []
	
	for dep in dp_list:
		m = pattern.search(dep)
		edges.append((m.group(1), m.group(2)))
	
	graph = nx.Graph(edges)  
	
	shortest_paths = [] 
	
	a = a.strip()
	b = b.strip()
	
	a = a.split()
	b = b.split()
	
	for i in a: 
		for j in b: 
			shortest_paths.append(nx.shortest_path_length(graph, source=i, target=j))
	
	print(shortest_paths)
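
One caveat about the loop above: splitting dep_parse on '\n' can yield a blank or malformed line, for which pattern.search returns None and m.group(1) raises AttributeError. A small defensive variant of the edge-building step, assuming the same dep_parse string returned by getAnnotations:

import re

import networkx as nx


def build_dep_graph(dep_parse):  # hypothetical helper for illustration
    pattern = re.compile(r'.+?\((.+?), (.+?)\)')
    edges = []
    for dep in dep_parse.split('\n'):
        m = pattern.search(dep)
        if m is None:
            # skip blank or unparsable lines instead of crashing
            continue
        edges.append((m.group(1), m.group(2)))
    return nx.Graph(edges)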
Example #4
def extract_feature_using_senna(line):
    '''
    Takes a line in and returns extracted data.
    :param line: an English sentence
    :return:
        data : [word, number, pos, chunk_info, ner_info, parent_number]
    '''
    annotator = Annotator()
    feature = annotator.getAnnotations(line, dep_parse=True)

    if feature['dep_parse'] == '':
        return -2
    a = feature['dep_parse'].split('\n')
    words_data = {}
    d = []
    for i in a:
        dep = re.sub(r'^[^\(]+\(|\)$', '', i)
        try:
            p, c = dep.split(', ')
        except ValueError:
            pass
        try:
            t1 = p.split('-')
            pid = int(t1[len(t1) - 1])
            t2 = c.split('-')
            wid = int(t2[len(t2) - 1])
        except ValueError:
            if re.match('[\d]+\'', t1[len(t1) - 1]):
                pid = int(re.sub(r'\'', '', t1[len(t1) - 1])) + 0.1
                t2 = c.split('-')
                wid = int(t2[len(t2) - 1])
            elif re.match('[\d]+\'', t2[len(t2) - 1]):
                pass
            continue
        d.append((wid, pid))
    t1 = [id for id in d]
    d, _ = remove_dep(t1)
    for wid, pid in d:
        add_ids(words_data, wid, pid)
    for i in range(len(feature['words'])):
        if i + 1 not in words_data:
            words_data[i + 1] = {
                'wid': i + 1,
                'pid': -1,
                'word': feature['words'][i],
                'chk': feature['chunk'][i][1],
                'ner': feature['ner'][i][1],
                'pos': feature['pos'][i][1]
            }
        elif i + 1 in words_data:
            words_data[i + 1]['word'] = feature['words'][i]
            words_data[i + 1]['chk'] = feature['chunk'][i][1]
            words_data[i + 1]['ner'] = feature['ner'][i][1]
            words_data[i + 1]['pos'] = feature['pos'][i][1]
    words_data['syntax_tree'] = feature['syntax_tree']
    words_data['tree'] = feature['tree']
    words_data['verbs'] = feature['verbs']
    words_data['srl'] = feature['srl']
    # Global.accepted += 1
    return words_data
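
A hedged usage sketch for the function above, assuming the remove_dep and add_ids helpers it relies on are available in the same module; -2 is the sentinel returned when SENNA produces no dependency parse:

features = extract_feature_using_senna("He created the robot and broke it.")
if features == -2:
    print("no dependency parse returned; skipping this sentence")
else:
    # per-word records are keyed by 1-based word id
    first_word = features[1]
    print(first_word['word'], first_word['pos'], first_word['chk'], first_word['ner'])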
Example #5
def rawfile_to_sentencefile_dir():

    indir = sys.argv[1]
    outdir = sys.argv[2]
    counter = 1
    try:
        os.makedirs(outdir)
    except OSError:
        print('dir already exists')
    time_start = time.time()
    annotator = Annotator()
    a = os.listdir(indir)
    part_data = os.listdir(indir)[int(sys.argv[3]):int(sys.argv[4])]  # divide the data into several parts; the start and end indices come from argv
    #part_data = os.listdir(indir)[0:8]
    for fname in part_data:
        if os.path.splitext(fname)[1] == '.summary':
            if not os.path.exists(os.path.join(outdir, fname.split('.')[0]+'.summary'+'.new')):  # determine whether the file has been processed
                print(fname)
                #time_start = time.time()
                rawfile_to_sentencefile(annotator, os.path.join(indir, fname), os.path.join(outdir, fname+'.new'))
                counter = counter+1
                #time_end = time.time()
                #print('totally cost', time_end - time_start, 'Number', counter)
            else:
                print('skip', fname)

    time_end = time.time()
    print('totally cost: ', time_end - time_start, 'file number: ', counter-1)
Example #6
def compute_POS(line):
    annotator = Annotator()
    pos = annotator.getAnnotations(line)['pos']
    pos_tag = []
    for p in pos:
        pos_tag.append(p[1])
    return pos_tag
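
A quick usage sketch; SENNA emits Penn Treebank part-of-speech tags, so the printed list looks roughly like the comment below (illustrative output, not verified):

tags = compute_POS("Biplab is a good boy.")
print(tags)  # e.g. ['NNP', 'VBZ', 'DT', 'JJ', 'NN', '.']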
Example #7
    def getword2vec(self, raw_text):
        sentences = nltk.sent_tokenize(raw_text)
        annotator = Annotator()
        counter = 0
        doc_vec = []
        for sentence in sentences:
            # get semantic role labelling data for each sentence
            srl = list(annotator.getAnnotations(sentence)['srl'])
            word2vec = []
            # get the event structure for each sentence
            for s in srl:
                if 'V' in s:
                    # print s['V']
                    word2vec = self.getvector(s['V'])
                    # print word2vec
                else:
                    print 'No verb found in sentence'
                    return
                if 'A0' in s:
                    # print s['A0']
                    word2vec = self.addVectors(word2vec,
                                               self.getvector(s['A0']))

                if 'A1' in s:
                    # print s['A1']
                    word2vec = self.addVectors(word2vec,
                                               self.getvector(s['A1']))
            if counter == 0:
                doc_vec = word2vec
            else:
                doc_vec = self.addVectors(doc_vec, word2vec)
            counter = counter + 1
Example #8
def test1():
    annotator = Annotator()
    sent = "There are people dying make this world a better place for you and for me."
    result = annotator.getAnnotations(sent)

    #print result
    print type(result)
    print result.keys()
Example #9
    def annotation(self, n):
        parsed_heading = self.get_parsed_heading()
        annotator = Annotator()
        try:
            annotation = annotator.getAnnotations(parsed_heading[n])
            return annotation
        except:
            pass
Example #10
def test_deep():
    annotator = Annotator()
    sent = "There are people dying make this world a better place for you and for me."
    sent = "Biplab is a good boy."
    sent = "He created the robot and broke it after making it."
    result = annotator.getAnnotations(sent, dep_parse=True)

    print result['dep_parse']
Example #11
    def srl(self, sen_dest):
        ant = Annotator()
        if sen_dest.upper().split()[0] == "UP":
            v = sen_dest.upper().split()
            v[0] = "RAISE"
            sen_dest = str(v)
        sen_srl = ant.getAnnotations(sen_dest)['syntax_tree']
        return sen_srl
Example #12
def get_annotations(question):
    annotator = Annotator()
    annotations = annotator.getAnnotations(question)
    srl = annotations['srl']
    verbs = annotations['verbs']
    ner = annotations['ner']
    chunk = annotations['chunk']
    return srl, verbs, ner, chunk
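
A minimal usage sketch; the four return values correspond to the 'srl', 'verbs', 'ner' and 'chunk' keys of getAnnotations, and each SRL frame is a dict with keys such as 'V', 'A0' and 'A1':

srl, verbs, ner, chunk = get_annotations("He created the robot and broke it.")
for frame in srl:
    print(frame.get('V'), frame.get('A0'), frame.get('A1'))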
Example #13
def semantic_role_label():
    lemmatizer = WordNetLemmatizer()
    verbs_target = ["ensnare", "infect", "use", "target"]
    verbs_tool = ["attack"]

    # sent = "Mirai-Based Masuta Botnet Weaponizes Old Router Vulnerability. By Ionut Arghire on January 24, 2018. inShare. A new piece of malware based on Mirai's publicly released source code has been observed at large, ensnaring devices into a botnet, targeted Internet of Things. Known as Masuta, the botnet has at least two variants at large, and is believed to be the work of a well-known IoT threat actor, NewSky Security says. What?s also unique to the botnet is that it exploits an old router vulnerability, being the first threat known to weaponize it in a botnet campaign. Masuta (Japanese for master) botnet?s source code was found on an invite only dark forum. The malware?s configuration file, the researchers discovered, uses a different seed of the cipher key compared to Mirai, having the strings in the configuration files XORed by 0x45. Thus, the researchers discovered that it uses the domain nexusiotsolutions(dot)net, the command and control (C&C) server that Nexus Zeta, the individual involved in the recent Satori attacks, uses. The domain was registered using the nexuszeta1337@gmail(.)com email address. Thus, NewSky Security suggests that Nexus Zeta has been involved in the creation of the Masuta botnet, in addition to building Satori, the Mirai variant that has been wreaking havoc over the past couple of months. In fact, Masuta isn?t new either, and attacks involving it have been steadily increasing since September, and the botnet?s standard variant has been observed using several known/weak/default credentials to compromise IoT devices. An evolved variant of Masuta, called PureMasuta, contains the most typical of Mirai style code, and a list of weak credentials to use. What makes this malware variant stand out, however, is its usage of EDB 38722 D-Link exploit. The exploit PureMasuta uses resides in the HNAP (Home Network Administration Protocol), which is based on the SOAP protocol. It is possible to craft a SOAP query to bypass authentication by using hxxp://purenetworks.com/HNAP1/GetDeviceSettings, and improper string handling can lead to arbitrary code execution, and an attacker can abuse this combination of issues to run code on targeted devices. What the botnet does is to download a shell script from the C&C server and run it. Thus, the malware author first bypasses authentication and then executes code on the targeted devices. The PureMasuta variant uses the same C&C server (93.174.93.63) as the original Masuta variant, which led the researchers to believe it is the evolved creation of the same threat actor. Nexus Zeta is no stranger when it comes to implementing SOAP related exploits. The threat actor has already been observed in implementing two other known SOAP related exploits, CVE-2014-8361 and CVE-2017-17215 in his Satori botnet project, NewSky Security notes. Thus, the TR-069 bug and EDB 38722 are the third and fourth SOAP related exploits abused by IoT botnets. Protocol exploits are more desirable for threat actors as they usually have a wider scope. A protocol can be implemented by various vendors/models and a bug in the protocol itself can get carried on to a wider range of devices, the researchers conclude."

    # sent = "Mirai, the infamous botnet used in the recent massive distributed denial of service (DDoS) attacks against Brian Krebs' blog and Dyn's DNS infrastructure, has ensnared Internet of Things (IoT) devices in 164 countries, researchers say."

    if len(sys.argv) != 2:
        print("NOPE")
        exit()

    fh = open(sys.argv[1], "r")
    sent = fh.read()
    # sent = sys.argv[1]

    target = ""
    tools = ""

    for s in [i.strip() for i in sent.split(".")]:
        a = Annotator()
        b = a.getAnnotations(s.encode('utf-8'))

        dictlist = b['srl']
        for dict in dictlist:
            if 'V' in dict:
                if lemmatizer.lemmatize(dict["V"].lower(),
                                        'v') in verbs_target:
                    temp1 = temp2 = ""
                    if "A0" in dict and not dict['A0'] == "":
                        temp1 += dict['A0']
                    # temp += " :|: "
                    if "A1" in dict and not dict['A1'] == "":
                        temp2 += dict['A1']
                    if not temp1 == "":
                        temp1 = getTools(temp1)
                        tools += temp1 + ":-----:"
                    if not temp2 == "":
                        temp2 = getTargets(temp2)
                        target += temp2 + ":-----:"
                if lemmatizer.lemmatize(dict["V"].lower(), 'v') in verbs_tool:
                    temp = ""
                    if "A0" in dict and not dict['A0'] == "":
                        temp += dict['A0']
                    # temp += "|"
                    # if "A1" in dict:
                    #     temp += dict['A1']
                    if not temp == "":
                        temp = getTools(temp)
                        tools += temp + ":-----:"

        # print("SemanticRoleLabel :::: {}".format(b['srl']))
        # print("2nd:\n{}".format([x, y] for x,y in [b['ner']]))

    print("Target :::: " + target)
    print("Tools :::: " + tools)
Example #14
def complexQuery(term):
    #test = json.load(urllib2.urlopen("http://www.freesound.org/apiv2/search/text/?query="+term+"&token=06mS7W2OiXidVC2tQ4ikMfe3nomU7rBptaJgBCvp"))
    #test2 = json.load(urllib2.urlopen("https://api.jamendo.com/v3.0/tracks/?client_id=4cb8fab9&format=jsonpretty&name="+term))

    annotator = Annotator()
    dep_parse = annotator.getAnnotations(term, dep_parse=True)['dep_parse']
    dp_list = dep_parse.split('\n')

    #spotlightTerms = WordNet.spotlightSearch(term)
    #print "spotlight terms %s" %spotlightTerms
    #print "dp list  %s" %dp_list

    spotlightTerms = spotlight.annotate(
        'http://spotlight.sztaki.hu:2222/rest/annotate',
        term,
        confidence=0.3,
        support=20,
        spotter='Default')
    #print term, '\t', spotlightTerms[1].get('URI')
    #print spotlightTerms[0].get('URI')
    secondDep = ""
    query = []

    for prep in dp_list:
        elementPrep = "prep"
        if elementPrep in prep:
            print("We found preposition1: %s" %
                  prep[prep.find("_") + 1:prep.find("(")])
            prepType = prep[prep.find("_") + 1:prep.find("(")]
            print("We found preposition2: %s" %
                  prep[prep.find(" ") + 1:prep.find(")")])
            secondDep = prep[prep.find(" ") + 1:prep.find(")")].split("-")
            print secondDep[0]
            query.append(prepType)
            query.append(secondDep[0])
            if prepType == "like":
                results = DBPedia.dpbediaQuery(prepType, secondDep[0])
            else:
                results = DBPedia.dpbediaQuery(prepType,
                                               spotlightTerms[1].get('URI'))
            print results

    for query in results:

        test = json.load(
            urllib2.urlopen(
                "http://www.freesound.org/apiv2/search/text/?query=" + query +
                "&token=06mS7W2OiXidVC2tQ4ikMfe3nomU7rBptaJgBCvp"))
        test2 = json.load(
            urllib2.urlopen(
                "https://api.jamendo.com/v3.0/tracks/?client_id=4cb8fab9&format=jsonpretty&name="
                + query))

    print(test)
    #print(test2)

    return test, test2
Example #15
def compute_Chunks(corpus):
    Chunk_Tags = []
    annotator = Annotator()
    for sentence in corpus:
        chunks = annotator.getAnnotations(sentence)['chunk']
        chunk = ""
        for elem in chunks:
            chunk = chunk + elem[1] + " "
        Chunk_Tags.append(chunk)
    return Chunk_Tags
Example #16
def compute_Chunks(corpus):
    Chunk_Tags = []
    annotator = Annotator()
    #for sentence in corpus:
    chunks = annotator.getAnnotations(corpus)['chunk']
    chunk = ""
    for elem in chunks:
        chunk = chunk + elem[1] + " "
    # print chunk  # to see what these chunks are
    Chunk_Tags.append(chunk)
    return Chunk_Tags
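
Unlike the previous example, this variant treats its argument as a single sentence rather than a list of sentences and returns a one-element list. A usage sketch:

chunk_tags = compute_Chunks("He created the robot and broke it.")
print(chunk_tags)  # a one-element list holding a space-separated string of chunk tags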
Example #17
    def SRLAnnotation( self, sentence ):
        """
        Use the SENNA library to perform SRL (semantic role labelling) on a specific sentence.

        :param sentence: the specific sentence to be handled
        :type sentence: str
        :return:
        :rtype: list({})
        """
        annotator = Annotator()
        return annotator.getAnnotations( sentence )["srl"]
Example #18
def test3():
    annotator = Annotator()
    sent = "There are people dying make this world a better place for you and for me."
    sent = "Biplab is a good boy."
    sent = "He created the robot and broke it after making it."
    result = annotator.getAnnotations(sent)

    print result["pos"]
    print result['ner']
    print result['chunk']
    print result['verbs']
    print result['srl']
Example #19
def draw_tree():

    annotator = Annotator()
    result = annotator.getAnnotations(sent11)
    tree_str = result['syntax_tree']
    print
    # print tree_str

    tree = Tree.fromstring(tree_str)[0]
    print tree.pprint()
    print
    print "Root label=", tree.label()
    tree.draw()
Example #20
def compute_NER(corpus):
    NER = []
    fi = open(read_property('NER_features_test_coarse_path'), "w")
    annotator = Annotator()
    for sentence in corpus:
        ners = annotator.getAnnotations(sentence)['ner']
        ner = ""
        for elem in ners:
            ner = ner + elem[1] + " "
        print ner
        fi.write(ner + "\n")
        NER.append(ner)
    return NER
Example #21
def compute_Chunks(corpus):
    #Chunk_Tags=[]
    fi = open(read_property('Chunk_features_test_path'), "w")
    annotator = Annotator()
    for sentence in corpus:
        chunks = annotator.getAnnotations(sentence)['chunk']
        chunk = ""
        for elem in chunks:
            chunk = chunk + elem[1] + " "
        print chunk
        fi.write(chunk + "\n")
        #Chunk_Tags.append(chunk)
    #print "The bag of words for Chunks is ",Chunk_Tags
    fi.close()
Example #22
def compute_POS_Tags(corpus):
    #POS=[]
    fi = open(read_property('POS_features_test_coarse_path'), "w")
    annotator = Annotator()
    for sentence in corpus:
        pos_seq = annotator.getAnnotations(sentence)['pos']
        #print pos_seq
        pos_tags = ""
        for pos in pos_seq:
            pos_tags = pos_tags + pos[1] + " "
        fi.write(pos_tags + "\n")
        print pos_tags  ###############
        #POS.append(pos_tags)
    #print "The bag of words of POS is ",POS
    fi.close()
Example #23
    def __init__(self, sentence, question, answer, nlp, srl=None):
        if srl == None:
            self.ascii_sentence = unicodedata.normalize('NFKD',
                                                        sentence).encode(
                                                            'ascii', 'ignore')
            self.ascii_question = unicodedata.normalize('NFKD',
                                                        question).encode(
                                                            'ascii', 'ignore')
            self.ascii_answer = unicodedata.normalize('NFKD', answer).encode(
                'ascii', 'ignore')
            self.annotator = Annotator()
            self.srl = self.annotator.getAnnotations(
                self.ascii_sentence)['srl']
            self.answer_srl_label = self.set_answer_srl_label()
        else:
            self.srl = srl

        self.nlp = nlp
        self.raw_sentence = sentence
        self.raw_question = question
        self.raw_answer = answer
        self.spacy_sent = self.nlp(self.raw_sentence)
        self.spacy_ques = self.nlp(self.raw_question)
        self.answer_length = self.set_answer_length()
        self.spacy_answer = self.set_spacy_answer()
        self.answer_pos = self.set_answer_pos()
        self.answer_ner = self.set_answer_ner()
        self.answer_ner_iob = self.set_answer_ner_iob()
        self.answer_depth = self.set_answer_depth()
        self.answer_word_count = self.set_answer_word_count()
        self.all_pos_tags = [
            'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD',
            'NN', 'NNS', 'NNP', 'NNPS', 'PDT', 'POS', 'PRP', 'PRP$', 'RB',
            'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN',
            'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB', 'PUNCT'
        ]
        self.all_ner_tags = [
            'PERSON', 'NORP', 'FACILITY', 'ORG', 'GPE', 'LOC', 'PRODUCT',
            'EVENT', 'WORK_OF_ART', 'LANGUAGE', 'DATE', 'TIME', 'PERCENT',
            'MONEY', 'QUANTITY', 'ORDINAL', 'CARDINAL'
        ]
        self.all_srl_labels = [
            'V', 'A0', 'A1', 'A2', 'C-arg', 'R-arg', 'AM-ADV', 'AM-DIR',
            'AM-DIS', 'AM-EXT', 'AM-LOC', 'AM-MNR', 'AM-MOD', 'AM-NEG',
            'AM-PNC', 'AM-PRD', 'AM-PRP', 'AM-REC', 'AM-TMP'
        ]
Example #24
def test_tree2():

    annotator = Annotator()
    sent = "There are people dying make this world a better place for you and for me."
    sent = "Biplab is a good boy."
    sent = "He created the robot and broke it after making it."
    sent = "Bachelor 's degree in computer science , design or related field."
    result = annotator.getAnnotations(sent)
    tree_str = result['syntax_tree']
    print result['syntax_tree']
    print "--------------------"
    tree2 = Tree.fromstring(tree_str)  # Tree(str) no longer parses a string in NLTK 3; fromstring does
    print len(tree2)
    print "--------------------"

    for item in tree2[0]:
        print type(item)
        print item
Example #25
    def get_filtered_events(self, doc_id, raw_text, entity):

        filtered_events = []
        sentences = nltk.sent_tokenize(raw_text)
        annotator = Annotator()

        # for each sentence determine if it pertains to the entity
        for sentence in sentences:
            # get semantic role labelling data for each sentence
            events = self.get_filtered_verbs_with_vecs(
                doc_id, list(annotator.getAnnotations(sentence)['srl']),
                entity)
            # print events.__len__()
            if events.__len__() > 0:
                filtered_events.extend(events)

        # return list of events
        return filtered_events
Example #26
    def readfile(self, num_of_lines):
        n = 0
        annotator = Annotator()
        with open(self.path + self.file_name) as f:
            reader = csv.reader(f)
            for l in reader:
                if n < num_of_lines:
                    line = l[0].split("\t")
                    heading = line[0]
                    true_passage = line[1]
                    false_passage = line[2:]
                    document = Document(heading, true_passage, false_passage)
                    anno = annotator.getAnnotations(
                        document.get_parsed_heading()[0])
                    pprint(anno)
                    n += 1
                else:
                    break
Example #27
def extract_wpcn_feature_using_senna(line):
    '''
    Takes a line in and returns extracted data.
    :param line: an English sentence
    :return:
        data : [word, number, pos, chunk_info, ner_info, parent_number]
    '''
    annotator = Annotator()
    feature = annotator.getAnnotations(line, dep_parse=False)
    words_data = {}
    for i in range(len(feature['words'])):
        words_data[i + 1] = {
            'wid': i + 1,
            'word': feature['words'][i],
            'chk': feature['chunk'][i][1],
            'ner': feature['ner'][i][1],
            'pos': feature['pos'][i][1]
        }
    return words_data
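
A usage sketch for the function above; every 1-based word id maps to a small record holding the word plus its chunk, NER and POS tags:

data = extract_wpcn_feature_using_senna("Biplab is a good boy.")
for wid in sorted(data):
    rec = data[wid]
    print(wid, rec['word'], rec['pos'], rec['chk'], rec['ner'])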
Example #28
def check_neg(text, keyword):

    annotator = Annotator()
    dep_parse = annotator.getAnnotations(text, dep_parse=True)['dep_parse']

    dp_list = dep_parse.split('\n')
    pattern = re.compile(r'.+?\((.+?), (.+?)\)')

    edges = []
    for dep in dp_list:
        m = pattern.search(dep)
        word1 = m.group(1).split('-')[0]
        word2 = m.group(2).split('-')[0]
        # print word1, word2
        if (word1 == keyword and word2 in neg_words) or (word1 in neg_words
                                                         and word2 == keyword):
            return 1

    return 0
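
A hedged usage sketch; neg_words is defined elsewhere in the original module, so the set below is only an assumed stand-in for illustration:

neg_words = {'not', 'no', 'never', "n't"}  # assumed contents; the original list is not shown

print(check_neg("The robot did not respond", "respond"))  # prints 1 if a negation word attaches to the keyword, else 0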
Example #29
def readingClues(file_path):

    count = 0
    anntator = Annotator()
    result = []
    with open(file_path) as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            print count
            print row['rawClue']
            try:
                features = getNumericValue(row['rawClue'], anntator)
                features['curID'] = row['curID']
                features['rawClue'] = row['rawClue']
                features['target'] = row['target']
                features['source'] = row['source']
                features['type'] = row['type']
                features['result'] = row['result']
                features['response'] = row['response']
                features['avgCOPMI'] = row['avgCOPMI']
                features['maxPMI'] = row['maxPMI']
                #features['class'] = row['class']
                result.append(features)
                count += 1
                if count % 100 == 0:
                    dicToCVS(
                        result,
                        "../Data/featureExtractedDic/conversationparserWithAllFeatures.txt"
                    )
                    result = []

            except:
                continue

        #write the remaining results into the file.
        dicToCVS(result, "../Data/featureExtractedDic/testing.txt")
Example #30
from practnlptools.tools import Annotator

annotator = Annotator()

fp = open("input_captions.txt", "r")
fw = open("output_captions.txt", "w")
data = fp.read().strip().split("\n")

for caption in data:
    out = annotator.getAnnotations(caption, dep_parse=True)
    pos = out['pos']
    ner = out['ner']
    srl = out['srl']

    replace_phrase = ''
    for i in srl:
        if 'A0' in i:
            replace_phrase = i['A0']
        elif 'A1' in i:
            replace_phrase = i['A1']
        break

    if len(replace_phrase) == 0:
        continue

    if caption.startswith(replace_phrase) == False:
        continue

    filler1 = 'What'
    filler2 = 'What is'
    filler3 = 'Who'