Beispiel #1
0
def annotate_timex(text, date, lang):
    """Ask the Annotador service to tag ``text`` and return the untagged terms.

    The raw ``text`` is first dumped to ``texto.txt`` in the working directory.
    It is then POSTed to the Annotador endpoint; the response body is split on
    ``'|'`` and every piece that contains ``'<'`` (and is longer than two
    characters) is treated as a tagged temporal expression and dropped.

    Args:
        text: Input string sent as ``inputText``. On failure it is split on
            ``'| '`` to build the fallback result.
        date: Accepted for interface compatibility; it is not forwarded, the
            request always carries an empty ``inputDate``.
        lang: Language code sent as the ``lan`` field.

    Returns:
        list[str]: On HTTP 200, the surviving pieces stripped of surrounding
        whitespace and with commas removed. Otherwise ``text.split('| ')``.
    """
    import json

    # Context manager so the dump is flushed and the handle is not leaked.
    with open('texto.txt', 'w') as dump:
        dump.write(text)
    start_time = time()

    url = 'https://annotador.oeg.fi.upm.es/annotate'
    # Serialise with json so quotes, backslashes and newlines inside ``text``
    # are escaped instead of producing an invalid request body.
    payload = json.dumps(
        {
            'inputText': text,
            'inputDate': '',
            'domain': 'legal',
            'lan': lang,
            'format': 'timex3',
        },
        ensure_ascii=False,
        separators=(',', ':'),
    )
    headers = {'Content-Type': 'application/json;charset=utf-8'}
    response = requests.request("POST",
                                url,
                                headers=headers,
                                data=payload.encode('utf8'))
    textanotador = response.text
    print('ENTRA ANOTADOR')
    print(textanotador)

    code = response.status_code
    list_anotador = textanotador.split('|')
    print(list_anotador)

    def is_tagged(piece):
        return '<' in piece and len(piece) > 2

    # Partition in one pass; removing items while iterating the same list
    # skipped the element that followed every removal.
    deletes = [piece for piece in list_anotador if is_tagged(piece)]
    anotador = [
        piece.strip().replace(',', '')
        for piece in list_anotador
        if not is_tagged(piece)
    ]
    cont = len(deletes)

    if code != 200:
        print(
            'WARNING: Annotador is down. Temporal expressions could not be removed.'
        )
        # Fall back to the caller's own terms, untouched.
        anotador = text.split('| ')
        conts_log.error(
            'Annotador is down. Temporal expressions could not be removed.',
            code)
    else:
        elapsed_time = time() - start_time
        txt = 'AÑOTADOR, DELETE (' + str(cont) + ') NEW LIST SIZE: (' + str(
            len(anotador)) + ') TIME: (' + str(elapsed_time) + ')'
        joind = ', '.join(deletes)
        print('AÑOTADOR DELETE', cont, len(anotador), elapsed_time)
        conts_log.information(txt, 'TERMS REMOVED: ' + joind)

    return anotador
Beispiel #2
0
def clean_terms(termlist, lang_in):
    """Remove stop words from ``termlist`` and normalise the remaining terms.

    The stop-word set is the NLTK list for the language plus one extra word per
    line read from the project's own stop-word file (``sw_spanish`` or
    ``sw_english``).

    Args:
        termlist: Iterable of term strings (must support ``len``).
        lang_in: ``"es"`` or ``"en"``.

    Returns:
        list[str]: Terms that are not stop words, with leading/trailing
        ``,.:`` stripped and every ``,`` and ``-`` removed.

    Raises:
        ValueError: If ``lang_in`` is neither ``"es"`` nor ``"en"``
            (previously this surfaced later as an unbound-variable error).
    """
    start_time = time()
    if lang_in == "es":
        language, extra_path = 'spanish', sw_spanish
    elif lang_in == "en":
        language, extra_path = 'english', sw_english
    else:
        raise ValueError('Unsupported language for clean_terms: ' + repr(lang_in))

    # A set gives O(1) membership tests for the per-term checks below.
    stop = set(stopwords.words(language))
    with open(extra_path, 'r', encoding='utf-8') as extra:
        stop.update(line.strip() for line in extra)

    clean_list = []
    deletes = []
    for raw in termlist:
        k = raw.strip(',.:')
        # Match either the exact spelling or its lower-cased form.
        if k.lower() in stop or k in stop:
            deletes.append(k)
        else:
            clean_list.append(k.replace(',', '').replace('-', ''))

    print(deletes)
    cont = len(termlist) - len(clean_list)
    elapsed_time = time() - start_time

    txt = 'CLEAN_TERMS, DELETE (' + str(cont) + ') NEW LIST SIZE: (' + str(
        len(clean_list)) + ') TIME: (' + str(elapsed_time) + ')'
    joind = ', '.join(deletes)
    conts_log.information(txt, 'TERMS REMOVED: ' + joind)
    print('CLEAN_TERMS, DELETE', cont, len(clean_list), elapsed_time)

    return clean_list
Beispiel #3
0
def delete_numbers(list_):
    """Remove, in place, every term that contains a number word as a whole word.

    Number words are read one per line from ``./data/numberlist_es``. A term
    matches when the number word appears in it delimited by spaces (or by the
    start/end of the term).

    Args:
        list_: Mutable list of term strings; it is modified in place.

    Returns:
        list[str]: The same ``list_`` object, without the matching terms.
    """
    start_time = time()
    with open('./data/numberlist_es', 'r', encoding='utf-8') as number_file:
        # rstrip keeps a final line that has no trailing newline; the old
        # ``i[-1:] == '\n'`` test silently ignored it.
        numbers = [line.rstrip('\n') for line in number_file]

    deletes = []
    for number in numbers:
        needle = ' ' + number + ' '
        kept = []
        # Build the survivors separately: popping from ``list_`` while
        # iterating it skipped the term right after each removal.
        for term in list_:
            if needle in ' ' + term + ' ':
                deletes.append(term)
            else:
                kept.append(term)
        # Slice assignment keeps the caller's list object.
        list_[:] = kept

    cont = len(deletes)
    elapsed_time = time() - start_time
    txt = 'NUMBERS, DELETE' + ' (' + str(cont) + ') NEW LIST SIZE: (' + str(
        len(list_)) + ') TIME: (' + str(elapsed_time) + ')'
    joind = ', '.join(deletes)
    print('NUMEROS DELETE', cont, len(list_), elapsed_time)
    conts_log.information(txt, 'TERMS REMOVED: ' + joind)
    return list_
Beispiel #4
0
def quit_plural(valuelist):
    """Singularise terms ending in ``s`` and return the de-duplicated result.

    Every entry of ``valuelist`` is rewritten in place: commas are removed and
    hyphens become spaces. Entries ending in ``s`` are then reduced word by
    word with suffix rules, unless one of their words appears in
    ``./data/numberlist_es``, in which case the original entry is restored.

    Args:
        valuelist: Mutable list of term strings; it is modified in place.

    Returns:
        list[str]: The distinct values of the rewritten ``valuelist`` in
        first-seen order (the previous ``set`` iteration order was arbitrary).
    """
    start_time = time()
    with open('./data/numberlist_es', 'r', encoding='utf-8') as number_file:
        # rstrip instead of ``line[:-1]`` so a last line without a newline
        # does not lose its final character.
        numbers = {line.rstrip('\n') for line in number_file}

    for i in valuelist:
        # NOTE(review): index() targets the first equal entry, which may be an
        # earlier duplicate rather than the current position; kept as-is.
        ind = valuelist.index(i)
        term = i.replace(',', '').replace('-', ' ')
        valuelist[ind] = term
        plu = ''
        if 'es' in term[-2:] or 's' in term[-1:]:
            slp = term.split(' ')

            # A term containing a number word is restored to its raw form.
            if not numbers.isdisjoint(slp):
                plu = i

            if not len(plu):
                for j in slp:
                    if (('es' in j[-2:] and 't' not in j[-3:-2]
                         and 'l' not in j[-3:-2]) or 'les' in j[-3:]):
                        # Drop the "-es" ending, then patch the stems that
                        # need an accent or a final "e" restored.
                        plu += ' ' + j[:-2]
                        if 'on' in plu[-2:]:
                            plu = ' ' + plu[:-2] + 'ón'
                        if 'v' in plu[-1:]:
                            plu = ' ' + plu + 'e'
                        if 'bl' in plu[-2:]:
                            plu = ' ' + plu + 'e'
                        if 'br' in plu[-2:]:
                            plu = ' ' + plu + 'e'
                    elif 's' in j[-1:]:
                        plu += ' ' + j[:-1]
                        pos = slp.index(j)
                        if pos > 0:
                            bef = slp[0]
                            if 'n' in bef[-1:] and 'ón' not in bef[-2:]:
                                # First word ends in "n" but not "ón": also
                                # strip the trailing "n" from the first word
                                # accumulated so far.
                                splb = plu.split(' ')
                                firts = splb[1]
                                if 'n' in firts[-1:]:
                                    plu0 = firts[:-1]
                                    join1 = ' '.join(splb[2:])
                                    plu = plu0 + ' ' + join1
                    else:
                        plu += ' ' + j

            ind = valuelist.index(term)
            valuelist[ind] = plu.strip()

    # Ordered de-duplication; repeated values are reported as removed.
    quit_plu = []
    deletes = []
    seen = set()
    for value in valuelist:
        if value in seen:
            deletes.append(value)
        else:
            seen.add(value)
            quit_plu.append(value)

    elapsed_time = time() - start_time
    removed = len(valuelist) - len(quit_plu)
    txt = 'PLURAL, DELETE' + ' (' + str(removed) + ') NEW LIST SIZE: (' + str(
        len(quit_plu)) + ') TIME: (' + str(elapsed_time) + ')'
    joind = ', '.join(deletes)
    print('PLURALES DELETE', removed, len(quit_plu), elapsed_time)
    conts_log.information(txt, 'TERMS REMOVED: ' + joind)
    return quit_plu
Beispiel #5
0
def delate_pattern(anotador):
    """Remove terms whose POS-tag sequence matches a blacklisted pattern.

    Each term longer than one character is split on spaces and tagged through
    the remote CoreNLP endpoint. Tags ``MD`` (aux), ``VB*`` (verb), ``NNP``
    (noun), ``RB`` (adv), ``JJ`` (adj) and ``CC`` (sconj) are mapped to
    categories; every other tag is ignored. A term is removed when

    * exactly one token was categorised and it is an adverb, or
    * two or three tokens were categorised, the term has the same number of
      space-separated words, and the category sequence is in the blacklist.

    Single-word terms tagged as aux/verb are replaced in ``anotador`` by the
    lemma produced by ``nlp``.

    Args:
        anotador: Mutable list of term strings; it is modified in place.

    Returns:
        list[str]: The same ``anotador`` object after replacements and removals.
    """
    # Tags that map straight to a category without lemmatisation.
    plain_categories = {'NNP': 'noun', 'RB': 'adv', 'JJ': 'adj', 'CC': 'sconj'}
    # Category sequences that mark a term for removal. Each pattern is listed
    # once, so a match is counted and logged a single time.
    blacklist = frozenset([
        ('aux', 'verb'), ('verb', 'aux'), ('verb', 'verb'), ('noun', 'verb'),
        ('noun', 'aux'), ('adv', 'adj'), ('adj', 'adv'), ('adv', 'aux'),
        ('aux', 'adv'), ('adv', 'verb'), ('noun', 'adv'), ('adv', 'noun'),
        ('verb', 'adv'), ('verb', 'noun'), ('aux', 'noun'), ('adj', 'noun'),
        ('noun', 'verb', 'verb'), ('noun', 'aux', 'verb'),
        ('noun', 'aux', 'aux'), ('noun', 'verb', 'aux'),
        ('noun', 'verb', 'noun'), ('noun', 'aux', 'noun'),
        ('verb', 'noun', 'noun'), ('noun', 'noun', 'verb'),
        ('aux', 'noun', 'noun'), ('noun', 'noun', 'aux'),
        ('aux', 'verb', 'noun'), ('noun', 'verb', 'adj'),
        ('verb', 'noun', 'adj'), ('noun', 'aux', 'adj'),
        ('noun', 'adv', 'adj'), ('adj', 'adv', 'adj'),
        ('noun', 'adv', 'sconj'), ('adj', 'sconj', 'adv'),
        ('aux', 'noun', 'adj'), ('verb', 'verb', 'verb'),
        ('adj', 'noun', 'adj'),
    ])

    start_time = time()
    deletes = []
    # Built once: the constructor arguments never change between terms.
    # If the lynx endpoint is down, https://corenlp.run/ is an alternative.
    pos_tagger = CoreNLPParser(
        'https://corenlp-tool.lynx-project.eu/?pipelineLanguage=es',
        tagtype='pos')

    for position, term in enumerate(anotador):
        if len(term) <= 1:
            continue
        words = term.split(' ')
        categories = []
        for t in pos_tagger.tag(words):
            print(t)
            token, tag = t[0], t[1]
            # startswith: the old ``tag[:1] == 'VB'`` compared one character
            # with two and could never be true.
            if tag == 'MD' or tag.startswith('VB'):
                doc = nlp(token)
                print(doc)
                lemma = ''.join(tok.lemma_ for tok in doc)
                if lemma == term:
                    lemma = token
                categories.append('aux' if tag == 'MD' else 'verb')
                if len(words) == 1:
                    # Single-word aux/verb terms are stored as their lemma.
                    anotador[position] = str(lemma)
            elif tag in plain_categories:
                categories.append(plain_categories[tag])

        shape = tuple(categories)
        if len(shape) == 1:
            # A lone adverb is dropped whatever the number of words.
            matched = shape[0] == 'adv'
        else:
            matched = len(shape) == len(words) and shape in blacklist
        if matched:
            deletes.append(term)

    for term in deletes:
        if term in anotador:
            anotador.remove(term)

    cont = len(deletes)
    elapsed_time = time() - start_time
    txt = 'PATRONES, DELETE' + ' (' + str(cont) + ') NEW LIST SIZE: (' + str(
        len(anotador)) + ') TIME: (' + str(elapsed_time) + ')'
    joind = ', '.join(deletes)
    print('PATRONES DELETE', cont, len(anotador), elapsed_time)
    conts_log.information(txt, 'TERMS REMOVED: ' + joind)
    return anotador
Beispiel #6
0
def delete_pattern(anotador, pos_tagger):
    """Remove terms whose UPOS sequence matches a blacklisted pattern.

    Each term longer than one character is passed to ``pos_tagger``; the words
    of the first returned sentence are read through their ``text`` and ``upos``
    attributes. ``AUX``, ``VERB``, ``NOUN``, ``ADV``, ``ADJ`` and ``SCONJ`` are
    mapped to categories; every other tag is ignored. A term is removed when

    * exactly one word was categorised and it is an adverb, or
    * two or three words were categorised, the term has the same number of
      space-separated words, and the category sequence is in the blacklist.

    Single-word terms tagged as aux/verb are replaced in ``anotador`` by the
    lemma produced by ``nlp``.

    Args:
        anotador: Mutable list of term strings; it is modified in place.
        pos_tagger: Callable taking a string and returning an object with a
            ``sentences`` list whose items expose ``words`` (presumably a
            Stanza pipeline -- confirm against the caller).

    Returns:
        list[str]: The same ``anotador`` object after replacements and removals.
    """
    # UPOS tags whose token is lemmatised before being categorised.
    lemmatised = {'AUX': 'aux', 'VERB': 'verb'}
    # UPOS tags that map straight to a category.
    plain_categories = {
        'NOUN': 'noun', 'ADV': 'adv', 'ADJ': 'adj', 'SCONJ': 'sconj'
    }
    # Category sequences that mark a term for removal. Each pattern is listed
    # once, so a match is counted and logged a single time.
    blacklist = frozenset([
        ('aux', 'verb'), ('verb', 'aux'), ('verb', 'verb'), ('noun', 'verb'),
        ('noun', 'aux'), ('adv', 'adj'), ('adj', 'adv'), ('adv', 'aux'),
        ('aux', 'adv'), ('adv', 'verb'), ('noun', 'adv'), ('adv', 'noun'),
        ('verb', 'adv'), ('verb', 'noun'), ('aux', 'noun'), ('adj', 'noun'),
        ('noun', 'verb', 'verb'), ('noun', 'aux', 'verb'),
        ('noun', 'aux', 'aux'), ('noun', 'verb', 'aux'),
        ('noun', 'verb', 'noun'), ('noun', 'aux', 'noun'),
        ('verb', 'noun', 'noun'), ('noun', 'noun', 'verb'),
        ('aux', 'noun', 'noun'), ('noun', 'noun', 'aux'),
        ('aux', 'verb', 'noun'), ('noun', 'verb', 'adj'),
        ('verb', 'noun', 'adj'), ('noun', 'aux', 'adj'),
        ('noun', 'adv', 'adj'), ('adj', 'adv', 'adj'),
        ('noun', 'adv', 'sconj'), ('adj', 'sconj', 'adv'),
        ('aux', 'noun', 'adj'), ('verb', 'verb', 'verb'),
        ('adj', 'noun', 'adj'),
    ])

    start_time = time()
    deletes = []

    for position, term in enumerate(anotador):
        print('this is i')
        print(term)
        if len(term) <= 1:
            continue

        parsed = pos_tagger(term)
        if not parsed.sentences:
            # Nothing to classify; keep the term instead of raising IndexError.
            continue
        tag = []
        for word in parsed.sentences[0].words:
            tag.append((word.text, word.upos))
            print(word.text)
            print(word.upos)
        print('this is tag ')
        print(tag)

        words = term.split(' ')
        categories = []
        for t in tag:
            print('this is t')
            print(t)
            token, upos = t
            if upos in lemmatised:
                lemma = ''.join(tok.lemma_ for tok in nlp(token))
                if lemma == term:
                    lemma = token
                categories.append(lemmatised[upos])
                if len(words) == 1:
                    # Write by position: looking the term up with index()
                    # fails once an earlier token has replaced it by a lemma.
                    anotador[position] = str(lemma)
            elif upos in plain_categories:
                categories.append(plain_categories[upos])

        shape = tuple(categories)
        if len(shape) == 1:
            # A lone adverb is dropped whatever the number of words.
            matched = shape[0] == 'adv'
        else:
            matched = len(shape) == len(words) and shape in blacklist
        if matched:
            deletes.append(term)

    for term in deletes:
        if term in anotador:
            anotador.remove(term)

    cont = len(deletes)
    elapsed_time = time() - start_time
    txt = 'PATRONES, DELETE' + ' (' + str(cont) + ') NEW LIST SIZE: (' + str(
        len(anotador)) + ') TIME: (' + str(elapsed_time) + ')'
    joind = ', '.join(deletes)
    print('PATRONES DELETE', cont, len(anotador), elapsed_time)
    conts_log.information(txt, 'TERMS REMOVED: ' + joind)
    return anotador
Beispiel #7
0
def wsidFunction(termIn, listcontext, definitions):
    """Pick the best sense of a term by querying the disambiguation service.

    For every context string, the term is located inside the lower-cased
    context and the span is POSTed, together with the candidate senses, to
    the ``disambiguate_demo`` endpoint. The response body is treated as a
    sequence of weights, one per sense (inferred from the ``max``/``index``
    usage here -- confirm against the service contract). The response with
    the highest top weight decides the returned sense, URI and context.

    Args:
        termIn: Term to disambiguate; matched case-insensitively.
        listcontext: Iterable of context strings containing the term. When
            empty or falsy, no request is made.
        definitions: Indexable pair where ``definitions[0]`` holds the sense
            texts and ``definitions[1]`` the identifiers/URIs at the same
            positions.

    Returns:
        Tuple ``(valid, uri_max, code, valid_context)``: the winning sense
        text, its identifier, the HTTP status code of the last request
        (``0`` if none was made) and the original context string that
        produced the winning weight.

        NOTE: when no context yields a usable response, ``valid``,
        ``uri_max`` and ``valid_context`` are the built-in ``str`` type
        object. This placeholder is kept so existing callers behave as
        before.

    Raises:
        ValueError: If the term does not occur in one of the contexts, or
            if a 200 response carries an empty weight list.
    """
    start_time = time()
    conts_log.information('-----WSID----', '')

    # Historical "no result" placeholders (see the NOTE in the docstring).
    valid = str
    uri_max = str
    valid_context = str
    code = 0

    # One (weight, sense, uri, original_context) record per context that was
    # scored successfully. Keeping the context inside the record prevents the
    # index drift that occurs when failed contexts are skipped.
    scored = []

    if listcontext:
        listdef = definitions[0]
        for sense_text in listdef:
            conts_log.information('Senses: ' + sense_text, '')
        listIde = definitions[1]

        term = termIn.lower()
        term_length = len(term)
        url_lkgp_status = 'http://entity-linking-lynx.apps.cybly.cloud/disambiguate_demo?'

        for raw_context in listcontext:
            context = raw_context.lower()
            conts_log.information('Context: ' + context, '')
            start = context.index(term)
            end = start + term_length

            auth_token = getToken()
            hed = {
                'Authorization': 'Bearer ' + auth_token,
                'accept': 'application/json',
                'Content-Type': 'application/json',
            }
            params = {
                'context': context,
                'start_ind': start,
                'end_ind': end,
                'senses': listdef,
            }
            response = requests.post(url_lkgp_status, params=params, headers=hed)
            code = response.status_code
            if code != 200:
                conts_log.error('Wsid code: ', code)
                continue

            try:
                pesos = response.json()
            except ValueError:
                # Body is not JSON. ValueError is the common base of the
                # stdlib and simplejson decode errors requests may raise.
                continue

            peso_max = max(pesos)  # highest weight for this context
            index_max = pesos.index(peso_max)  # position of the winning sense
            # Fall back to the placeholder when a definitions list is empty.
            sense = listdef[index_max] if len(listdef) else str
            sense_uri = listIde[index_max] if len(listIde) else str
            scored.append((peso_max, sense, sense_uri, raw_context))

    if scored:
        # max() returns the first record among ties, so the earliest context
        # wins when several share the highest weight.
        _, valid, uri_max, valid_context = max(scored, key=lambda rec: rec[0])

    conts_log.information('Result context: ' + str(valid_context),
                          'Result sense: ' + str(valid))
    elapsed_time = time() - start_time
    conts_log.information('Time wsid: ' + str(elapsed_time), '')
    conts_log.information('-------------', '')
    return (valid, uri_max, code, valid_context)