Example 1
def disambiguateEntity(candidates, weights, resolvedEntities, factorWeights,
                       maxCount, currentId, limit):
    if candidates:
        max_score = limit
        aging_factor = 0.01
        best_candidate = None
        # resolvedEntities is keyed by stringified ids (see computeCoherence),
        # so check and delete with the same key type
        if str(currentId) in resolvedEntities:
            del resolvedEntities[str(currentId)]
        candidates = normalizeTPs(candidates)
        for cand in candidates:
            candidate = cand[0]
            ss = cand[1]["ss"]
            associativeness = cand[1]["count"] / maxCount
            # normalizationFactor = maxCoherence(weights, min(10, len(resolvedEntities)))
            normalizationFactor = 1.0
            coherence = computeCoherence(candidate, resolvedEntities,
                                         weights) / normalizationFactor
            lastId = getPreviousOccurrence(utils.normalizeURL(candidate),
                                           resolvedEntities, currentId - 1)
            recency = 0.0
            if lastId > -1:
                age = abs(currentId - lastId)
                recency = (1 - aging_factor) ** age
            temporalPopularity = cand[1]["tp"]
            score = (factorWeights['wss'] * ss
                     + factorWeights['wc'] * coherence
                     + factorWeights['wa'] * associativeness
                     + factorWeights['wr'] * recency
                     + factorWeights['wt'] * temporalPopularity)
            #print("%s\tSCORE: %f\tSS: %f\tCoh: %f\tAssoc: %f\tRecency: %f" % (cand[0], score, ss, coherence, associativeness, recency))
            # prefer the shorter candidate on ties; skip disambiguation pages
            if (score > limit
                    and (score > max_score
                         or (score == max_score
                             and len(candidate) < len(best_candidate)))
                    and not isDisambiguation(candidate)):
                max_score = score
                best_candidate = candidate
        if best_candidate is None:
            # avoid normalizeURL(None) when nothing beat the threshold
            return "--NME--", max_score
        return utils.normalizeURL(best_candidate), max_score
    else:
        return "--NME--", 1.0
Example 2
    def _getEventDetailsFromOverview(self, event, n):
        url = utils.normalizeURL(base=self.DOMAIN, url=self.URL)
        title = utils.Soup.getTextAt(event, self.SELECTOR_EVENT_TITLE)

        startStr = utils.Soup.getTextAt(event, self.SELECTOR_EVENT_STARTTIME)
        if not startStr:
            logger.error(f'Cannot find start time on page {url}')
            return None
        startTime = datetime.strptime(startStr, self.FORMAT_EVENT_DATE)
        startTime = utils.normalizeDate(startTime,
                                        self.config['defaults']['timezone'])

        rawEvent = RawEvent(self.IDENTIFIER, f'{url}#{n}', title, startTime)

        location = utils.Soup.getTextAt(event, self.SELECTOR_EVENT_LOCATION)
        rawEvent.setLocation(location)

        majors = utils.Soup.getTextAt(event, self.SELECTOR_MAJORS)
        if majors and majors.lower().startswith('majors'):
            majors = majors[len('majors'):]  # drop the leading "Majors" label
        rawEvent.setAudience(majors)

        description, links = utils.Soup.tokenizeElemAt(event,
                                                       self.SELECTOR_EVENT_DESCRIPTION,
                                                       base=url)
        links = map(utils.normalizeURL(base=self.URL), links)
        links = set(filter(None, links))

        rawEvent.setDescription(description)
        rawEvent.setLinks(links)

        return rawEvent

    def getEventList(self):
        url = utils.normalizeURL(base=self.DOMAIN, url=self.URL)
        reqData = self.LIST_REQUEST_DATA.copy()
        backlogTime = datetime.now() - self.config['backlog']
        backlogTime = self.config['defaults']['timezone'].localize(backlogTime)
        reqData['endsAfter'] = backlogTime.replace(microsecond=0).isoformat()
        res = self.requester.fetchURL(url, data=reqData, json=True)

        events = set()
        for event in res['value']:
            eventURL = utils.normalizeURL(base=self.DOMAIN, url=self.EVENT_URL)
            eventURL = eventURL.format(event['id'])

            startTime = datetime.fromisoformat(event['startsOn'])
            startTime = utils.normalizeDate(startTime,
                                            self.config['defaults']['timezone'])

            rawEvent = RawEvent(self.IDENTIFIER,
                                eventURL,
                                event['name'],
                                startTime)

            if 'endsOn' in event:
                endTime = datetime.fromisoformat(event['endsOn'])
                endTime = utils.normalizeDate(endTime,
                                              self.config['defaults']['timezone'])
                rawEvent.setEnd(endTime)

            rawEvent.setLocation(event['location'])
            rawEvent.setExtras(', '.join(event.get('benefitNames', ())))

            soup = BeautifulSoup(event['description'], 'html.parser')
            description, links = utils.HTMLToText.tokenizeSoup(soup,
                                                               base=url,
                                                               customStyle=self.DESCRIPTION_STYLE)
            links = map(utils.normalizeURL(base=eventURL), links)
            links = set(filter(None, links))

            rawEvent.setDescription(description)
            rawEvent.setLinks(links)

            rawEvent.setStatus(event['status'])
            events.add(rawEvent)

        return events
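getEventList builds the `endsAfter` filter as an ISO 8601 timestamp: now minus a configured backlog, localized and truncated to whole seconds. A standalone sketch of that construction using the stdlib timezone type (the code above appears to use a pytz-style `.localize`; the 14-day backlog below is made up):

from datetime import datetime, timedelta, timezone

backlog = timedelta(days=14)  # hypothetical stand-in for config['backlog']
backlogTime = datetime.now(timezone.utc) - backlog
endsAfter = backlogTime.replace(microsecond=0).isoformat()
print(endsAfter)  # e.g. 2021-05-01T12:34:56+00:00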
Example 4
def appendViews(c, timePickle):
    # attach each candidate's temporal popularity ('tp') from the view lookup
    for cand in c:
        entity = utils.normalizeURL(cand[0])
        cand[1]['tp'] = timePickle.get(entity, 0.0)
    return c
Example 5
def appendViews(c, timePickle):
    # attach each candidate's temporal popularity ('tp') from the view lookup
    for cand in c:
        entity = utils.normalizeURL(cand[0])
        cand[1]['tp'] = timePickle.get(entity, 0.0)
    return c
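Both variants above expect candidates shaped like `(url, featureDict)` pairs and a mapping from a normalized entity to a view score. A self-contained illustration with made-up data and an identity stand-in for `utils.normalizeURL`:

def normalizeURL(u):  # identity stub; the real helper lives in utils
    return u

candidates = [
    ["http://dbpedia.org/resource/Paris", {"count": 12, "ss": 0.8}],
    ["http://dbpedia.org/resource/Paris,_Texas", {"count": 3, "ss": 0.5}],
]
timePickle = {"http://dbpedia.org/resource/Paris": 41.0}

for cand in candidates:  # the same loop appendViews runs
    cand[1]['tp'] = timePickle.get(normalizeURL(cand[0]), 0.0)

print(candidates[0][1]['tp'], candidates[1][1]['tp'])  # 41.0 0.0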
Example 6
def disambiguateEntity(candidates, weights, resolvedEntities, factorWeights,
                       maxCount, currentId, limit):
    if candidates:
        max_score = limit
        aging_factor = 0.01
        best_candidate = None
        # resolvedEntities is keyed by stringified ids (see computeCoherence),
        # so check and delete with the same key type
        if str(currentId) in resolvedEntities:
            del resolvedEntities[str(currentId)]
        candidates = normalizeTPs(candidates)
        for cand in candidates:
            candidate = cand[0]
            ss = cand[1]["ss"]
            associativeness = cand[1]["count"] / maxCount
            # normalizationFactor = maxCoherence(weights, min(10, len(resolvedEntities)))
            normalizationFactor = 1.0
            coherence = computeCoherence(candidate, resolvedEntities,
                                         weights) / normalizationFactor
            lastId = getPreviousOccurrence(utils.normalizeURL(candidate),
                                           resolvedEntities, currentId - 1)
            recency = 0.0
            if lastId > -1:
                age = abs(currentId - lastId)
                recency = (1 - aging_factor) ** age
            temporalPopularity = cand[1]["tp"]
            score = (factorWeights['wss'] * ss
                     + factorWeights['wc'] * coherence
                     + factorWeights['wa'] * associativeness
                     + factorWeights['wr'] * recency
                     + factorWeights['wt'] * temporalPopularity)
            #print("%s\tSCORE: %f\tSS: %f\tCoh: %f\tAssoc: %f\tRecency: %f" % (cand[0], score, ss, coherence, associativeness, recency))
            # prefer the shorter candidate on ties; skip disambiguation pages
            if (score > limit
                    and (score > max_score
                         or (score == max_score
                             and len(candidate) < len(best_candidate)))
                    and not isDisambiguation(candidate)):
                max_score = score
                best_candidate = candidate
        if best_candidate is None:
            # avoid normalizeURL(None) when nothing beat the threshold
            return "--NME--", max_score
        return utils.normalizeURL(best_candidate), max_score
    else:
        return "--NME--", 1.0
Example 7
def computeCoherence(newEntity, previousEntities, w):
    # sum distance-weighted coherence between the new entity and the
    # previously resolved entities, walking backwards from the newest one
    total = 0.0
    current_id = len(previousEntities) + 1
    other_id = current_id - 1
    while other_id > 0 and str(current_id - other_id) in w:
        diff = abs(current_id - other_id)
        weight = w[str(diff)]
        max_score = 0.0
        if diff == 1 or shouldITry(max_score, total, diff, current_id, w):
            # total += computePairCoherence(graph.node[other_id]['eid'], newEntity.replace('http://dbpedia.org/resource/', ''), weight)
            if str(other_id) in previousEntities and previousEntities[str(other_id)] != '--NME--':
                total += computeShortestPathCoherence(previousEntities[str(other_id)],
                                                      utils.normalizeURL(newEntity), weight)
            other_id -= 1
        else:
            break
    return total
Example 8
        def _getLink(elem, base):
            def _matchesEventLink(link):
                if not link:
                    return False

                try:
                    parsed = urlparse(link)
                except ValueError:
                    return False
                return (parsed.netloc == 'www.chemistry.gatech.edu'
                        and parsed.path.startswith(self.LINK_PREFIX_INCLUDE))

            links = elem.select(self.SELECTOR_EVENT_LINK)
            links = map(utils.Soup.getElemLink, links)
            links = map(utils.normalizeURL(base=base), links)
            links = filter(_matchesEventLink, links)
            return utils.firstOrNone(links)
Example 9
def generateCandidatesWithLOTUS(mention, minSize=10, maxSize=100):
    # look up candidates in the Redis cache first; on a miss, query LOTUS
    # and cache the pickled result under a "lotus:" key
    normalized = utils.normalizeURL(mention)
    fromCache = rds.get("lotus:%s" % normalized)
    if fromCache:
        cands = pickle.loads(fromCache)
    else:
        cands = getCandidatesForLemma(mention, minSize, maxSize)
        cands = cleanRedirects(cands)
        rds.set("lotus:" + normalized, pickle.dumps(cands))
    sortedCands = sorted(cands.items(), key=lambda x: x[1]["count"], reverse=True)
    maxCount = getMaxCount(cands.items())
    return sortedCands, maxCount
Example 10
def generateCandidatesWithLOTUS(mention, minSize=10, maxSize=100):
    normalized = utils.normalizeURL(mention)
    fromCache = rds.get("lotus:%s" % normalized)
    if fromCache:
        cands = pickle.loads(fromCache)
    else:
        cands = getCandidatesForLemma(mention, minSize, maxSize)
        cands = cleanRedirects(cands)
        rds.set("lotus:" + normalized, pickle.dumps(cands))
    sortedCands = sorted(cands.items(),
                         key=lambda x: x[1]["count"],
                         reverse=True)
    maxCount = getMaxCount(cands.items())
    return sortedCands, maxCount
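generateCandidatesWithLOTUS uses a cache-aside pattern: try Redis first, and on a miss compute the result and store it pickled. A minimal standalone sketch of the same pattern, assuming a local Redis server and a hypothetical `expensiveLookup` in place of the LOTUS query:

import pickle

import redis

rds = redis.Redis()  # assumes a Redis server on localhost:6379

def expensiveLookup(term):
    # hypothetical stand-in for getCandidatesForLemma + cleanRedirects
    return {term.lower(): {"count": 1}}

def cachedLookup(term):
    key = "lotus:%s" % term
    cached = rds.get(key)
    if cached:
        return pickle.loads(cached)  # cache hit: unpickle the stored result
    result = expensiveLookup(term)
    rds.set(key, pickle.dumps(result))  # cache miss: compute, then store
    return result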
Example 11
def computeCoherence(newEntity, previousEntities, w):
    # sum distance-weighted coherence between the new entity and the
    # previously resolved entities, walking backwards from the newest one
    total = 0.0
    current_id = len(previousEntities) + 1
    other_id = current_id - 1
    while other_id > 0 and str(current_id - other_id) in w:
        diff = abs(current_id - other_id)
        weight = w[str(diff)]
        max_score = 0.0
        if diff == 1 or shouldITry(max_score, total, diff, current_id, w):
            # total += computePairCoherence(graph.node[other_id]['eid'], newEntity.replace('http://dbpedia.org/resource/', ''), weight)
            if str(other_id) in previousEntities and previousEntities[str(other_id)] != '--NME--':
                total += computeShortestPathCoherence(previousEntities[str(other_id)],
                                                      utils.normalizeURL(newEntity), weight)
            other_id -= 1
        else:
            break
    return total
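The while condition bounds how far back computeCoherence looks: the walk stops as soon as the distance to the earlier mention has no entry in the weight table `w`, whose keys are stringified distances. A toy, self-contained trace with hypothetical decaying weights:

w = {"1": 0.5, "2": 0.25, "3": 0.125}  # hypothetical distance -> weight table
previousEntities = {"1": "Paris", "2": "--NME--", "3": "France", "4": "Seine"}

current_id = len(previousEntities) + 1  # 5: the mention being resolved
other_id = current_id - 1
visited = []
while other_id > 0 and str(current_id - other_id) in w:
    visited.append((other_id, w[str(current_id - other_id)]))
    other_id -= 1
print(visited)  # [(4, 0.5), (3, 0.25), (2, 0.125)]; entity "1" is out of range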
Example 12
    def getEventList(self):
        events = set()
        lastURL = self.DOMAIN
        nextURL = self.URL

        def _getLink(elem, base):
            def _matchesEventLink(link):
                if not link:
                    return False

                try:
                    parsed = urlparse(link)
                except ValueError:
                    return False
                return (parsed.netloc == 'www.chemistry.gatech.edu'
                        and parsed.path.startswith(self.LINK_PREFIX_INCLUDE))

            links = elem.select(self.SELECTOR_EVENT_LINK)
            links = map(utils.Soup.getElemLink, links)
            links = map(utils.normalizeURL(base=base), links)
            links = filter(_matchesEventLink, links)
            return utils.firstOrNone(links)

        while True:
            nextURL = utils.normalizeURL(base=lastURL, url=nextURL)
            overview = self.requester.fetchURL(nextURL)
            if not overview:
                break

            soup = BeautifulSoup(overview, 'html.parser')
            evs = soup.select(self.SELECTOR_EVENTS)
            evs = map(lambda l: _getLink(l, nextURL), evs)
            events |= set(filter(None, evs))

            lastURL = nextURL
            nextURL = utils.Soup.getLinkAt(soup, self.SELECTOR_NEXT_PAGE)
            if not nextURL:
                break
        return events
Example 13
    def _getEventDetails(self, eventURL):
        details = self.requester.fetchURL(eventURL)
        if not details:
            return None

        soup = BeautifulSoup(details, 'html.parser')

        title1 = utils.Soup.getTextAt(soup, self.SELECTOR_EVENT_TITLE1)
        title2 = utils.Soup.getTextAt(soup, self.SELECTOR_EVENT_TITLE2)
        if title1 and title2:
            title = '%s: %s' % (title1, title2)
        else:
            title = utils.firstOrNone(filter(None, (title1, title2)))

        singleStr = utils.Soup.getTextAt(soup, self.SELECTOR_EVENT_TIMESINGLE)
        startStr = utils.Soup.getTextAt(soup, self.SELECTOR_EVENT_TIMESTART)
        endStr = utils.Soup.getTextAt(soup, self.SELECTOR_EVENT_TIMEEND)
        startTime, endTime = self._parseEventTime(singleStr, startStr, endStr)
        startTime = utils.normalizeDate(startTime,
                                        self.config['defaults']['timezone'])

        event = RawEvent(self.IDENTIFIER, eventURL, title, startTime)
        event.setEnd(
            utils.normalizeDate(endTime, self.config['defaults']['timezone']))
        event.setLocation(
            utils.Soup.getTextAt(soup, self.SELECTOR_EVENT_LOCATION))

        description, links = utils.Soup.tokenizeElemAt(
            soup, self.SELECTOR_EVENT_DESCRIPTION, base=eventURL)
        links |= set(utils.Soup.getLinksAt(soup, self.SELECTOR_EVENT_LINKS))
        links = map(utils.normalizeURL(base=eventURL), links)
        links = set(filter(None, links))
        event.setDescription(description)
        event.setLinks(links)

        return event
Example 14
if __name__ == "__main__":
    if len(sys.argv) < 3:
        print("Not enough arguments!!!")
        print("python run_naf.py {CORPUS/PATH} {FILENAME.TSV}")
        sys.exit(1)
    corpus = sys.argv[1]
    myFile = sys.argv[2]
    if not os.path.isfile(myFile):
        myConll = ""
        corpus = corpus.strip('/')
        for file in os.listdir(corpus):
            if not file.endswith(".xml") and not file.endswith(".naf"):
                continue
            print(file)
            filename = corpus + '/' + file
            myXml, entities, mentions = utils.naf2inlineEntities(filename, True)
            da = dis_agdistis.disambiguate(myXml, "agdistis")
            for agd_entity in da:
                offset = str(agd_entity["start"])
                agd_link = utils.normalizeURL(str(agd_entity["disambiguatedURL"]))
                goldlink = utils.checkRedirects(utils.normalizeURL(str(entities[offset])))
                id = file + offset
                v1, v2 = utils.getRanks(goldlink, agd_link)
                mention = mentions[offset]
                myConll += "%s\t%s\t%s\t%s\t%f\t%f\t%s\n" % (id, goldlink, agd_link, corpus, v1, v2, mention)
        # write results with a context manager so the file is flushed and closed
        with open(myFile, "w") as w:
            w.write(myConll)
    p, r, f1 = utils.computeStats(myFile)

    print("Precision: %f, Recall: %f, F1-value: %f" % (p, r, f1))
Example 15
    def getEventList(self):
        url = utils.normalizeURL(base=self.DOMAIN, url=self.URL)
        lastPosted = datetime.now()
        searchBacklog = timedelta(days=self.config['search_backlog'])
        crawlUntil = lastPosted - searchBacklog
        page = 0

        events = set()
        while lastPosted > crawlUntil:
            reqData = self.LIST_REQUEST_DATA.copy()
            reqData['page'] = page
            eventList = self.requester.fetchURL(url,
                                                method='POST',
                                                data=reqData)

            if not eventList:
                break

            res = json.loads(eventList)
            if (not isinstance(res, list)
                    or self._getHTMLInsertEntry(res) is None):
                logger.error(f'Malformed response from {url}: {eventList}')
                break

            res = self._getHTMLInsertEntry(res).get('data', '')
            soup = BeautifulSoup(res, 'html.parser')

            for event in soup.select(self.SELECTOR_EVENT):
                link = utils.Soup.getLinkAt(event, 'a')
                if link:
                    normalized = utils.normalizeURL(base=url, url=link)
                    if normalized:
                        events.add(normalized)

                posted = utils.Soup.getTextAt(event, self.SELECTOR_EVENT_POSTED)  # read from this event's entry, not the whole page
                if posted:
                    try:
                        postedTime = datetime.strptime(posted,
                                                       self.FORMAT_POSTED)
                        lastPosted = min(lastPosted, postedTime)
                    except ValueError:
                        logger.warning(f'Unable to parse posted time {posted}')
                        lastPosted += timedelta(days=1)

            if not soup.select(self.SELECTOR_NEXT_PAGE):
                break

            page += 1

        updatesSince = datetime.now() - searchBacklog
        requestTimestamp = int(updatesSince.timestamp())
        url = self.UPDATED_URL.format(requestTimestamp)
        url = utils.normalizeURL(base=self.DOMAIN, url=url)
        updatedEventList = self.requester.fetchURL(url, json=True)

        if updatedEventList is not None:
            for eventId in updatedEventList:
                url = self.EVENT_URL.format(eventId)
                events.add(utils.normalizeURL(base=self.DOMAIN, url=url))
        else:
            logger.warning('Cannot fetch recently updated events!')

        return events
Example 16
def run(corpus, myFile, topic, aggregatedTopics):
    if not os.path.isfile(myFile) or aggregatedTopics:
        entitiesNumber = 0
        with open(corpus, "r") as myCorpus:
            currentArticle = ""
            currentTopic = ""
            myConll = ""
            openEntity = False
            articleEntities = 0
            registeredEntities = 0
            relevant = False
            offset = 0
            tid = 1
            allTokens = {}
            goldEntities = {}
            goldMentions = {}
            print("Topic is %s" % topic)
            for line in myCorpus:
                if line.startswith("-DOCSTART-"):
                    if 'testb' in line:
                        if currentArticle != "":
                            if openEntity:
                                openEntity = False
                                allTokens[str(tid - 1)]['text'] += '</entity>'
                                registeredEntities += 1
                            if registeredEntities < articleEntities:
                                print(registeredEntities, articleEntities)
                                sys.exit(0)
                            if not aggregatedTopics:
                                articleEntities = 0
                                registeredEntities = 0
                                myXml = utils.composeText(allTokens)
                                da = dis_agdistis.disambiguate(
                                    myXml, "agdistis")
                                for agd_entity in sorted(
                                        da, key=lambda k: k['start']):
                                    offset = str(agd_entity["start"])
                                    agd_link = utils.normalizeURL(
                                        agd_entity["disambiguatedURL"])
                                    goldlink = utils.checkRedirects(
                                        utils.normalizeURL(
                                            goldEntities[offset]))
                                    id = currentArticle + offset
                                    mention = goldMentions[offset]
                                    v1, v2 = utils.getRanks(goldlink, agd_link)
                                    myConll += "%s\t%s\t%s\t%s\t%f\t%f\t%s\n" % (
                                        id, goldlink, agd_link, currentTopic,
                                        v1, v2, mention)
                        testB = True
                        line = line.strip()
                        articleInfo = line.split('\t')
                        currentTopic = articleInfo[1]
                        if aggregatedTopics and topic != currentTopic:
                            relevant = False
                            currentArticle = ''
                        else:
                            currentArticle = articleInfo[0]
                            relevant = True
                            print("Article %s has topic %s." %
                                  (currentArticle, currentTopic))
                        if not aggregatedTopics:
                            offset = 0
                            tid = 1
                            allTokens = {}
                            goldEntities = {}
                            goldMentions = {}
                    else:
                        testB = False
                elif testB and relevant:
                    tokenInfo = line.split('\t')
                    text = tokenInfo[0]
                    if tokenInfo[1].strip() != 'I' and openEntity is True:
                        openEntity = False
                        allTokens[str(tid - 1)]['text'] += '</entity>'
                        registeredEntities += 1
                    if tokenInfo[1].strip() == 'B':
                        goldMentions[str(offset)] = tokenInfo[2].strip()
                        entitiesNumber += 1
                        articleEntities += 1
                        if tokenInfo[3] == '--NME--':
                            goldEntities[str(offset)] = tokenInfo[3]
                        else:
                            goldEntities[str(offset)] = tokenInfo[4]
                        text = '<entity>' + text
                        if tokenInfo[0].strip() == tokenInfo[2].strip():
                            text += '</entity>'
                            registeredEntities += 1
                        else:
                            openEntity = True

                    allTokens[str(tid)] = {'text': text, 'offset': str(offset)}
                    offset += len(tokenInfo[0]) + 1
                    tid += 1

            if openEntity:
                allTokens[str(tid - 1)]['text'] += '</entity>'
                registeredEntities += 1
            if registeredEntities < articleEntities:
                print(registeredEntities, articleEntities)
                sys.exit(0)
            if currentArticle or aggregatedTopics:
                if aggregatedTopics:
                    currentTopic = topic
                myXml = utils.composeText(allTokens)
                da = dis_agdistis.disambiguate(myXml, "agdistis")
                for agd_entity in sorted(da, key=lambda k: k['start']):
                    offset = str(agd_entity["start"])
                    agd_link = utils.normalizeURL(
                        agd_entity["disambiguatedURL"])
                    goldlink = utils.checkRedirects(
                        utils.normalizeURL(goldEntities[offset]))
                    mention = goldMentions[offset]
                    id = currentArticle + offset
                    v1, v2 = utils.getRanks(goldlink, agd_link)
                    print(v1, v2)
                    myConll += "%s\t%s\t%s\t%s\t%f\t%f\t%s\n" % (
                        id, goldlink, agd_link, currentTopic, v1, v2, mention)

        print(entitiesNumber)
        with open(myFile, "a") as w:
            w.write(myConll)
Example 17
    def _getEventDetails(self, eventURL):
        details = self.requester.fetchURL(f'{eventURL}/xml',
                                          errorOnCode=(403, ))
        if not details:
            return None

        soup = BeautifulSoup(details, 'xml')
        if utils.Soup.getTextAt(soup, self.SELECTOR_TYPE) != 'event':
            return set()
        title = utils.Soup.getTextAt(soup, self.SELECTOR_EVENT_TITLE)
        location = utils.Soup.getTextAt(soup, self.SELECTOR_EVENT_LOCATION)

        description = utils.Soup.getTextAt(soup,
                                           self.SELECTOR_EVENT_DESCRIPTION)
        links = set()
        if description:
            descSoup = BeautifulSoup(description, 'html.parser')
            description, links = utils.HTMLToText.tokenizeSoup(descSoup,
                                                               base=eventURL)

        relatedLinks = utils.Soup.getTextsAt(soup, self.SELECTOR_EVENT_LINKS)
        if relatedLinks:
            links |= set(relatedLinks)

        urls = utils.Soup.getTextsAt(soup, self.SELECTOR_EVENT_URLS)
        if urls:
            links |= set(urls)
        links = map(utils.normalizeURL(base=eventURL), links)
        links = set(filter(None, links))

        audience = utils.Soup.getTextsAt(soup, self.SELECTOR_EVENT_AUDIENCE)
        if audience:
            audience = ', '.join(audience)

        extras = utils.Soup.getTextsAt(soup, self.SELECTOR_EVENT_EXTRAS)
        if extras:

            def _cleanExtra(xtr):
                return xtr.replace('_', ' ').capitalize()

            extras = ', '.join(map(_cleanExtra, extras))

        # need to fetch human readable site because XML does not contain status
        status = None
        details2 = self.requester.fetchURL(eventURL, errorOnCode=(403, ))
        soup2 = BeautifulSoup(details2 or '', 'lxml')
        if details2:
            # HTML tree is broken sometimes, which will confuse the python parser
            statusBlock = self._getBlockFromList(
                soup2, self.SELECTOR_EVENT_HTMLMETADATA, 'status')

            if statusBlock:
                statusElem = self._getBlockDetail(statusBlock,
                                                  'workflow status')
                status = utils.Soup.getElemText(statusElem)
                status = status.lower()

        # parse dates from HTML, because timezones and recurring events are
        # entirely messed up in the XML (some rrules are really weird and in no
        # way generate the event instances listed in HTML, eg http://hg.gatech.edu/node/623952)
        htmlTimes = soup2.select(self.SELECTOR_EVENT_HTMLTIMES)

        events = set()
        for n, timeEntry in enumerate(htmlTimes):
            startTime, endTime = self._parseHTMLEventTime(timeEntry)
            if not startTime:
                logger.error(
                    f'Cannot parse event time for {eventURL}: {utils.Soup.getElemText(timeEntry)}'
                )
                continue

            rawEvent = RawEvent(self.IDENTIFIER, f'{eventURL}#{n}', title,
                                startTime)
            rawEvent.setEnd(endTime)
            rawEvent.setDescription(description)
            rawEvent.setLinks(links)
            if location:
                rawEvent.setLocation(location)
            if audience:
                rawEvent.setAudience(audience)
            if extras:
                rawEvent.setExtras(extras)
            if status:
                rawEvent.setStatus(status)
            events.add(rawEvent)
        return events
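utils.HTMLToText.tokenizeSoup is project-specific, but the core move it presumably wraps (turning an HTML description into plain text plus its links) can be sketched with plain BeautifulSoup calls; the HTML below is a toy input:

from bs4 import BeautifulSoup

html = '<p>Seminar <a href="/talk">details</a></p>'  # toy description HTML
soup = BeautifulSoup(html, 'html.parser')
text = soup.get_text(' ', strip=True)  # 'Seminar details'
links = [a.get('href') for a in soup.find_all('a')]  # ['/talk']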
Example 18
if __name__ == "__main__":
    if len(sys.argv) < 3:
        print("Not enough arguments!!!")
        print("python run_naf.py {CORPUS/PATH} {FILENAME.TSV}")
        sys.exit(1)
    corpus = sys.argv[1]
    myFile = sys.argv[2]
    if not os.path.isfile(myFile):
        myConll = ""
        corpus = corpus.strip('/')
        for file in os.listdir(corpus):
            if not file.endswith(".xml") and not file.endswith(".naf"):
                continue
            print(file)
            filename = corpus + '/' + file
            myXml, entities, mentions = utils.naf2inlineEntities(
                filename, True)
            da = dis_agdistis.disambiguate(myXml, "agdistis")
            for agd_entity in da:
                offset = str(agd_entity["start"])
                agd_link = utils.normalizeURL(
                    str(agd_entity["disambiguatedURL"]))
                goldlink = utils.checkRedirects(
                    utils.normalizeURL(str(entities[offset])))
                id = file + offset
                v1, v2 = utils.getRanks(goldlink, agd_link)
                mention = mentions[offset]
                myConll += "%s\t%s\t%s\t%s\t%f\t%f\t%s\n" % (
                    id, goldlink, agd_link, corpus, v1, v2, mention)
        # write results with a context manager so the file is flushed and closed
        with open(myFile, "w") as w:
            w.write(myConll)
    p, r, f1 = utils.computeStats(myFile)

    print("Precision: %f, Recall: %f, F1-value: %f" % (p, r, f1))
Example 19
def run(corpus, myFile, topic, aggregatedTopics):
    if not os.path.isfile(myFile) or aggregatedTopics:
        entitiesNumber = 0
        with open(corpus, "r") as myCorpus:
            currentArticle = ""
            currentTopic = ""
            myConll = ""
            openEntity = False
            articleEntities = 0
            registeredEntities = 0
            relevant = False
            offset = 0
            tid = 1
            allTokens = {}
            goldEntities = {}
            goldMentions = {}
            print("Topic is %s" % topic)
            for line in myCorpus:
                if line.startswith("-DOCSTART-"):
                    if 'testb' in line:
                        if currentArticle != "":
                            if openEntity:
                                openEntity = False
                                allTokens[str(tid - 1)]['text'] += '</entity>'
                                registeredEntities += 1
                            if registeredEntities < articleEntities:
                                print(registeredEntities, articleEntities)
                                sys.exit(0)
                            if not aggregatedTopics:
                                articleEntities = 0
                                registeredEntities = 0
                                myXml = utils.composeText(allTokens)
                                da = dis_agdistis.disambiguate(myXml, "agdistis")
                                for agd_entity in sorted(da, key=lambda k: k['start']):
                                    offset = str(agd_entity["start"])
                                    agd_link = utils.normalizeURL(agd_entity["disambiguatedURL"])
                                    goldlink = utils.checkRedirects(utils.normalizeURL(goldEntities[offset]))
                                    id = currentArticle + offset
                                    mention = goldMentions[offset]
                                    v1, v2 = utils.getRanks(goldlink, agd_link)
                                    myConll += "%s\t%s\t%s\t%s\t%f\t%f\t%s\n" % (id, goldlink, agd_link, currentTopic, v1, v2, mention)
                        testB = True
                        line = line.strip()
                        articleInfo = line.split('\t')
                        currentTopic = articleInfo[1]
                        if aggregatedTopics and topic != currentTopic:
                            relevant = False
                            currentArticle = ''
                        else:
                            currentArticle = articleInfo[0]
                            relevant = True
                            print("Article %s has topic %s." % (currentArticle, currentTopic))
                        if not aggregatedTopics:
                            offset = 0
                            tid = 1
                            allTokens = {}
                            goldEntities = {}
                            goldMentions = {}
                    else:
                        testB = False
                elif testB and relevant:
                    tokenInfo = line.split('\t')
                    text = tokenInfo[0]
                    if tokenInfo[1].strip() != 'I' and openEntity is True:
                        openEntity = False
                        allTokens[str(tid - 1)]['text'] += '</entity>'
                        registeredEntities += 1
                    if tokenInfo[1].strip() == 'B':
                        goldMentions[str(offset)] = tokenInfo[2].strip()
                        entitiesNumber += 1
                        articleEntities += 1
                        if tokenInfo[3] == '--NME--':
                            goldEntities[str(offset)] = tokenInfo[3]
                        else:
                            goldEntities[str(offset)] = tokenInfo[4]
                        text = '<entity>' + text
                        if tokenInfo[0].strip() == tokenInfo[2].strip():
                            text += '</entity>'
                            registeredEntities += 1
                        else:
                            openEntity = True

                    allTokens[str(tid)] = {'text': text, 'offset': str(offset)}
                    offset += len(tokenInfo[0]) + 1
                    tid += 1

            if openEntity:
                allTokens[str(tid - 1)]['text'] += '</entity>'
                registeredEntities += 1
            if registeredEntities < articleEntities:
                print(registeredEntities, articleEntities)
                sys.exit(0)
            if currentArticle or aggregatedTopics:
                if aggregatedTopics:
                    currentTopic = topic
                myXml = utils.composeText(allTokens)
                da = dis_agdistis.disambiguate(myXml, "agdistis")
                for agd_entity in sorted(da, key=lambda k: k['start']):
                    offset = str(agd_entity["start"])
                    agd_link = utils.normalizeURL(agd_entity["disambiguatedURL"])
                    goldlink = utils.checkRedirects(utils.normalizeURL(goldEntities[offset]))
                    mention = goldMentions[offset]
                    id = currentArticle + offset
                    v1, v2 = utils.getRanks(goldlink, agd_link)
                    print(v1, v2)
                    myConll += "%s\t%s\t%s\t%s\t%f\t%f\t%s\n" % (id, goldlink, agd_link, currentTopic, v1, v2, mention)

        print(entitiesNumber)
        with open(myFile, "a") as w:
            w.write(myConll)