Esempio n. 1
0
def main():
    """
    Exploratory driver that prints statistics about the annotated corpus.

    For every ``.ann`` file found directly in ``mypath`` the after links
    (regular and extra) are clustered, and each cluster is turned into a
    sequence of ``'<type>-<subtype>'`` strings: the first event of every
    pair in the cluster, followed by the second event of the last pair.
    The sequences are joined with ``'#'``, counted and printed.

    The remainder of the function runs a few ad-hoc checks (clusters of one
    hard-coded document, average story event-pair distances, ordered
    sequences, pair probabilities, coreference links) and prints the results.
    """
    scriptsDatabase = []
    onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]
    for file in onlyfiles:
        fileNoExtension, extension = os.path.splitext(file)
        # fileNoExtension = 'bolt-eng-DF-170-181125-9125545'
        # file = 'bolt-eng-DF-170-181125-9125545.ann'
        logging.debug("This is the file I am processing: %s", fileNoExtension)
        if extension != '.ann':
            continue
        script = event.Script(fileNoExtension)
        eventsTagsPairList = getListOfAfterLinks(script) + getListOfExtraAfterLinks(script)
        logging.debug("This is the list of unordered events: %s", eventsTagsPairList)
        orderedEventsLists = createEventsClusters(eventsTagsPairList)
        logging.debug("This is the list of ordered events: %s", orderedEventsLists)
        for oel in orderedEventsLists:
            eventList = oel[0]
            if not eventList:
                # An empty cluster has no "last pair"; without this guard the
                # code would raise NameError or reuse the pair left over from
                # the previous cluster.
                continue
            # Chain: first event of every pair, then the tail of the last pair.
            eventIds = [ev[0] for ev in eventList]
            eventIds.append(eventList[-1][1])
            eventsSequenceList = []
            for eventId in eventIds:
                eventType, eventSubtype = script.getEventDetailsByEventId(eventId)[:2]
                eventsSequenceList.append(eventType + '-' + eventSubtype)
            scriptsDatabase.append([script.scriptName, eventsSequenceList])
    scripts = ['#'.join(x[1]) for x in scriptsDatabase]

    counts = Counter(scripts)
    print(counts)

    # Ad-hoc inspection of a single, hard-coded document.
    script = event.Script('bolt-eng-DF-170-181125-9125545')
    eventsTagsPairList = getListOfAfterLinks(script)
    orderedEventsLists = createEventsClusters(eventsTagsPairList)
    print(orderedEventsLists)

    distances = getAverageStoryEventPairsDistance()
    print(distances)
    print(numpy.mean(distances))
    # The results of the following calls are not used here; the calls are
    # kept because they were part of the original run (they log as they go).
    computeOrderedSequenceList()
    computePairsProbabilities()

    getCoreferenceLinks()
    print(1)
Esempio n. 2
0
def _encodeEventTypeSubtype(script, eventId):
    """
    Encode one event as ``'<type number>-<subtype index>'``.

    :param script: script object providing ``getEventDetailsByEventId`` and ``scriptName``.
    :param eventId: identifier of the event to look up.
    :return: the encoded string, or ``None`` when the event details could not
        be retrieved (the failure is logged).
    """
    try:
        eventType, eventSubtype, _, _, _, _ = script.getEventDetailsByEventId(eventId)
    except Exception:
        logging.info('Scramble issue with %s', script.scriptName)
        return None
    logging.debug("Events: %s, %s", eventType, eventSubtype)
    eventTypeNumber = c.EVENTTYPES[eventType]
    eventSubTypeNumber = c.EVENTSUBTYPES[eventTypeNumber].index(eventSubtype)
    return str(eventTypeNumber) + '-' + str(eventSubTypeNumber)


def computePairsProbabilities():
    """
    Count type/subtype transitions over the event pairs of every file in ``mypath``.

    For each file the extra after links are clustered into ordered event
    pairs, and every pair is encoded as ``'<t1>-<s1>e<t2>-<s2>'`` using the
    numbering of the constants module, for instance:
    7-3e7-0 means type 7-subtype 3 goes to type 7-subtype 0.
    It turns out that there are only 133 pairs of possible events with after links in the whole corpus.

    A pair whose event details cannot be retrieved is logged and skipped
    instead of being counted with values left over from an earlier event.

    :return: tuple ``(counts, probabilities)``: a ``Counter`` of the encoded
        pairs and a dict mapping each encoded pair to its relative frequency.
    """
    onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]
    allGoodEventPairs = []
    for file in onlyfiles:
        fileNoExtension, extension = os.path.splitext(file)
        logging.debug("This is the file I am processing: %s", fileNoExtension)
        # NOTE(review): any file with a non-empty extension is processed here,
        # not only '.ann' files -- confirm this is intended.
        if not extension:
            continue
        script = event.Script(fileNoExtension)
        eventsTagsPairList = getListOfExtraAfterLinks(script)
        logging.debug("This is the list of unordered events: %s", eventsTagsPairList)
        orderedEventsLists = createEventsClusters(eventsTagsPairList)
        logging.debug("This is the list of ordered events: %s", orderedEventsLists)
        for oel in orderedEventsLists:
            for ev in oel[0]:
                eventPairString1 = _encodeEventTypeSubtype(script, ev[0])
                if eventPairString1 is None:
                    continue
                eventPairString2 = _encodeEventTypeSubtype(script, ev[1])
                if eventPairString2 is None:
                    continue
                allGoodEventPairs.append(eventPairString1 + 'e' + eventPairString2)
    counts = Counter(allGoodEventPairs)
    numberOfEvents = float(len(allGoodEventPairs))
    # With no pairs at all `counts` is empty, so no division takes place.
    probabilities = {key: count / numberOfEvents for key, count in counts.items()}
    return counts, probabilities
Esempio n. 3
0
def computeOrderedSequenceList():
    """
    Cluster the after links of the files in ``mypath`` into ordered event lists.

    Every file with a non-empty extension is loaded as a script, its after
    links are clustered and the result is logged.

    NOTE(review): only the clusters of the last processed file are returned,
    not an accumulation over all files -- confirm this is what callers expect.

    :return: the ordered events lists of the last processed file, or an empty
        list when no file was processed.
    """
    # Bound up front so an empty directory returns [] instead of raising
    # UnboundLocalError at the return statement.
    orderedEventsLists = []
    onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]
    for file in onlyfiles:
        fileNoExtension, extension = os.path.splitext(file)
        logging.debug("This is the file I am processing: %s", fileNoExtension)
        if not extension:
            continue
        script = event.Script(fileNoExtension)
        eventsTagsPairList = getListOfAfterLinks(script)
        logging.debug("This is the list of unordered events: %s", eventsTagsPairList)
        orderedEventsLists = createEventsClusters(eventsTagsPairList)
        logging.debug("This is the list of ordered events: %s", orderedEventsLists)
    return orderedEventsLists
Esempio n. 4
0
def getCoreferenceLinks():
    """
    Log both event tags of every after-link pair found in the files of ``mypath``.

    Each file with a non-empty extension is loaded as a script; its after
    links and coreference clusters are fetched, and the two tags of every
    after-link pair are logged at INFO level together with a running pair
    number that keeps counting across files.

    :return: always 1.
    """
    pairNumber = 0
    candidates = [name for name in listdir(mypath) if isfile(join(mypath, name))]
    for name in candidates:
        baseName, suffix = os.path.splitext(name)
        logging.debug("This is the file I am processing: %s" % baseName)
        if not suffix:
            continue
        script = event.Script(baseName)
        # <type 'list'>: [[[['E225', 'E459']], 0], [[['E446', 'E407']], 0]]
        afterLinks = getListOfAfterLinks(script)
        # The clusters are fetched but not used further in this function.
        coreferenceLinks = script.getListOfCoreferenceClusters()
        logging.debug("This is the list of unordered events: %s", afterLinks)
        for link in afterLinks:
            pairNumber += 1
            logging.info("Event1: %d %s", pairNumber, link[0][0][0])
            logging.info("Event2: %d %s", pairNumber, link[0][0][1])
    return 1
Esempio n. 5
0
 def checkLinksConsistency(self):
     """
     Check whether the events of each cluster all share one position.

     The script matching this article is loaded, every event that has after
     links is mapped to the position returned by
     ``self.getTreePositionFromEvent``, and the positions of all events in
     each cluster of ``script.eventsClusters`` are collected.

     :return: 1 if at least one cluster contains more than one distinct
         position, otherwise 0.
     """
     scriptName = self.articleName.replace(self.path, "").replace(".txt", "")
     script = ev.Script(scriptName)
     linkedEvents = script.getListOfEventsWithAfterLinks()
     logging.debug("This is the list of events: %s", linkedEvents)

     # Map every event tag to its position in the article.
     positionByTag = {}
     for linkedEvent in linkedEvents:
         tag = linkedEvent.eventTag
         logging.debug("This is the start and stop: %s",
                       linkedEvent.textStartStop)
         treePosition = self.getTreePositionFromEvent(linkedEvent)
         logging.debug("The sentence is at this position: %s\n\n\n",
                       treePosition)
         positionByTag[tag] = treePosition

     clusters = script.eventsClusters
     logging.debug("This is the positions of the events: %s",
                   positionByTag)
     logging.debug("These are the events clusters: %s", clusters)

     # One flat list of positions per cluster, two entries per event pair.
     positionsPerCluster = [
         [positionByTag[tag]
          for pair in cluster[0]
          for tag in (pair[0], pair[1])]
         for cluster in clusters
     ]
     logging.debug("These are the position lists: %s", positionsPerCluster)

     inconsistent = 0
     for positions in positionsPerCluster:
         if len(set(positions)) > 1:
             inconsistent = 1
             logging.debug("I have found an exception, ")
     return inconsistent
Esempio n. 6
0
 def __init__(self, articleName, path):
     """
     Load an article file together with its annotation script.

     :param articleName: file name of the article, without directory.
     :param path: directory prefix that is concatenated with ``articleName``.

     The article is parsed with ``etree.parse``; for articles of type
     "story" the headline and the sentences are extracted as well. The raw
     file content is additionally stored in ``self.fileAsCharactersList``
     with every run of non-ASCII characters replaced by a single space.
     """
     fullPath = path + articleName
     self.path = path
     self.articleNameNoPath = articleName
     self.articleName = fullPath
     logging.debug(self.articleName)

     self.script = ev.Script(articleName.replace(".txt", ""))
     # All the event objects contained in the script.
     self.articleEventsList = self.script.eventsList

     self.articleTree = etree.parse(fullPath)
     self.articleType = self.getArticleType()
     if self.articleType != "story":
         self.docRoot = self.articleTree.getroot()
         #self.posts = self.getPosts()
     else:
         self.docRoot = self.articleTree.xpath("//DOC[@type='story']")
         self.headline = self.getStoryHeadline()
         self.sentences = self.getStorySentences()

     logging.debug("This is the file name: %s", articleName)
     with open(fullPath) as articleFile:
         # Re-joining the split lines uses '\n' between lines and drops a
         # trailing newline.
         rawText = '\n'.join(articleFile.read().splitlines())
     self.fileAsCharactersList = re.sub(r'[^\x00-\x7F]+', ' ', rawText)
Esempio n. 7
0
def main():
    """
    Post-process a parser output file, filtering its ``@After`` lines.

    The input file ``'parseroutput-' + const.ANNDIR[0:-1] + '.tbf'`` is read
    line by line and copied to
    ``'postprocessoutput' + const.ANNDIR[0:-1] + '.tbf'``. Every line is
    copied unchanged except ``@After`` lines, which are written only when:

    * the key built by ``createEventsPairScript`` for the two events is
      present in the counts returned by ``stats.computePairsProbabilities``,
    * the same pair of event ids (in either order) has not already been
      written for the current document, and
    * the pair was not flagged to be skipped by the coreference-cluster
      checks below.
    """
    # Input name: const.ANNDIR with its last character dropped.
    parserName = 'parseroutput-'+const.ANNDIR[0:-1]+'.tbf'
    with open(parserName, 'r') as po:
        lines = po.readlines()
    lines = [l.rstrip() for l in lines]
    probs = stats.computePairsProbabilities()
    # Only the first element of the result is used, for membership tests.
    countersProbs = probs[0]
    documents = {}
    first = True
    parserNameOut = 'postprocessoutput'+const.ANNDIR[0:-1]+'.tbf'
    with open(parserNameOut, 'w') as ppo:
        for line in lines:
            # Classify the line: document header, coreference line, after line.
            m = re.match('^#BeginOfDocument (.*)$', line)
            c = re.match('^@Coreference\tR[0-9]+\t(.*)$', line)
            a = re.match('^@After\tR[0-9]+\t(E[0-9]+),(E[0-9]+)$', line)
            if m:
                ppo.write("%s\n" % line)
                if documents == {} and first == True:
                    # First document of the file: nothing to store yet.
                    key = m.group(1)
                    script = ev.Script(key)
                    first = False
                    # NOTE: `list` shadows the builtin for the rest of main().
                    list = []
                    cList = []
                    caList = []
                else:
                    # A new document starts: store the after pairs written for
                    # the previous one, then reset the per-document state.
                    # In the visible code the last document's pairs are never
                    # stored in `documents`.
                    documents[key] = list
                    key = m.group(1)
                    script = ev.Script(key)
                    list = []
                    cList = []
                    caList = []
            elif c:
                # Remember the members of this coreference cluster and copy
                # the line through.
                clusterGroup = c.group(1)
                clusterGroup = clusterGroup.split(',')
                cList.append(clusterGroup)
                ppo.write("%s\n" % line)
            elif a:
                skipPair = False
                event1 = script.getEventByEventId(a.group(1))
                event2 = script.getEventByEventId(a.group(2))
                eventsPairScript = createEventsPairScript(event1,event2)
                # presumably getClusterGroup returns -1 when the event is in
                # none of the clusters seen so far -- TODO confirm
                event1cg = getClusterGroup(cList, a.group(1))
                event2cg = getClusterGroup(cList, a.group(2))

                logging.debug("These are the cluster groups: %s %s", event1cg, event2cg)
                # Case 1: both events have a cluster group. Skip the pair when
                # the groups are equal or this pair of groups was already
                # recorded; otherwise record it in both orders.
                if event1cg != -1 and event2cg != -1:
                    logging.debug("Found a pair: %s %s, for the document %s", event1cg, event2cg, key)
                    logging.debug("This is the current list: %s", caList)
                    if(event1cg == event2cg):
                        skipPair = True
                    elif ((event1cg, event2cg) in caList):
                        skipPair = True
                    else:
                        skipPair = False
                        caList.append((event1cg, event2cg))
                        caList.append((event2cg, event1cg))

                # Case 2: exactly one of the events has a cluster group. The
                # event id itself stands in for the missing group, then the
                # same "already recorded" check is applied.
                if (event1cg == -1) ^ (event2cg == -1):
                    logging.debug("Found a pair: %s %s, for the document %s", event1cg, event2cg, key)
                    logging.debug("This is the current list: %s", caList)
                    if(event1cg == -1):
                        event1cg = a.group(1)
                    else:
                        event2cg = a.group(2)
                    if ((event1cg, event2cg) in caList):
                        skipPair = True
                    else:
                        skipPair = False
                        caList.append((event1cg, event2cg))
                        caList.append((event2cg, event1cg))

                # Write the line only for pair keys present in the counts, and
                # only once per unordered pair of event ids in this document.
                if eventsPairScript in countersProbs:
                    if ((a.group(1), a.group(2)) not in list and (a.group(2), a.group(1)) not in list and skipPair == False):
                        ppo.write("%s\n" % line)
                        list.append((a.group(1), a.group(2)))
            else:
                # Any other line is copied through unchanged.
                ppo.write("%s\n" % line)