Esempio n. 1
0
    def run(self, record, entry_year):
        """Call ``entry_year`` when MS appears to be newly diagnosed in this visit.

        The rule fires only when all of the following hold:

        1. the note has a "medical diagnoses and conditions" or "Diagnosis:"
           section (searched up to ``self.diagnosesLimit`` characters after
           the heading) and that section does NOT mention MS;
        2. the note has an "Impression:" section (up to
           ``self.impressionLimit`` characters after the heading) that DOES
           mention MS;
        3. the text around that MS mention contains a "diagnos" word, using
           ``self.lowerLimit`` / ``self.upperLimit`` characters of context.

        Args:
            record: object exposing ``ruid``, ``entry_date`` and ``content``
                (``content`` is the note text that is searched).
            entry_year: year of the visit; stored as ``str(entry_year)``.

        Returns:
            The populated ``CalledRecordDiagnoseYr`` when the rule fires,
            otherwise ``False``.
        """
        calledRecord = CalledRecordDiagnoseYr(record.ruid, record.entry_date,
                                              record.content)
        calledRecord.calledRule = self.name
        noteText = record.content

        # Every fragment below is a raw string: writing '\s' or '\.' in a
        # normal literal is an invalid escape sequence (SyntaxWarning on
        # Python 3.12+). The resulting pattern text is unchanged.
        impressionWindow = r'.{0,' + str(self.impressionLimit) + r'}'
        msRegex = '|'.join(
            impressionWindow + msVariant + impressionWindow
            for msVariant in (
                r'multiple\ssclerosis',
                r'multiplesclerosis',
                r'\sMS\s',
                r':MS\s',
                # "MS." not followed by a de-identified **NAME token
                r'\sMS\.(?!\s*\*\*NAME)',
            ))

        # Look for the known-diagnoses section first: if MS is already listed
        # there, this visit is not the one in which it is being diagnosed.
        diagnosesWindow = r'.{0,' + str(self.diagnosesLimit) + r'}'
        diagnosesRegex = (r'medical\sdiagnoses\sand\sconditions' +
                          diagnosesWindow + r'|Diagnosis:' + diagnosesWindow)
        diagnosesMatch = re.search(diagnosesRegex, noteText, re.IGNORECASE)
        if not diagnosesMatch:
            return False
        if re.search(msRegex, diagnosesMatch.group(), re.IGNORECASE):
            return False

        # The Impression section must mention MS.
        impressionMatch = re.search(r'Impression:' + impressionWindow,
                                    noteText, re.IGNORECASE)
        if not impressionMatch:
            return False
        msMatch = re.search(msRegex, impressionMatch.group(), re.IGNORECASE)
        if not msMatch:
            return False

        # No negation filtering (not / can't / ruled out ...) is applied to
        # the MS mention at this point.

        # Only keep the records where a diagnosis is being discussed next to
        # the MS mention.
        diagnosRegex = (r'.{0,' + str(self.lowerLimit) + r'}diagnos.{0,' +
                        str(self.upperLimit) + r'}')
        diagnosMatch = re.search(diagnosRegex, msMatch.group(), re.IGNORECASE)
        if not diagnosMatch:
            return False

        calledRecord.calledYear = str(entry_year)
        calledRecord.calledText = msMatch.group() + '\t' + diagnosMatch.group()
        return calledRecord
Esempio n. 2
0
    def run(self, record, entry_year):
        """Call ``entry_year`` when MS appears to be newly diagnosed in this visit.

        The rule fires only when all of the following hold:

        1. the note has a "medical diagnoses and conditions" or "Diagnosis:"
           section (searched up to ``self.diagnosesLimit`` characters after
           the heading) and that section does NOT mention MS;
        2. the note has an "Impression:" section (up to
           ``self.impressionLimit`` characters after the heading) that DOES
           mention MS;
        3. the text around that MS mention contains a "diagnos" word, using
           ``self.lowerLimit`` / ``self.upperLimit`` characters of context.

        Args:
            record: object exposing ``ruid``, ``entry_date`` and ``content``
                (``content`` is the note text that is searched).
            entry_year: year of the visit; stored as ``str(entry_year)``.

        Returns:
            The populated ``CalledRecordDiagnoseYr`` when the rule fires,
            otherwise ``False``.
        """
        calledRecord = CalledRecordDiagnoseYr(record.ruid, record.entry_date, record.content)
        calledRecord.calledRule = self.name
        noteText = record.content

        # Every fragment below is a raw string: writing '\s' or '\.' in a
        # normal literal is an invalid escape sequence (SyntaxWarning on
        # Python 3.12+). The resulting pattern text is unchanged.
        impressionWindow = r'.{0,' + str(self.impressionLimit) + r'}'
        msVariants = (
            r'multiple\ssclerosis',
            r'multiplesclerosis',
            r'\sMS\s',
            r':MS\s',
            # "MS." not followed by a de-identified **NAME token
            r'\sMS\.(?!\s*\*\*NAME)',
        )
        msRegex = '|'.join(impressionWindow + msVariant + impressionWindow for msVariant in msVariants)

        # Look for the known-diagnoses section first: if MS is already listed
        # there, this visit is not the one in which it is being diagnosed.
        diagnosesWindow = r'.{0,' + str(self.diagnosesLimit) + r'}'
        diagnosesRegex = r'medical\sdiagnoses\sand\sconditions' + diagnosesWindow + r'|Diagnosis:' + diagnosesWindow
        diagnosesMatch = re.search(diagnosesRegex, noteText, re.IGNORECASE)
        if not diagnosesMatch:
            return False
        if re.search(msRegex, diagnosesMatch.group(), re.IGNORECASE):
            return False

        # The Impression section must mention MS.
        impressionMatch = re.search(r'Impression:' + impressionWindow, noteText, re.IGNORECASE)
        if not impressionMatch:
            return False
        msMatch = re.search(msRegex, impressionMatch.group(), re.IGNORECASE)
        if not msMatch:
            return False

        # No negation filtering (not / can't / ruled out ...) is applied to
        # the MS mention at this point.

        # Only keep the records where a diagnosis is being discussed next to
        # the MS mention.
        diagnosRegex = r'.{0,' + str(self.lowerLimit) + r'}diagnos.{0,' + str(self.upperLimit) + r'}'
        diagnosMatch = re.search(diagnosRegex, msMatch.group(), re.IGNORECASE)
        if not diagnosMatch:
            return False

        calledRecord.calledYear = str(entry_year)
        calledRecord.calledText = msMatch.group() + '\t' + diagnosMatch.group()
        return calledRecord
Esempio n. 3
0
    def run(self, record, entry_year):
        """Search the text around every "diagnos" mention for the MS diagnosis year.

        For each window of text around a "diagnos" word (``self.lowerLimit``
        characters before, ``self.upperLimit`` after) the window is split into
        phrases; phrases with negating/uncertain wording are skipped. Within a
        phrase the checks run in this order, and the first one that succeeds
        returns immediately:

        1. "diagnosed in <year>" in the text leading up to an MS mention;
        2. "diagnosis of MS was made in <year>" or "diagnosed with MS in
           <year>" -- these also set ``hardCall = True``;
        3. if the phrase mentions MS and the "known significant medical
           diagnoses" header, a year found via ``self.smallerBoundsMSRegex``;
        4. per sentence of an MS-mentioning phrase, "diagnosed in <year>".

        Otherwise candidate years are collected ("N years ago", "dating back
        to", "symptoms began", or a year in a sentence containing a "diagnos"
        word) and the most frequent candidate is returned.

        Args:
            record: object exposing ``ruid``, ``entry_date`` and ``content``
                (``content`` is the note text that is searched).
            entry_year: year of the visit; must support ``entry_year - int``
                because it anchors the "N years ago" wording.

        Returns:
            The populated ``CalledRecordDiagnoseYr`` when a year is called,
            otherwise ``False``.
        """
        #some general organizing stuff for later when it comes to outputting record data
        calledRecord = CalledRecordDiagnoseYr(record.ruid, record.entry_date, record.content)
        #lets me know as I check the accuracy of the algorithm that it was this rule that called the year
        calledRecord.calledRule = self.name

        # All patterns are raw strings: '\s', '\d', '\.' in a normal literal
        # are invalid escape sequences (SyntaxWarning on Python 3.12+). They
        # are built once here instead of on every loop iteration.
        lowerWindow = r'.{0,' + str(self.lowerLimit) + r'}'
        upperWindow = r'.{0,' + str(self.upperLimit) + r'}'
        # .{0,N}diagnos.{0,M} grabs up to N characters before and M after any 'diagnos' string
        contextRegex = lowerWindow + r'diagnos' + upperWindow
        # Phrase boundaries: ". ", runs of dashes, "><" and newlines.
        phraseSplitRegex = r'(?:\.\s+)|(?:-(?:\s+)?)+|(?:>\s?<)|\n'
        phraseNegRegex = r'\sno.\s|can\'t|cannot|negative|possible'
        # Text leading up to (and including) any spelling of MS.
        beforeMSRegex = '|'.join(lowerWindow + msVariant for msVariant in (
            r'multiple\ssclerosis',
            r'multiplesclerosis',
            r'\sMS\s',
            r':MS\s',
            r'\sMS\.(?!\s*\*\*NAME)',
            r':MS-.',
        ))
        msRegex = r'multiple\ssclerosis|multiplesclerosis|\sMS\s|:MS\s|\sMS\.(?!\s*\*\*NAME)|:MS-'
        diagnosedInRegex = r'diagnosed\sin\s(19|20)\d{2}'
        # Wordings specific enough to be trusted without a consensus vote.
        specialMSRegexes = (
            r'(diagnosis\sof\s)((ms|multiplesclerosis|multiple\ssclerosis|))(\swas\smade\sin\s(19|20)\d{2})',
            r'diagnosed\swith\s(multiple\ssclerosis|MS|multiplesclerosis)\sin\s(19|20)\d{2}',
        )
        knownDiagnosesRegex = r'known\ssignificant\smedical\sdiagnoses\s'
        sentenceNegRegex = r'\sdoes\snot\s|\scan\'t\s|\swill\snot\s|\scannot\s|\scan\snot\s|\swon\'t\s|\sruledout\s|\sruled\sout\s'
        yearsAgoRegex = r"(\d{1,2})\syears\sago"
        yearRegex = r"(19|20|')\d{2}"
        # A year plus up to two characters on each side, used to inspect its surroundings.
        yearContextRegex = r".{0,2}(19|20|')\d{2}.{0,2}"
        weedOutRegex = r"\[|\]|\(|\)"
        datesBackRegex = r"dat[ie][nsd][g]?\sback\sto"
        beganRegex = r"(symptoms|symptom)\sbegan"
        diagnosRegex = r"diagnos."

        def expandYear(yearText):
            #if the year is something like '94 we have to figure out if that means 1994 or 2094:
            #two digits greater than the last two digits of the current year are taken as
            #19xx, everything else as 20xx
            if "'" not in yearText:
                return yearText
            twoDigits = yearText[-2:]
            milDecider = int(str(date.today().year)[-2:])
            century = "19" if int(twoDigits) > milDecider else "20"
            return century + twoDigits

        def yearPrecedesOperative(text, yearText):
            #don't take a year after this certain phrase because it's going to be wrong. This section
            #follows the Known medical diagnoses section in many cases
            operativeAt = text.find("Operative and Invasive")
            return operativeAt == -1 or operativeAt > text.find(yearText)

        #used to find out the most common year repeated in this one record
        yearMaps = []

        #re.IGNORECASE is really important. It slows down the algorithm A LOT but it greatly increases its accuracy
        for diagnoseMatch in re.findall(contextRegex, record.content, re.IGNORECASE):
            #even though it's only around 200 characters total, splitting it up by
            #sentence/phrase is still helpful to rule out false positives.
            #This must be re.split: str.split would treat the pattern as a literal
            #separator and never split anything.
            for phrase in re.split(phraseSplitRegex, diagnoseMatch):
                #Check for negating language here. This could be done better but serves our
                #purposes for now
                if re.search(phraseNegRegex, phrase, re.IGNORECASE):
                    continue

                #if the specific "diagnosed in" appears before MS than no need for tie breaker, go with that year
                for beforeMSMatch in re.findall(beforeMSRegex, phrase, re.IGNORECASE):
                    diagnosedInMatch = re.search(diagnosedInRegex, beforeMSMatch, re.IGNORECASE)
                    if diagnosedInMatch:
                        #the last 4 characters of the match are the year
                        calledRecord.calledYear = diagnosedInMatch.group()[-4:]
                        calledRecord.calledText = beforeMSMatch
                        #we don't use hardCall here because we want this to be included with a common consensus
                        #year calculation made in identifyDiagnosisYear.py
                        return calledRecord

                #special cases of wording. We completely return out if these specific wordings are found
                for specialMSRegex in specialMSRegexes:
                    specialMSMatch = re.search(specialMSRegex, phrase, re.IGNORECASE)
                    if specialMSMatch:
                        calledRecord.calledYear = specialMSMatch.group()[-4:]
                        calledRecord.calledText = specialMSMatch.group()
                        #.hardCall = True is used in identifyDiagnosisYear.py to ignore finding a common consensus year
                        #and to just go with this year found here
                        calledRecord.hardCall = True
                        return calledRecord

                #everything below only applies to phrases that mention some variation of MS
                if not re.search(msRegex, phrase, re.IGNORECASE):
                    continue

                #if the phrase "Known Significant Medical Diagnoses and Conditions:" appears in the match
                #make sure the year is in very close proximity to MS. This means we are in the header and
                #we need to be careful that the year corresponds with the mention of MS
                if re.search(knownDiagnosesRegex, phrase, re.IGNORECASE):
                    knownMSMatch = re.search(self.smallerBoundsMSRegex, phrase, re.IGNORECASE)
                    if knownMSMatch:
                        knownText = knownMSMatch.group()
                        headerYrMatch = re.search(yearRegex, knownText)
                        if headerYrMatch and yearPrecedesOperative(knownText, headerYrMatch.group()):
                            calledRecord.calledYear = expandYear(headerYrMatch.group())
                            calledRecord.calledText = knownText
                            return calledRecord

                #this particular split helps occasionally. It's a simpler version of splitting by sentence.
                #In general, narrowing down the amount of text searched for the diagnosis year helps
                #reduce false positives
                for sentence in phrase.split('.'):
                    #if negating language is found than move on to the next sentence
                    if re.search(sentenceNegRegex, sentence, re.IGNORECASE):
                        continue

                    #Looks for diagnosed in (year) within a sentence of a phrase that also mentions MS.
                    #Immediate return, but not a hard call.
                    diagnosedInMatch = re.search(diagnosedInRegex, sentence, re.IGNORECASE)
                    if diagnosedInMatch:
                        calledRecord.calledYear = diagnosedInMatch.group()[-4:]
                        calledRecord.calledText = sentence
                        return calledRecord

                    ### Relative date wording section ###
                    #The only relative wording that was helpful is "N years ago"
                    yearsAgoMatch = re.search(yearsAgoRegex, sentence, re.IGNORECASE)
                    if yearsAgoMatch:
                        #group(1) is the number itself; splitting the match on ' ' breaks
                        #when the whitespace after the number is a tab or similar
                        yearsAgo = int(yearsAgoMatch.group(1))
                        yearMaps.append({'calledYear': str(entry_year - yearsAgo), 'calledText': sentence})

                    ### Specific year section ###
                    #Looks for specific years mentioned in the same context as MS, for each year in that sentence
                    for yearContextMatch in re.finditer(yearContextRegex, sentence, re.IGNORECASE):
                        yearContext = yearContextMatch.group()

                        #years inside [] or () in that same sentence were often not the right year
                        #because they were discussing something else
                        if re.search(weedOutRegex, yearContext, re.IGNORECASE):
                            continue

                        #if an s or an ' appears after the number, ignore it because it most likely is
                        #saying early/late in that decade(i.e. 1970s) which isn't specific enough.
                        #NOTE(review): this slice only lands on the character right after the year
                        #when the full two characters of leading context were matched.
                        if yearContext[2:-1][-1:] in ("'", "s"):
                            continue

                        #the year regex again, this time just the year without the surrounding characters
                        specificYr = re.search(yearRegex, yearContext).group()

                        #much like in the header case, a year after this phrase threw us off track
                        if not yearPrecedesOperative(sentence, specificYr):
                            continue
                        specificYr = expandYear(specificYr)

                        #"dating back to" and "symptoms began" language (doesn't find much)
                        for cueRegex in (datesBackRegex, beganRegex):
                            if re.search(cueRegex, sentence, re.IGNORECASE):
                                yearMaps.append({'calledYear': str(specificYr), 'calledText': sentence})

                        #last ditch attempt: any variation of diagnosis in the same sentence as the year.
                        #The sentence contains no '.', and specificYr is all digits by now, so a plain
                        #substring test is the same as searching for it as a pattern.
                        if re.search(diagnosRegex, sentence, re.IGNORECASE) and specificYr in sentence:
                            yearMaps.append({'calledYear': str(specificYr), 'calledText': sentence})

            #NOTE(review): this check sits inside the loop over "diagnos" windows, so the method
            #returns after the first window that produced any candidate year and later windows
            #are never examined. Behaviour kept as-is; confirm whether a vote over the whole
            #record was intended.
            if yearMaps:
                #most frequent candidate year; on a tie the earliest year wins
                #(max returns the first maximal element of the ascending list)
                candidateYears = sorted(yearMap['calledYear'] for yearMap in yearMaps)
                commonYr = max(candidateYears, key=candidateYears.count)
                calledRecord.calledYear = commonYr

                #all the text used to call this record, tab-terminated; used to determine the
                #accuracy of the algorithm since we don't have a specified set of records to train from
                calledRecord.calledText = ''.join(
                    yearMap['calledText'] + '\t'
                    for yearMap in yearMaps
                    if yearMap['calledYear'] == commonYr)

                #returning the called record is the same thing as saying that we found a diagnosis year in this record
                return calledRecord

        #when nothing else is found we return False since the record doesn't give a diagnosis year
        return False
Esempio n. 4
0
    def run(self, record, entry_year):
        #some general organizing stuff for later when it comes to outputting record data
        calledRecord = CalledRecordDiagnoseYr(record.ruid, record.entry_date,
                                              record.content)
        #lets me know as I check the accuracy of the algorithm that it was this rule that called the year
        calledRecord.calledRule = self.name

        #matches diagnosis, diagnosed, diagnos or multiple sclerosis or ms and returns the upperlimit amount of characters after and the lowerlimit of characters before
        #I use this kind of regex a lot -> .{0, (some number)}(other regex).{0, (some number)} and all it does is grabs the amount of characters in that direction so
        # .{0,100}diagnos.{0,100} would grab 100 characters before and after any 'diagnos' string
        regex = r'.{0,' + str(self.lowerLimit) + '}diagnos.{0,' + str(
            self.upperLimit) + '}'
        #re.IGNORECASE is really important. It slows down the algorithm A LOT but it greatly increases its accuracy
        diagnoseMatches = re.findall(regex, record.content, re.IGNORECASE)

        #used to find out the most common year repeated in this one record
        yearMaps = []

        for diagnoseMatch in diagnoseMatches:
            #even though it's only around 200 characters total, I found
            #that splitting it up by sentence was still helpful to rule out
            #false positives. We also split it up by new line, >, <, in addition
            #to periods.
            #splitDiagnoseMatches = diagnoseMatch.split('.')
            #I took this sentence split from Dr. Davis's script here remarks are:
            ####Split the note into sentences or phrases. ---- and >< were common things I found that displayed long lists
            ####instead of phrases and made my searching more difficult
            splitDiagnoseMatches = diagnoseMatch.split(
                '(?:\.\s+)|(?:-(?:\s+)?)+|(?:>\s?<)|\\n')
            #for each sentence
            for splitDiagnoseMatch in splitDiagnoseMatches:
                #Check for negating language here. This could be done better but serves our
                #purposes for now
                regex = r'\sno.\s|can\'t|cannot|negative|possible'
                negMatches = re.findall(regex, splitDiagnoseMatch,
                                        re.IGNORECASE)

                #if there aren't any negative language
                if (len(negMatches) == 0):
                    #Checks for wording that needs to appear before any variation of MS
                    #the amount of characters to check preceding MS is degined by lowerlimit at the
                    #start of this record
                    beforeMSRegex = r'.{0,' + str(self.lowerLimit) + '}multiple\ssclerosis|.{0,' \
                    + str(self.lowerLimit) + '}multiplesclerosis|.{0,' \
                    + str(self.lowerLimit) + '}\sMS\s' \
                    + '|.{0,' + str(self.lowerLimit) + '}:MS\s' \
                    + '|.{0,' + str(self.lowerLimit) + '}\sMS\.(?!\s*\*\*NAME)' \
                    + '|.{0,' + str(self.lowerLimit) + '}:MS-.'
                    beforeMSMatches = re.findall(beforeMSRegex,
                                                 splitDiagnoseMatch,
                                                 re.IGNORECASE)
                    #iterate through all the matches
                    for beforeMSMatch in beforeMSMatches:
                        #if the specific "diagnosed in" appears before the year than no need for tie breaker, go with that year
                        diagnosedInRegex = "diagnosed\sin\s(19|20)\d{2}"
                        #re.search is different than re.findall in that it only looks for the first match and then returns out of it
                        #in order to read that match you access it with .group()
                        diagnosedInMatch = re.search(diagnosedInRegex,
                                                     beforeMSMatch,
                                                     re.IGNORECASE)
                        if (diagnosedInMatch):
                            #this is a hard return meaning that it calls the diagnosis year right here and now
                            #gets the last 4 letters of that match which we know from the regex it's the year
                            calledRecord.calledYear = diagnosedInMatch.group(
                            )[-4:]
                            calledRecord.calledText = beforeMSMatch
                            #we don't use hardCall here because we want this to be included with a common consensus
                            #year calculation made in identifyDiagnosisYear.py
                            return calledRecord

                    #special cases of wording. We completely return out if these specific wordings are found
                    specialMSRegex = r'(diagnosis\sof\s)((ms|multiplesclerosis|multiple\ssclerosis|))(\swas\smade\sin\s(19|20)\d{2})'
                    specialMSMatch = re.search(specialMSRegex,
                                               splitDiagnoseMatch,
                                               re.IGNORECASE)
                    if (specialMSMatch):
                        #this is a hard return meaning that it calls the diagnosis year right here and now
                        #gets the last 4 letters of that match which we know from the regex it's the year
                        calledRecord.calledYear = specialMSMatch.group()[-4:]
                        calledRecord.calledText = specialMSMatch.group()
                        #.hardCall = True is used in identifyDiagnosisYear.py to ignore finding a common consensus year
                        #and to just go with this year found here
                        calledRecord.hardCall = True
                        return calledRecord

                    specialMSRegex = r'diagnosed\swith\s(multiple\ssclerosis|MS|multiplesclerosis)\sin\s(19|20)\d{2}'
                    specialMSMatch = re.search(specialMSRegex,
                                               splitDiagnoseMatch,
                                               re.IGNORECASE)
                    if (specialMSMatch):
                        #this is a hard return meaning that it calls the diagnosis year right here and now
                        calledRecord.calledYear = specialMSMatch.group()[-4:]
                        calledRecord.calledText = specialMSMatch.group()
                        #.hardCall = True is used in identifyDiagnosisYear.py to ignore finding a common consensus year
                        #and to just go with this year found here
                        calledRecord.hardCall = True
                        return calledRecord

                    #search for any variation of MS in the sentence
                    regex =  r'multiple\ssclerosis|' \
                    + 'multiplesclerosis|' \
                    + '\sMS\s' \
                    + '|:MS\s' \
                    + '|\sMS\.(?!\s*\*\*NAME)' \
                    + '|:MS-'
                    MSFound = re.search(regex, splitDiagnoseMatch,
                                        re.IGNORECASE)
                    if (MSFound):
                        #if the phrase "Known Significant Medical Diagnoses and Conditions:" appears in the match
                        #make sure the year is in very close proximity to MS. This means we are in the header and
                        #we need to be careful that the year corresponds with the mention of MS
                        knownDiagnosesRegex = r'known\ssignificant\smedical\sdiagnoses\s'
                        knownDiagnosesMatch = re.search(
                            knownDiagnosesRegex, splitDiagnoseMatch,
                            re.IGNORECASE)
                        if (knownDiagnosesMatch):
                            knownMSMatch = re.search(self.smallerBoundsMSRegex,
                                                     splitDiagnoseMatch,
                                                     re.IGNORECASE)
                            if (knownMSMatch):
                                yearRegex = "(19|20|\')\d{2}"
                                specificYrMatch = re.search(
                                    yearRegex, knownMSMatch.group())
                                if (specificYrMatch):
                                    specificYr = specificYrMatch.group()

                                    #don't take a year after this certain phrase because it's going to be wrong. This section
                                    #follows the Known medical diagnoses section in many cases
                                    if (knownMSMatch.group().find(
                                            "Operative and Invasive"
                                    ) > knownMSMatch.group().find(specificYr)
                                            or knownMSMatch.group().find(
                                                "Operative and Invasive")
                                            == -1):

                                        #if the year is something like '94 we have to figure out if that means 1994 or 2094
                                        #milDecider takes the last two digits of current year
                                        milDecider = int(
                                            str(date.today().year)[-2:])
                                        if ("'" in specificYr):
                                            #if the last two digits of the year are greater than the last two digits of the current year
                                            #then it's in the 1900's otherwise it's in the 2000's (Anything before 1916 would be classified as 2000's)
                                            if (int(specificYr[-2:]) >
                                                    milDecider):
                                                specificYr = "19" + specificYr[
                                                    -2:]
                                            else:
                                                specificYr = "20" + specificYr[
                                                    -2:]

                                        calledRecord.calledYear = specificYr
                                        calledRecord.calledText = knownMSMatch.group(
                                        )
                                        return calledRecord

                        #this particular split helps occasionally. It's a simpler version of splitting by sentence.
                        #In general, narrowing down the amount of text I searched for the diagnosis year helped
                        #reduce false positives
                        splitMSMatch = splitDiagnoseMatch.split('.')
                        for splitMatch in splitMSMatch:
                            #search for negating language
                            #look for negating language
                            negRegex = r'\sdoes\snot\s|\scan\'t\s|\swill\snot\s|\scannot\s|\scan\snot\s|\swon\'t\s|\sruledout\s|\sruled\sout\s'
                            negMatch = re.search(negRegex, splitMatch,
                                                 re.IGNORECASE)

                            #if negating language is found then move on to the next sentence
                            if (negMatch):
                                continue

                            #This is another hard return. Looks for diagnosed in (year) within that sentence that also mentions MS
                            diagnosedInRegex = "diagnosed\sin\s(19|20)\d{2}"
                            diagnosedInMatch = re.search(
                                diagnosedInRegex, splitMatch, re.IGNORECASE)
                            if (diagnosedInMatch):
                                #this is a hard return but we don't do a hard call which would automatically call the diagnosis year as this year
                                calledRecord.calledYear = diagnosedInMatch.group(
                                )[-4:]
                                calledRecord.calledText = splitMatch
                                return calledRecord

                            ### Relative date wording section ###
                            #The only relative wording that I found helpful was looking for
                            #years ago and then calling that year
                            yearsAgoRegex = r"(\d{1,2})\syears\sago"
                            newMatch = re.search(yearsAgoRegex, splitMatch,
                                                 re.IGNORECASE)
                            if (newMatch):
                                yearsAgo = int(newMatch.group().split(' ')[0])
                                yearsAgoYr = entry_year - yearsAgo
                                calledMap = {
                                    'calledYear': str(yearsAgoYr),
                                    'calledText': splitMatch
                                }
                                yearMaps.append(calledMap)

                            ### Specific year section ###
                            #This section was much more helpful than relative wording. It looks for specific years
                            #mentioned in the same context as MS. This does it for each year in that sentence
                            yearRegex = ".{0,2}(19|20|\')\d{2}.{0,2}"
                            specificYrMatches = re.finditer(
                                yearRegex, splitMatch, re.IGNORECASE)
                            for specificYrMatch in specificYrMatches:
                                #years inside [] or () in that same sentence were often not the right year
                                #because they were discussing something else
                                weedOutRegex = "\[|\]|\(|\)"
                                weedOutMatch = re.search(
                                    weedOutRegex, specificYrMatch.group(),
                                    re.IGNORECASE)
                                if (weedOutMatch):
                                    #If that was found, skip this year match
                                    continue

                                #if an s or an ' appears after the number, ignore it because it most likely is
                                #saying early/late in that decade(i.e. 1970s) which isn't specific enough

                                #gets the letter directly after the year
                                charAfterYr = specificYrMatch.group(
                                )[2:-1][-1:]
                                if (charAfterYr == '\'' or charAfterYr == 's'):
                                    continue

                                #we do the year regex again but this time just the year and not the characters before or after
                                #because at this point we know that the year is a real and specific year
                                yearRegex = "(19|20|\')\d{2}"
                                specificYr = re.search(
                                    yearRegex,
                                    specificYrMatch.group()).group()

                                #much like mentioned previously, we look for this phrase because it often
                                #threw us off track
                                if (splitMatch.find("Operative and Invasive") >
                                        splitMatch.find(specificYr)
                                        or splitMatch.find(
                                            "Operative and Invasive") == -1):
                                    #if the year is something like '94 we have to figure out if that means 1994 or 2094
                                    #milDecider takes the last two digits of current year
                                    milDecider = int(
                                        str(date.today().year)[-2:])
                                    if ("'" in specificYr):
                                        #if the last two digits of the year are greater than the last two digits of the current year
                                        #then it's in the 1900's otherwise it's in the 2000's (Anything before 1916 would be classified as 2000's)
                                        if (int(specificYr[-2:]) > milDecider):
                                            specificYr = "19" + specificYr[-2:]
                                        else:
                                            specificYr = "20" + specificYr[-2:]

                                    #search for dating back to language (doesn't find much)
                                    datesBackRegex = "dat[ie][nsd][g]?\sback\sto"
                                    dateMatch = re.search(
                                        datesBackRegex, splitMatch,
                                        re.IGNORECASE)
                                    if (dateMatch):
                                        calledMap = {
                                            'calledYear': str(specificYr),
                                            'calledText': splitMatch
                                        }
                                        yearMaps.append(calledMap)

                                    #search for symptoms began language (doesn't find much)
                                    beganRegex = "(symptoms|symptom)\sbegan"
                                    beganMatch = re.search(
                                        beganRegex, splitMatch, re.IGNORECASE)
                                    if (beganMatch):
                                        calledMap = {
                                            'calledYear': str(specificYr),
                                            'calledText': splitMatch
                                        }
                                        yearMaps.append(calledMap)

                                    #look for diagnos-ish words but ignore everything after the end of a sentence
                                    #this is a last ditch attempt to ensure we didn't miss anything by looking for any variation
                                    #of diagnosis again in the proximity of the year we found earlier
                                    splitMSMatch = splitMatch.split('.')
                                    for splitMatchDiag in splitMSMatch:
                                        diagnosRegex = "diagnos."
                                        diagnosMatch = re.search(
                                            diagnosRegex, splitMatchDiag,
                                            re.IGNORECASE)
                                        if (diagnosMatch):
                                            yearRegexCheck = re.search(
                                                specificYr, splitMatchDiag,
                                                re.IGNORECASE)
                                            if (yearRegexCheck):
                                                calledMap = {
                                                    'calledYear':
                                                    str(specificYr),
                                                    'calledText': splitMatch
                                                }
                                                yearMaps.append(calledMap)

            #if we found at least one diagnosis year in the record
            if (len(yearMaps) > 0):
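                #find out the most common year repeated in this one record
                #NOTE: the list is sorted descending and then walked with reversed(), i.e. in ascending
                #calledYear order, and a year only replaces the current pick on a strictly higher count,
                #so a tie is won by the earlier year (not the later one)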
                yearMaps = sorted(yearMaps,
                                  key=itemgetter('calledYear'),
                                  reverse=True)

                #this bit of code determines the most frequent diagnosis year used in this record
                commonYr = 0000
                count = 0
                for yearMap in reversed(yearMaps):
                    inLoopCount = 0
                    for yearMapOth in reversed(yearMaps):
                        if (yearMap['calledYear'] == yearMapOth['calledYear']):
                            inLoopCount += 1
                    if (inLoopCount > count):
                        count = inLoopCount
                        commonYr = yearMap['calledYear']

                calledRecord.calledYear = commonYr

                #used to determine the accuracy of the algorithm since we don't have a
                #specified set of records to train from
                calledText = ""
                #construct all the text used to call this record
                for yearMap in yearMaps:
                    if (yearMap['calledYear'] == commonYr):
                        calledText += yearMap['calledText']
                        calledText += '\t'
                calledRecord.calledText = calledText

                #returning the called record is the same thing as saying that we found a diagnosis year in this record
                return calledRecord

        #when nothing else is found we return False since the record doesn't give a diagnosis year
        return False
 def run(self, record, entry_year):
     calledRecord = CalledRecordDiagnoseYr(record.ruid, record.entry_date, record.content)
     calledRecord.calledRule = self.name
     record = record.content