def parseInReStyle_LB(rawParties): caseName = '' #TESTING #print("IN RE***************************************\nOriginal:", rawParties) #First, make iniital cut (only words that will NOT appear in case name) for word in inReFirstCut: rawParties = rawParties.replace(word, '') caseName = rawParties.strip() #Second, make second word cut (for cut down to party list) for word in inReSecondCut: rawParties = rawParties.replace(word, '') #Third, minor formatting rawParties = rawParties.strip() caseName = caseName.strip() if rawParties.endswith(' .'): rawParties = rawParties[:-2] if rawParties.endswith(','): rawParties = rawParties[:-1] if caseName.endswith(', .'): caseName = caseName[:-3] #Fourth, strip starting and ending periods rawParties = stripPeriods(rawParties) caseName = stripPeriods(caseName) #TESTING #print('\nCase Name:', caseName) #print("\nParty List:", rawParties) return rawParties, caseName
def parseInReStyle_text(rawParties): caseName = '' #TESTING #print("IN RE***************************************\nOriginal:", rawParties) #First, make iniital cut (only words that will NOT appear in case name) caseName = cutWords2(inReFirstCut, rawParties) caseName = removeDates(caseName) #Second, make second word cut (for cut down to party list) rawParties = cutWords2(inReSecondCut, caseName) #Third, minor formatting rawParties = rawParties.strip() caseName = caseName.strip() if rawParties.endswith(' .'): rawParties = rawParties[:-2] if rawParties.endswith(','): rawParties = rawParties[:-1] if caseName.endswith(', .'): caseName = caseName[:-3] #Fourth, strip starting and ending periods rawParties = stripPeriods(rawParties) caseName = stripPeriods(caseName) return rawParties, caseName
def parseVstyle_LB(rawParties): #TESTING #print("-----------------------------------------------\nOriginal:", rawParties) #First, change 'v.'s to carrots and deal with breaks rawParties = rawParties.replace('<br>v.<br>', ', ^') rawParties = rawParties.replace('<br>\nv.<br>', ', ^') rawParties = rawParties.replace(', and<br>\n', ', ') rawParties = rawParties.replace('<br>', ', ') # NEW ADDITION!!!!!!!!!!!!!!!!! # NEW ADDITION!!!!!!!!!!!!!!!!! rawParties = rawParties.replace( ' v. ', ', ') #MUST be case sensitive search and replace #Second, deal with special "and" party names, e.g., 'oil and gas company' rawParties = addAmpersands(rawParties) #Third, remove unwanted text for word in partyWordsToRemove: partyRegex = re.compile(re.escape(word), re.IGNORECASE) rawParties = partyRegex.sub('', rawParties) #Fourth, fix double/triple commas (they could f**k with next step) rawParties = rawParties.replace(',,,', ',') rawParties = rawParties.replace(',,', ',') #Fifth, replace 'and' with commas to separate parties rawParties = rawParties.replace(', and ', ', ') rawParties = rawParties.replace(' and ', ', ') rawParties = rawParties.replace('; And ', ', ') #Sixth, replace commas with semi-colons for final separation of parties tempPartyList = list(rawParties) for i in range(0, len(rawParties) - 2): if (rawParties[i] == ',') and (rawParties[i + 1] == ' '): nextWord = '' for j in range(i + 2, len(rawParties)): if rawParties[j] == ' ': break if rawParties[j] == '<br>': break nextWord += rawParties[j] if nextWord in DNPlist: continue tempPartyList[i] = ';' #Convert tempPartyList into string rawParties = '' for char in tempPartyList: rawParties += char #Seventh, formatting and minor fixes rawParties = rawParties.replace('; ; ', '; ') rawParties = rawParties.replace('; ; ', '; ') rawParties = rawParties.replace(' ;', ';') rawParties = rawParties.replace(',^', '; ^') rawParties = rawParties.replace('&', '&') rawParties = rawParties.replace(' ', ' ') #Seventh, if applicable, put the first party after the 'v.' in the second #position (so the two parties used for the name are easily available) tempPartyList = rawParties.split(';') partyCounter = 0 afterVpos = -1 for party in tempPartyList: if party.lstrip().startswith('^'): afterVpos = partyCounter break partyCounter += 1 if (len(tempPartyList) > 1) and (afterVpos > 1): temp = tempPartyList[afterVpos] del tempPartyList[afterVpos] tempPartyList.insert(1, temp) #Convert tempPartyList into string rawParties = '' for element in tempPartyList: rawParties += element.strip() + ';' #Eighth, final formatting rawParties = rawParties.replace(';;', ';') rawParties = rawParties.replace(',;', ';') rawParties = rawParties.replace(';.;', ';') rawParties = rawParties.replace(';,;', ';') rawParties = rawParties.replace('^', '') if rawParties.endswith(';'): rawParties = rawParties[:-1] #Ninth, generate the caseName from the partyList string caseName = getCaseNameFromPartyList_LB(rawParties) #Tenth, strip starting and ending periods rawParties = stripPeriods(rawParties) caseName = stripPeriods(caseName) #TESTING #print("\nCase Name:", caseName) #print("\nParty List:", rawParties) return rawParties, caseName
def parseInterestStyle_LB(rawParties): caseName = '' # First, make first cut of words (for caseName) rawParties = cutWords(interestFirstCut, rawParties) #Second, delete some HTML startLoc = rawParties.find('<sup>') endLoc = rawParties.find('</sup>') if (startLoc > -1) and (endLoc > -1): rawParties = rawParties[:startLoc] + rawParties[endLoc + 6:] #Third, formatting for caseName rawParties = rawParties.replace('&', '&') if '<br>\n' in rawParties: caseName = rawParties[:rawParties.find('<br>\n')] else: caseName = rawParties #Fourth, some additional formatting rawParties = rawParties.replace('<br>\n', ';') rawParties = rawParties.replace(';;', ';') #Fifth, make second cut of words (for party list) rawParties = cutWords(interestSecondCut, rawParties) #Sixth, additional formatting rawParties = rawParties.strip() if rawParties.endswith(';'): rawParties = rawParties[:-1] if rawParties.endswith(', .'): rawParties = rawParties[:-3] if rawParties.endswith(', .'): rawParties = rawParties[:-3] caseName = caseName.strip() if caseName.endswith(';'): caseName = caseName[:-1] if caseName.endswith(', '): caseName = caseName[:-2] if caseName.endswith(', .'): caseName = caseName[:-3] caseName = stripPeriods(caseName) #Seventh, deal with special "and" party names, e.g., 'oil and gas company' rawParties = addAmpersands(rawParties) #Eighth, replace 'and' with commas to separate parties (also fix double and triple commas) rawParties = rawParties.replace(',,,', ',') rawParties = rawParties.replace(',,', ',') rawParties = rawParties.replace(', and ', ', ') rawParties = rawParties.replace(' and ', ', ') rawParties = rawParties.replace('; And ', ', ') #Ninth, replace commas with semi-colons for final separation of parties tempPartyList = list(rawParties) for i in range(0, len(rawParties) - 2): if (rawParties[i] == ',') and (rawParties[i + 1] == ' '): nextWord = '' for j in range(i + 2, len(rawParties)): if rawParties[j] == ' ': break if rawParties[j] == '<br>': break nextWord += rawParties[j] if nextWord in DNPlist: continue tempPartyList[i] = ';' #Convert tempPartyList into string rawParties = '' for char in tempPartyList: rawParties += char #Tenth, strip periods rawParties = stripPeriods(rawParties) #Eleventh: strip extra leading and trailing space tempPartyList = rawParties.split(';') rawParties = '' for party in tempPartyList: rawParties += party.strip() + ';' if rawParties.endswith(';'): rawParties = rawParties[:-1] #Twelvth: some more formatting rawParties = rawParties.replace(';;', ';') rawParties = rawParties.replace(';.;', ';') #Thirteenth: eliminate duplicates #de-dupe! #See 6, 7, parties = rawParties return parties, caseName
def parseVstyle_text(rawParties): #First, change 'v.'s to carrots and deal with breaks rawParties = rawParties.replace('<br>v.<br>', ', ^') rawParties = rawParties.replace(' v. ', ', ^') rawParties = rawParties.replace('<br>\nv.<br>', ', ^') rawParties = rawParties.replace(', and<br>\n', ', ') rawParties = rawParties.replace('<br>', ', ') rawParties = rawParties.replace( ' v. ', ', ') #MUST be case sensitive search and replace #Second, deal with special "and" party names, e.g., 'oil and gas company' rawParties = addAmpersands(rawParties) #Third, remove unwanted text rawParties = re.sub( r'appeal of\:', ',', rawParties, flags=re.IGNORECASE ) # Need to do this before next step to maintain proper commas between parties for word in partyWordsToRemove: partyRegex = re.compile(re.escape(word), re.IGNORECASE) rawParties = partyRegex.sub('', rawParties) rawParties = re.sub( r"NO\.\s\d\d\-\d\d\d", ',', rawParties ) # Remove case numbers from PA Superior Court cases (see, e.g., /home/dan/Data/CourtListener/Processed/Pennsylvania/Superior/withCitations/4281696.html) rawParties = re.sub(r"_{3,}", " ", rawParties) # Remove lines made of underscores # Remove workers compensation appeal board parentheticals (see Pennsylvania/Commonwealth/withCitations/4032662.html) rawParties = PAworkersComp(rawParties) #Fourth, fix double/triple commas (they could f**k with next step) rawParties = rawParties.replace(',,,', ',') rawParties = rawParties.replace(',,', ',') #Fifth, replace 'and' with commas to separate parties rawParties = rawParties.replace(', and ', ', ') rawParties = rawParties.replace(' and ', ', ') rawParties = rawParties.replace('; And ', ', ') rawParties = rawParties.replace(' AND ', ', ') #Sixth, replace commas with semi-colons for final separation of parties tempPartyList = list(rawParties) for i in range(0, len(rawParties) - 2): if (rawParties[i] == ',') and (rawParties[i + 1] == ' '): nextWord = '' for j in range(i + 2, len(rawParties)): if rawParties[j] == ' ': break if rawParties[j] == '<br>': break nextWord += rawParties[j] if nextWord in DNPlist: continue tempPartyList[i] = ';' #Convert tempPartyList into string rawParties = '' for char in tempPartyList: rawParties += char #Seventh, formatting and minor fixes rawParties = rawParties.replace('; ; ', '; ') rawParties = rawParties.replace('; ; ', '; ') rawParties = rawParties.replace(' ;', ';') rawParties = rawParties.replace(',^', '; ^') rawParties = rawParties.replace('&', '&') rawParties = rawParties.replace(' ', ' ') #Seventh, if applicable, put the first party after the 'v.' in the second #position (so the two parties used for the name are easily available) tempPartyList = rawParties.split(';') partyCounter = 0 afterVpos = -1 for party in tempPartyList: if party.lstrip().startswith('^'): afterVpos = partyCounter break partyCounter += 1 if (len(tempPartyList) > 1) and (afterVpos > 1): temp = tempPartyList[afterVpos] del tempPartyList[afterVpos] tempPartyList.insert(1, temp) #Convert tempPartyList into string rawParties = '' for element in tempPartyList: rawParties += element.strip() + ';' #Eighth, final formatting rawParties = rawParties.replace(';;;', ';') rawParties = rawParties.replace(';;', ';') rawParties = rawParties.replace(',;', ';') rawParties = rawParties.replace(';.;', ';') rawParties = rawParties.replace(';,;', ';') rawParties = rawParties.replace('^', '') rawParties = rawParties.replace(';/;', ';') rawParties = rawParties.replace('; ', ';') if rawParties.endswith(';'): rawParties = rawParties[:-1] if rawParties.startswith(';'): rawParties = rawParties[1:] #Ninth, generate the caseName from the partyList string caseName = getCaseNameFromPartyList_text(rawParties) #Tenth, strip starting and ending periods rawParties = stripPeriods(rawParties) caseName = stripPeriods(caseName) #Elevent, dedupe party list rawParties = deDupe(rawParties) return rawParties, caseName
def parseInterestStyle_text(rawParties): caseName = '' # First, make first cut of words (for caseName) rawParties = cutWords(interestFirstCut, rawParties) #Second, delete some HTML startLoc = rawParties.find('<sup>') endLoc = rawParties.find('</sup>') if (startLoc > -1) and (endLoc > -1): rawParties = rawParties[:startLoc] + rawParties[endLoc + 6:] #Third, formatting for caseName rawParties = rawParties.replace('&', '&') if '<br>\n' in rawParties: caseName = rawParties[:rawParties.find('<br>\n')] else: caseName = rawParties if ' appeal of:' in caseName.lower(): position = caseName.lower().find(' appeal of') caseName = caseName[:position] #Fourth, some additional formatting rawParties = rawParties.replace('<br>\n', ';') rawParties = rawParties.replace(';;', ';') #Fifth, make second cut of words (for party list) rawParties = cutWords(interestSecondCut, rawParties) #Sixth, additional formatting rawParties = rawParties.strip() if rawParties.endswith(';'): rawParties = rawParties[:-1] if rawParties.endswith(', .'): rawParties = rawParties[:-3] if rawParties.endswith(', .'): rawParties = rawParties[:-3] caseName = caseName.strip() if caseName.endswith(';'): caseName = caseName[:-1] if caseName.endswith(', '): caseName = caseName[:-2] if caseName.endswith(', .'): caseName = caseName[:-3] caseName = stripPeriods(caseName) #Seventh, deal with special "and" party names, e.g., 'oil and gas company' rawParties = addAmpersands(rawParties) #Eighth, replace 'and' with commas to separate parties (also fix double and triple commas) rawParties = rawParties.replace(',,,', ',') rawParties = rawParties.replace(',,', ',') rawParties = rawParties.replace(', and ', ', ') rawParties = rawParties.replace(' and ', ', ') rawParties = rawParties.replace('; And ', ', ') #Ninth, replace commas with semi-colons for final separation of parties rawParties = sepParties(rawParties) #Tenth, strip periods, leading space, and trailingspace rawParties = stripPeriods(rawParties) rawParties = stripSpaces(rawParties) #Eleventh: some more formatting rawParties = rawParties.replace(';;', ';') rawParties = rawParties.replace(';.;', ';') #Twelvth: de-dupe the party list for exact duplicates parties = deDupe(rawParties) return parties, caseName