def re_find_data_nopattern(self): content1 = htmlcon(self.html) Hold = content1.read_html2().decode('utf-8') if re.search(r'\<html\>', Hold, re.IGNORECASE) is None: Hold1 = Hold.split() a = " ".join(line.strip() for line in Hold1) paragraphs = re.findall(r'\d+\.\d+%|\d+%|\.\d+%', a, re.IGNORECASE) p = re.findall( r'inexcessof(\d+\.\d+%|\d+%|\.\d+%)oftheoutstanding', a, re.IGNORECASE) if len(p) == 1: paragraphs = [x for x in paragraphs if x != p[0]] return paragraphs ###if there are tags in html### else: content1 = htmlcon(self.html) Hold = content1.read_html2().decode('utf-8') Hold1 = strip_tags(Hold) Hold2 = Hold1.split() a = " ".join(line.strip() for line in Hold2) paragraphs = re.findall(r'\d+\.\d+%|\d+%|\.\d+%', a, re.IGNORECASE) p = re.findall( r'inexcessof(\d+\.\d+%|\d+%|\.\d+%)oftheoutstanding', a, re.IGNORECASE) if len(p) == 1: paragraphs = [x for x in paragraphs if x != p[0]] return paragraphs
def re_find_data(self): content1 = htmlcon(self.html) Hold = content1.read_html2().decode('utf-8') ###if there is no tag in html### if re.search(r'\<html\>', Hold, re.IGNORECASE) is None: a = "".join(line.strip() for line in Hold) paragraphs2 = re.findall(r'Item4(.*?)\(c\)', a, re.IGNORECASE) b = "".join(line.strip() for line in paragraphs2) paragraphs = re.findall(r'\d+\.\d+%|\d+%|\.\d+%', b, re.IGNORECASE) if len(paragraphs) == 0: paragraphs = re.findall(r'\d+\.\d+|\.\d+', b, re.IGNORECASE) paragraphs = [z + "%" for z in paragraphs] p = re.findall( r'inexcessof(\d+\.\d+%|\d+%|\.\d+%)oftheoutstanding', b, re.IGNORECASE) if len(p) == 1: paragraphs = [x for x in paragraphs if x != p[0]] return paragraphs ###if there are tags in html### else: content1 = htmlcon(self.html) Hold = content1.read_html2().decode('utf-8') Hold1 = strip_tags(Hold) a = "".join(line.strip() for line in Hold1) paragraphs2 = re.findall(r'Item4(.*?)\(c\)', a, re.IGNORECASE) b = "".join(line.strip() for line in paragraphs2) paragraphs = re.findall(r'\d+\.\d+%|\d+%|\.\d+%', b, re.IGNORECASE) if len(paragraphs) == 0: paragraphs = re.findall(r'\d+\.\d+|\.\d+', b, re.IGNORECASE) paragraphs = [z + "%" for z in paragraphs] p = re.findall( r'inexcessof(\d+\.\d+%|\d+%|\.\d+%)oftheoutstanding', b, re.IGNORECASE) if len(p) == 1: paragraphs = [x for x in paragraphs if x != p[0]] return paragraphs
def content_words(self): content1 = htmlcon(self.html1) Hold = content1.read_html1().decode("utf-8") Hnew1 = strip_tags(Hold) Hnew = re.split("\n+", Hnew1) words1 = [] lines = [line.strip() for line in Hnew] for a in lines: for word in a.split(" "): words1.append(word) words1[:] = [word for word in words1 if word != ''] return words1
def re_find_data_with_percentage_character(self): content1 = htmlcon(self.html) Hold = content1.read_html2().decode('utf-8') ###if there is no tag in html### if re.search(r'\<html\>', Hold, re.IGNORECASE) is None: a = "".join(line.strip() for line in Hold) paragraphs2 = re.findall(r'Item4(.*?)\(c\)', a, re.IGNORECASE) b = "".join(line.strip() for line in paragraphs2) paragraphs = re.findall(r'(\d+\.\d+|\d+|\.\d+)percent', b, re.IGNORECASE) ###extract the number after INROW9### if len(paragraphs) == 0: paragraphs = re.findall(r'INROW9(\d+\.\d+|\d+|\.\d+)', a, re.IGNORECASE) if len(paragraphs) == 0: paragraphs = re.findall(r'INROW\(9\)(\d+\.\d+|\d+|\.\d+)', a, re.IGNORECASE) ###check as below### if len(paragraphs) != 0: for u in range(len(paragraphs)): if len(paragraphs[u]) >= 2: if paragraphs[u][-1] == "2" and paragraphs[u][ -2] == "1": paragraphs = [] break ###check whether the first number of each element is 0### ###if the last two numbers are 1 and 2### ###it means the number after row9(without %) could be linked with the following numbers### ###which can make the number not eccuate### else: for o in range(len(paragraphs)): if len(paragraphs[o]) >= 2: if paragraphs[o].strip( )[-1] == "2" and paragraphs[o].strip()[-2] == "1": paragraphs = [] break paragraphs = [z + "%" for z in paragraphs] p = re.findall( r'inexcessof(\d+\.\d+%|\d+%|\.\d+%)oftheoutstanding', b, re.IGNORECASE) if len(p) == 1: paragraphs = [x for x in paragraphs if x != p[0]] return paragraphs ###if there are tags in html### else: content1 = htmlcon(self.html) Hold = content1.read_html2().decode('utf-8') Hold1 = strip_tags(Hold) a = "".join(line.strip() for line in Hold1) paragraphs2 = re.findall(r'Item4(.*?)\(c\)', a, re.IGNORECASE) b = "".join(line.strip() for line in paragraphs2) paragraphs = re.findall(r'(\d+\.\d+|\d+|\.\d+)percent', b, re.IGNORECASE) ###extract the number after INROW9### if len(paragraphs) == 0: paragraphs = re.findall(r'INROW9(\d+\.\d+|\d+|\.\d+)', a, re.IGNORECASE) if len(paragraphs) == 0: paragraphs = re.findall(r'INROW\(9\)(\d+\.\d+|\d+|\.\d+)', a, re.IGNORECASE) ###check as below### if len(paragraphs) != 0: for u in range(len(paragraphs)): if len(paragraphs[u]) >= 2: if paragraphs[u][-1] == "2" and paragraphs[u][ -2] == "1": paragraphs = [] break ###check whether the first number of each element is 0### ###if the last two numbers are 1 and 2### ###it means the number after row9(without %) could be linked with the following numbers### ###which can make the number not eccuate### else: for o in range(len(paragraphs)): if len(paragraphs[o]) >= 2: if paragraphs[o].strip( )[-1] == "2" and paragraphs[o].strip()[-2] == "1": paragraphs = [] break paragraphs = [z + "%" for z in paragraphs] p = re.findall( r'inexcessof(\d+\.\d+%|\d+%|\.\d+%)oftheoutstanding', b, re.IGNORECASE) if len(p) == 1: paragraphs = [x for x in paragraphs if x != p[0]] return paragraphs
def check_function_find_data(self): content1 = htmlcon(self.html) Hold1 = content1.read_html2().decode('utf-8') if re.search(r'\<html\>', Hold1, re.IGNORECASE) is None: a = "".join(line.strip() for line in Hold1) ###expand the keywords range from "item4 to (c)" to "item4 to Item 5"### ###than extract the percentage between the keywords### paragraphs2 = re.findall(r'Item4(.*?)Item5', a, re.IGNORECASE) b = "".join(line.strip() for line in paragraphs2) paragraphs = re.findall(r'\d+\.\d+%|\d+%|\.\d+%', b, re.IGNORECASE) p = re.findall( r'inexcessof(\d+\.\d+%|\d+%|\.\d+%)oftheoutstanding', b, re.IGNORECASE) if len(p) == 1: paragraphs = [x for x in paragraphs if x != p[0]] ###check if there is some keywords between item4 and (c) that represent### ###percentage is the sum of all the percentages### ###the keword that I found is: in the aggregate### ###example: https://www.sec.gov/Archives/edgar/data/101984/0000910643-00-000009.txt### aggregate_per1 = re.findall(r'Item4(.*?)\(c\)', a, re.IGNORECASE) c = "".join(line.strip() for line in aggregate_per1) aggregate_per = re.findall(r'(intheaggregate)', c, re.IGNORECASE) if len(aggregate_per) != 0: ###strip % out of numbers### d = [ str(paragraphs[p]).strip('%') for p in range(len(paragraphs)) ] ###change the type in list### float_aggregate_per = [float(d[p]) for p in range(len(d))] paragraphs1 = [str(max(float_aggregate_per))] paragraphs = [z + "%" for z in paragraphs1] return paragraphs else: content1 = htmlcon(self.html) Hold = content1.read_html2().decode('utf-8') Hold1 = strip_tags(Hold) a = "".join(line.strip() for line in Hold1) paragraphs2 = re.findall(r'Item4(.*?)Item5', a, re.IGNORECASE) b = "".join(line.strip() for line in paragraphs2) paragraphs = re.findall(r'\d+\.\d+%|\d+%|\.\d+%', b, re.IGNORECASE) p = re.findall( r'inexcessof(\d+\.\d+%|\d+%|\.\d+%)oftheoutstanding', b, re.IGNORECASE) if len(p) == 1: paragraphs = [x for x in paragraphs if x != p[0]] aggregate_per1 = re.findall(r'Item4(.*?)\(c\)', a, re.IGNORECASE) c = "".join(line.strip() for line in aggregate_per1) aggregate_per = re.findall(r'(intheaggregate)', c, re.IGNORECASE) if len(aggregate_per) != 0: ###strip % out of numbers### d = [ str(paragraphs[p]).strip('%') for p in range(len(paragraphs)) ] ###change the type in list### float_aggregate_per = [float(d[p]) for p in range(len(d))] paragraphs1 = [str(max(float_aggregate_per))] paragraphs = [z + "%" for z in paragraphs1] return paragraphs