Example #1
    def test_init_with_id(self):
        html = """
        <body>
          <table id='wanted'>
            <tr>
              <td>1</td>
              <td>2</td>
            </tr>
            <tr>
              <td>3</td>
              <td>4</td>
            </tr>
          </table>
          <table id='unwanted'>
            <tr>
              <td>unwanted</td>
            </tr>
          </table>
        </body>
        """
        soup = BeautifulSoup(html, 'html.parser')
        extractor = Extractor(soup, id_='wanted').parse()
        self.assertEqual(
            extractor.return_list(),
            [[u'1', u'2'], [u'3', u'4']]
        )
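Both call styles appear in these examples: here parse() is chained directly onto the constructor, while most other snippets call it as a separate statement. The chaining works because parse() evidently returns the extractor itself, so the following unchained sketch (reusing the names from the test above) is equivalent:

extractor = Extractor(soup, id_='wanted')
extractor.parse()
rows = extractor.return_list()  # [[u'1', u'2'], [u'3', u'4']], per the assertion above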
Example #2
def test_tune_parameter(f, numbers):
    """
    :param f: the category like 10th has 'A' while 'B' is for science etc. given as first letter of the seat number
    :param numbers:  the remaining 7 letters of the  seat numbers all provided in a list e.g:[3413535,1355151,3153153..]
    :return: None . This function is just used to tune the values in the main function so we can get indexes of marks
    etc as table structure may change.
    """
    goto('http://www.gseb.org/')
    for loop in numbers:
        driver.find_element_by_xpath(
            "//select[@name='drpInitChar']/option[text()='" + f +
            "']").click()
        username = driver.find_element_by_name("studentnumber")
        seat_no = loop
        username.send_keys(seat_no)
        go = driver.find_element_by_name("go")
        go.click()
        time.sleep(1)
        driver.switch_to.frame("marksheet")
        soup = BeautifulSoup(driver.page_source)
        table = soup.find("table", attrs={"class": "maintbl"})
        extractor = Extractor(table)
        extractor.parse()
        l = extractor.return_list()
        print(l)
Example #3
class TestComplexExtractor(unittest.TestCase):
    def setUp(self):
        html = """
        <table>
            <tr>
                <td rowspan=2>1</td>
                <td>2</td>
                <td>3</td>
            </tr>
            <tr>
                <td colspan=2>4</td>
            </tr>
            <tr>
                <td colspan=3>5</td>
            </tr>
        </table>
        """
        self.extractor = Extractor(html)
        self.extractor.parse()

    def test_return_list(self):
        self.assertEqual(
            self.extractor.return_list(),
            [[u'1', u'2', u'3'], [u'1', u'4', u'4'], [u'5', u'5', u'5']]
        )
Example #4
def get_gseb_results_10(numbers, output_file_path, subjects):
    goto('http://gseb.org/indexssc.html')
    df = pd.DataFrame()
    for loop in numbers:
        detail = {}
        username = driver.find_element_by_name("studentnumber")
        seat_no = loop
        detail['seat_no'] = seat_no
        username.send_keys(seat_no)
        go = driver.find_element_by_name("go")
        go.click()
        time.sleep(1)
        driver.switch_to.frame("marksheet")
        soup = BeautifulSoup(driver.page_source)
        table = soup.find("table", attrs={"class": "maintbl"})
        extractor = Extractor(table)
        extractor.parse()
        l = extractor.return_list()
        detail['Name'] = find(l[0][0], "Name: ")
        detail['result'] = find(l[1][0],
                                "Result: ").strip('School Index: 55.224')
        for i in range(len(subjects)):
            detail[subjects[i] + ' in external'] = l[i + 3][1]
            detail[subjects[i] + ' in internal'] = l[i + 3][2]
            detail[subjects[i] + ' in total'] = l[i + 3][3]
            detail[subjects[i] + ' grade'] = l[i + 3][4]
        detail['Total'] = int(l[9][1].split()[0])
        detail['Overall Grade'] = l[17][0]
        detail['Percentile Rank'] = l[17][2]
        df = df.append(detail, ignore_index=True)
        driver.switch_to_default_content()
        driver.find_element_by_name("studentnumber").clear()
    print(df)
    df.to_csv(output_file_path, index=False)
    driver.close()
Example #5
    def _fetch_air_quality_routine(self, day: datetime):
        """
        Populate the air quality of the provinces
        Fetches data from `http://www.arpab.it/aria/qa.asp`

        :param day: The day for which the air quality is to be fetched (instance of `~datetime`)
        """
        super()._fetch_air_quality_routine(day)

        res = requests.get('http://www.arpab.it/aria/qa.asp',
            params=[
                ('giorno', day.strftime('%d/%m/%Y'))
            ]
        )

        soup = BeautifulSoup(res.text, 'html.parser')
        table = soup.select_one('.tabellenav')

        if table is not None:
            extractor = Extractor(table)
            extractor.parse()
            table_data = extractor.return_list()[1:]

            for province in self.provinces:
                province_rows = [x for idx, x in enumerate(table_data) 
                                 if idx in self.province_stations[province.short_name]]
                
                for indicator, key in self.indicator_map.items():
                    values = [self.extract_float(x[key]) for x in province_rows
                              if self.extract_float(x[key]) is not None]
                    
                    if len(values) > 0:
                        setattr(province.quality, indicator, round(mean(values), 2))

        if self.on_quality_fetched is not None: self.on_quality_fetched(self)
Example #6
def get_text_data(html):
    extractor = Extractor(html)
    extractor.parse()
    table = extractor.return_list()
    del table[0]  # drop the header row
    for i in range(len(table)):
        a = table[i]
        del a[0]  # drop the first column
        del a[3]  # drop the column originally at index 4 (indexes shift after the first del)
    return table
Example #7
    def parseTable(self, table):
        self.parseHeader(table)
        # skip if header not found
        if self.headers == []:
            return []

        # parse the table, split merged cell and fill in value
        extractor = Extractor(str(table), transformer=unicode)
        extractor.parse()
        raw_data = extractor.return_list()

        # fill the empty cell by previous row value
        for i in xrange(0, len(raw_data)):
            for j in xrange(0, len(raw_data[i])):
                if raw_data[i][j].strip() == '':
                    raw_data[i][j] = raw_data[i - 1][j]

        # select rows if there time qualifier appeared
        raw_data = [
            a for a in raw_data
            if (''.join(a)).find(u'上午') != -1 or (''.join(a)).find(u'下午') != -1
        ]

        # assigning columns with header as key
        data = []
        # printJson( self.headers )
        for i in xrange(0, len(raw_data)):
            data.append({})
            raw_data[i] = self.unique(raw_data[i])  # handle colspan here
            if len(self.headers) != len(
                    raw_data[i]
            ):  # skip unformat row, likely to be the ending row
                continue
            for j in xrange(0, len(raw_data[i])):
                data[i][self.headers[j]] = raw_data[i][j]

        # mergeing cells content if the case no. is the same
        content = []
        for i in xrange(0, len(data)):
            if not (u'案件號碼' in data[i] or u'案件編號' in data[i]):
                continue
            if content == [] or \
                ( u'案件號碼' in data[i] and content[-1][u'案件號碼'].strip() != data[i][u'案件號碼'].strip() and content[-1][u'案件號碼'].strip().strip(u'─') != '' ) or \
                ( u'案件編號' in data[i] and content[-1][u'案件編號'].strip() != data[i][u'案件編號'].strip() and content[-1][u'案件編號'].strip().strip(u'─') != '' ):
                content.append(data[i])
            else:
                #print json.dumps(content[-1], ensure_ascii=False, indent=4)
                #print json.dumps(data[i], ensure_ascii=False, indent=4)
                for k, v in data[i].iteritems():
                    if content[-1][k] != data[i][k]:
                        content[-1][k] += data[i][k]

        # done
        return content
Example #8
    def parseHeader(self, table):
        if "案件編號" in str(table) or "案件號碼" in str(table):
            extractor = Extractor(str(table), transformer=unicode)
            extractor.parse()
            headerTable = extractor.return_list()
            for i in xrange(0, len(headerTable)):
                if (''.join(headerTable[i])).find(u'案件號碼') != -1 or (''.join(
                        headerTable[i])).find(u'案件編號') != -1:
                    self.headers = headerTable[i]
            self.headers = self.unique(self.headers)
            for i in xrange(0, len(self.headers)):
                self.headers[i] = re.sub('[A-Za-z]', '',
                                         self.headers[i]).strip()
Example #9
def get_gseb_results_12(f, seat_numbers, output_file_path, subjects):
    """

    :param f: The category of student in which he belongs which is metioned as first letter of seat number like='A' for std.10
    :param seat_numbers: The list of seat numbers of all stuents e.g.: [3513611,3136144,7724523]
    :param output_file_path: file address where the output result sheet is to be stored e.g. "D:/10_results/result.csv"
    :param subjects: the list of subjects e.g. ['English','Maths',...]
    :return: Nothing but csv is saved in the same folder
    """
    goto('http://www.gseb.org/')
    df = pd.DataFrame()
    temp = []
    for loop in seat_numbers:
        detail = {}
        driver.find_element_by_xpath(
            "//select[@name='drpInitChar']/option[text()='" + f +
            "']").click()
        username = driver.find_element_by_name("studentnumber")
        seat_no = loop
        detail['seat_no'] = seat_no
        username.send_keys(seat_no)
        go = driver.find_element_by_name("go")
        go.click()
        time.sleep(1)
        driver.switch_to.frame("marksheet")
        soup = BeautifulSoup(driver.page_source)
        table = soup.find("table", attrs={"class": "maintbl"})
        extractor = Extractor(table)
        extractor.parse()
        l = extractor.return_list()
        detail['Name'] = find(l[0][0], "Name: ")
        detail['result'] = find(l[2][0],
                                "Result: ").strip('School Index: 27.109')
        for i in range(len(subjects)):
            ttt = 5 - int(l[4 + i][0][-5:].find(
                re.findall(r"[A-Z]", l[4 + i][0][-5:])[0]))
            detail[subjects[i] + ' in total'] = l[4 + i][0][-5:-ttt]
            detail[subjects[i] + ' grade'] = l[4 + i][0][-ttt:]
        detail['Total'] = int(l[11][0][-3:])
        detail['Overall Grade'] = find(l[1][0], 'Grade: ')
        detail['Percentile Rank'] = find(
            l[1][0], 'Percentile: ').split()[0].strip('Grade:')
        temp.append(detail)
        driver.switch_to_default_content()
        driver.find_element_by_name("studentnumber").clear()
    df = df.append(temp, ignore_index=True)
    print(df)
    df.to_csv(output_file_path, index=False)
    driver.close()
Example #10
def get_low_currency_price_banks(currency):
    url = f'https://www.findrate.tw/{currency}/#.XRjSkVMkveR'
    result = requests.get(url)
    result.encoding = 'UTF-8'
    html = BeautifulSoup(result.text, 'html.parser')
    tables = html.findAll('table')
    extractor = Extractor(tables[1])
    extractor.parse()
    currency_table = extractor.return_list()
    columns = currency_table.pop(0)[1:]
    indices = [data.pop(0).rstrip() for data in currency_table]
    df = pandas.DataFrame(currency_table, columns=columns, index=indices)
    top8_cash_sell = df.sort_values(['現鈔賣出']).iloc[:8]
    top8_spot_sell = df.sort_values(['即期賣出']).iloc[:8]
    return (top8_cash_sell, top8_spot_sell)
Example #11
def main():
    # Parse arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('-u', nargs=1, default=None, help='URL')
    parser.add_argument('-f', nargs=1, default=None, help='HTML Content Input File')
    parser.add_argument('-o', nargs=1, default=None, help='Output File')
    parser.add_argument('-s', nargs=1, default=None, help='SKU Outline')
    parser.add_argument('-t', nargs=1, default=None, help='Which table in order of appearance to pull')

    args = parser.parse_args()

    url = args.u[0] if args.u else None
    sku = args.s[0] if args.s else None
    if url:
        xhtml = url_get_contents(url).decode('utf-8')
    else: 
        xhtml = readFile('sample.html')

    soup = BeautifulSoup(xhtml, 'lxml')
    tables = soup.find_all('table', attrs={"class":'chart'})
    cleanTables = []

    print("{} tables found".format(len(tables)))
    print("==== Table Information =====")
    for i, table in enumerate(tables):
        extractor = Extractor(table).parse()
        table = extractor.return_list()
        print("Table: {}    Rows: {}   Cols: {}".format(i, len(table), len(table[0])))
        cleanTables.append(table)

    try:
        tableIndex = int(args.t[0])
    except (TypeError, ValueError):
        tableIndex = int(input("Which table do you want to process with SKU: {}\n".format(sku)))

     
    products = processTable(cleanTables[tableIndex], sku)
    outputProductSkus(products)
    print('='*64) 
    print('{}Now without those pesky duplicates'.format(' '*16))
    print('='*64)

    products = combineWeightRanges(products)
    outputProductSkus(products)
Example #12
class TestExtractorTransformer(unittest.TestCase):
    def setUp(self):
        html = """
        <table>
            <tr>
              <td>1</td>
              <td>2</td>
            </tr>
            <tr>
              <td>3</td>
              <td>4</td>
            </tr>
        </table>
        """
        self.extractor = Extractor(html, transformer=int)
        self.extractor.parse()

    def test_config_transformer(self):
        self.assertEqual(self.extractor.return_list(), [[1, 2], [3, 4]])
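The transformer argument is passed int here and unicode in several of the scraping examples, which suggests any one-argument callable applied to each cell's text will do. A minimal illustrative sketch (the float conversion and the inline HTML are assumptions, not part of the original test):

from html_table_extractor.extractor import Extractor

html = "<table><tr><td>1</td><td>2</td></tr><tr><td>3</td><td>4</td></tr></table>"
extractor = Extractor(html, transformer=float)  # illustrative: convert every cell to float
extractor.parse()
print(extractor.return_list())  # expected: [[1.0, 2.0], [3.0, 4.0]]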
Example #13
    def data_table(s):
        """[summary]

        Args:
            s ([type]): [description]
        """
        table_doc = s.find_all('table')
        extractor = Extractor(table_doc[0])
        extractor = extractor.parse()
        tabla = extractor.return_list()
        tabla_columns = tabla[0]
        tabla_datos = tabla[1:]
        final = []
        for fila in tabla_datos:
            # strip whitespace from every cell in the row
            final.append([w.strip() for w in fila])
        df = pd.DataFrame(final, columns=tabla_columns)
        return (df)
Example #14
class TestSimpleExtractor(unittest.TestCase):
    def setUp(self):
        html = """
        <table>
            <tr>
              <td>1</td>
              <td>2</td>
            </tr>
            <tr>
              <td>3</td>
              <td>4</td>
            </tr>
        </table>
        """
        self.extractor = Extractor(html)
        self.extractor.parse()

    def test_return_list(self):
        self.assertEqual(self.extractor.return_list(),
                         [[u'1', u'2'], [u'3', u'4']])
Example #15
    def _fetch_air_quality_routine(self, day: datetime):
        """
        Populate the air quality of the provinces
        Fetches data from `http://www.arpa.veneto.it/arpavinforma/bollettini/aria/aria_dati_validati_storico.php`

        :param day: The day for which the air quality is to be fetched (instance of `~datetime`)
        """
        super()._fetch_air_quality_routine(day)

        for province in self.provinces:
            data = {
                'provincia': province.name.lower(),
                'giorno': day.strftime('%d'),
                'mese': day.strftime('%m'),
                'anno': day.strftime('%Y'),
                'Vai': 'Visualizza il bollettino'
            }

            response = requests.post(
                'http://www.arpa.veneto.it/arpavinforma/bollettini/aria/aria_dati_validati_storico.php',
                data=data)
            soup = BeautifulSoup(response.text, 'html.parser')
            table = soup.select_one('#ariadativalidati table')

            if table:
                extractor = Extractor(table)
                extractor.parse()
                table_data = extractor.return_list()[3:]

                province.quality.co = self.indicator_value(table_data, 'co')
                province.quality.so2 = self.indicator_value(table_data, 'so2')
                province.quality.no2 = self.indicator_value(table_data, 'no2')
                province.quality.o3 = self.indicator_value(table_data, 'o3')
                province.quality.pm10 = self.indicator_value(
                    table_data, 'pm10')
                province.quality.pm25 = self.indicator_value(
                    table_data, 'pm25')
                province.quality.c6h6 = self.indicator_value(
                    table_data, 'c6h6')

        if self.on_quality_fetched is not None: self.on_quality_fetched(self)
Example #16
def scores(indx):
    table = all_tables[indx]  # Grab the table at the given index
    extractor = Extractor(table)
    extractor.parse()
    sias_list = extractor.return_list()

    sias_list = sias_list[:-3]
    sias_len = len(sias_list)
    sias_names = sias_list[sias_len - 1]
    sias_names = sias_names[1:]

    num_item = len(sias_names)

    sias_final = [[0] * num_item for _ in range(num_item)]

    for i in range(sias_len - 1):
        row = sias_list[i]
        row = row[1:]
        for j in range(len(row)):
            sias_final[i][j] = row[j]

    return sias_final
Example #17
def get_tables(tables):
    res = []
    for t in tables:
        labels = t.find_all('span', attrs={'class': 'label'})
        captions = t.find_all('span', attrs={'class': 'captions'})
        legend = t.find_all('p', attrs={'class': 'legend'})
        footnotes = t.find_all('dl', attrs={'class': 'footnotes'})
        for t2 in t.find_all('table'):
            extractor = Extractor(t2)
            extractor.parse()
            content = extractor.return_list()
            tab = {
                'label': labels[0].text if labels else '',
                'caption': captions[0].text.replace(labels[0].text, '') if captions else '',
                'legend': [x.text for x in legend] if legend else '',
                'footnote': [x.text for x in footnotes] if footnotes else '',
                'content': content
            }
            res.append(tab)
    return res
Example #18
def extract_table(href: str) -> tuple:
    # from the given URL, download all tables with class .table and return them as a list, together with the district name
    def get_name(html_local):  # the only way found to get the location name
        soup_local = BeautifulSoup(
            html_local, "html.parser"
        )  # the HTML is re-parsed here rather than passing the BeautifulSoup object in
        vse = soup_local.find_all('h3')
        for item in vse:
            a = item.text.split(':')
            if 'Obec' in a[0]:
                obec = a[1].strip(' "\n"')
        return obec

    html = stahni_html(href)
    district_name = get_name(html)
    soup = BeautifulSoup(html, "html.parser")
    tables = []
    #using Extractor to get tables from the html code
    for a_elem in soup.select('.table'):
        extractor = Extractor(a_elem)
        extractor.parse()
        tables.append(extractor.return_list())
    return tables, district_name
Example #19
    def html_table_converter(self, offer_table):

        extractor = Extractor(offer_table)
        extractor.parse()
        table_list = extractor.return_list()
        return table_list
Example #20
def get_lists(soup: BeautifulSoup) -> List[List[str]]:
    extractor = Extractor(soup)
    extractor.parse()
    return extractor.return_list()
Example #21
def destination():
    error = request.args['error']
    text = request.args['text']
    number_input = request.args['number_input']
    req = requests.get('https://www.google.com/search?q='+text, headers={'User-Agent': 'Mozilla/5.0'})
    req.raise_for_status()
    soup_doc = BeautifulSoup(req.text, 'html.parser')
    linkelements = soup_doc.select('.r a')
    page = Request(("https://google.com" + linkelements[int(number_input)-1].get('href')), headers={'User-Agent': 'Mozilla/5.0'})
    page_response = urlopen(page)
    url = "https://google.com" + linkelements[int(number_input)-1].get('href')
    html_read = page_response.read()
    soup = BeautifulSoup(html_read, 'html.parser')

    if error == '0' and request.method != 'POST':
        sample = 'Press enter to search for HTML attributes. 1 to navigate back, 2 to navigate to home page'
        myobj = gTTS(text=sample, lang='en', slow=False)
        myobj.save("text.mp3")
        os.system("mpg321 text.mp3")
   
    if request.method == 'POST':

        if request.form['btn'] == 'Home':
            return redirect(url_for('index', error = 1))

        if request.form['btn'] == 'Back':
            return redirect(url_for('link_number', text=text, error=1))

        if request.form['btn'] == 'search audio':
            if error == '0':    
                sample = 'Speak html attributes like title, paragraph, bold, links, tables, etcetera.'
                myobj = gTTS(text=sample, lang='en', slow=False)
                myobj.save("text1.mp3")
                os.system("mpg321 text1.mp3")
                error = 1

            r = sr.Recognizer()
            with sr.Microphone() as source:
                print("Say something or else say 'exit' to exit!")
                r.adjust_for_ambient_noise(source, duration = 1)
                audio = r.listen(source)
            try :
                text1 = r.recognize_google(audio)
                print(text1)

                if text1 == 'title':
                    sample = 'Title of the given page is '+soup.title.string
                    myobj = gTTS(text=sample, lang='en', slow=False)
                    myobj.save("text.mp3")
                    os.system("mpg321 text.mp3")
                elif text1 == 'bold' :
                    sample = 'Bold attributes of the given page are'
                    for x in soup.find_all('b'):
                        sample = sample +", "+ x.string
                    myobj = gTTS(text=sample, lang='en', slow=False) 
                    myobj.save("welcome.mp3") 
                    os.system("mpg321 welcome.mp3")
                elif text1 == 'links' :
                    sample = 'links to other pages are'
                    for x in soup.find_all('a'):
                        sample = sample +", "+ x.string
                    myobj = gTTS(text=sample, lang='en', slow=False) 
                    myobj.save("welcome.mp3") 
                    os.system("mpg321 welcome.mp3")
                elif text1 == 'para' :
                    sample = 'paragraphs of the given page are'
                    for x in soup.find_all('p'):
                        sample = sample +", "+ x.string
                    myobj = gTTS(text=sample, lang='en', slow=False) 
                    myobj.save("welcome.mp3") 
                    os.system("mpg321 welcome.mp3")
                elif text1 == 'tables' or text1 == 'table' :
                    sample = 'Tables of the given page are : '
                    extractor = Extractor(soup)
                    extractor.parse()
                    table_list = extractor.return_list()
                    print(len(table_list))
                    for rows in range(len(table_list)):
                        if rows >0 :
                            if len(table_list[rows]) != len(table_list[0]):
                                break
                            for columns in range(len(table_list[rows])):
                                sample += table_list[0][columns]+" is "+table_list[rows][columns]+", "
                    myobj = gTTS(text=sample, lang='en', slow=False) 
                    myobj.save("welcome.mp3") 
                    os.system("mpg321 welcome.mp3")  

                else:
                    sample = 'Unable to recognize HTML attribute, press enter and speak again'
                    myobj = gTTS(text=sample, lang='en', slow=False)
                    myobj.save("text.mp3")
                    os.system("mpg321 text.mp3")
                    return redirect(url_for('destination', text=text, number_input=number_input, error=1))


            except :
                sample = 'Unable to recognize you! , press enter and speak again'
                myobj = gTTS(text=sample, lang='en', slow=False)
                myobj.save("text.mp3")
                os.system("mpg321 text.mp3")
                return redirect(url_for('destination', text=text, number_input=number_input, error=1))
    return render_template('destination.html', url=url)    
Example #22
    def _fetch_air_quality_routine(self, day: datetime):
        """
        Populate the air quality of the provinces.
        Data is fetched from https://www.arpae.it/qualita-aria/bollettino-qa/{date} where {date}
        is the date of interest in the format YYYYMMDD
        
        :param day: The day for which the air quality is to be fetched (instance of `~datetime`)
        """
        super()._fetch_air_quality_routine(day)

        date_fmt = day.strftime('%Y%m%d')
        res = requests.get(
            f'https://www.arpae.it/qualita-aria/bollettino-qa/{date_fmt}')

        if res.status_code == 200:
            soup = BeautifulSoup(res.text, 'html.parser')
            table_rows = '\n'.join(
                [str(x) for x in soup.select('.tabella table tbody tr')])
            big_table = f'<table>{table_rows}</table>'
            extractor = Extractor(big_table)
            extractor.parse()
            table = extractor.return_list()

            for province in self.provinces:
                province_rows = [
                    x for x in table if x[0] == province.short_name
                ]

                so2 = [
                    self.extract_float(x[9]) for x in province_rows
                    if self.extract_float(x[9]) is not None
                ]
                no2 = [
                    self.extract_float(x[4]) for x in province_rows
                    if self.extract_float(x[4]) is not None
                ]
                co = [
                    self.extract_float(x[8]) for x in province_rows
                    if self.extract_float(x[8]) is not None
                ]
                pm10 = [
                    self.extract_float(x[2]) for x in province_rows
                    if self.extract_float(x[2]) is not None
                ]
                pm25 = [
                    self.extract_float(x[3]) for x in province_rows
                    if self.extract_float(x[3]) is not None
                ]
                o3 = [
                    self.extract_float(x[6]) for x in province_rows
                    if self.extract_float(x[6]) is not None
                ]
                c6h6 = [
                    self.extract_float(x[7]) for x in province_rows
                    if self.extract_float(x[7]) is not None
                ]

                if len(so2) > 0: province.quality.so2 = round(mean(so2), 2)
                if len(no2) > 0: province.quality.no2 = round(mean(no2), 2)
                if len(co) > 0: province.quality.co = round(mean(co), 2)
                if len(pm10) > 0: province.quality.pm10 = round(mean(pm10), 2)
                if len(pm25) > 0: province.quality.pm25 = round(mean(pm25), 2)
                if len(o3) > 0: province.quality.o3 = round(mean(o3), 2)
                if len(c6h6) > 0: province.quality.c6h6 = round(mean(c6h6), 2)

        if self.on_quality_fetched is not None: self.on_quality_fetched(self)
Example #23
def read_shifts_from_html_pages(rawtoi1, rawtoi2, teamid1, teamid2, season,
                                game):
    """
    Aggregates information from two html pages given into a dataframe with one row per second and one col per player.

    :param rawtoi1: str, html page of shift log for team id1
    :param rawtoi2: str, html page of shift log for teamid2
    :param teamid1: int, team id corresponding to rawtoi1
    :param teamid2: int, team id corresponding to rawtoi1
    :param season: int, the season
    :param game: int, the game

    :return: dataframe
    """

    from html_table_extractor.extractor import Extractor
    dflst = []
    for rawtoi, teamid in zip((rawtoi1, rawtoi2), (teamid1, teamid2)):
        extractor = Extractor(rawtoi)
        extractor.parse()
        tables = extractor.return_list()

        ids = []
        periods = []
        starts = []
        ends = []
        durationtime = []
        teams = []
        i = 0
        while i < len(tables):
            # A convenient artefact of this package: search for [p, p, p, p, p, p, p, p]
            if len(tables[i]) == 8 and helpers.check_number_last_first_format(
                    tables[i][0]):
                pname = helpers.remove_leading_number(tables[i][0])
                pname = helpers.flip_first_last(pname)
                pid = players.player_as_id(pname)
                i += 2  # skip the header row
                while re.match(r'\d{1,2}',
                               tables[i][0]):  # First entry is shift number
                    # print(tables[i])
                    shiftnum, per, start, end, dur, ev = tables[i]
                    # print(pname, pid, shiftnum, per, start, end)
                    ids.append(pid)
                    periods.append(int(per))
                    starts.append(start[:start.index('/')].strip())
                    ends.append(end[:end.index('/')].strip())
                    durationtime.append(helpers.mmss_to_secs(dur))
                    teams.append(teamid)
                    i += 1
                i += 1
            else:
                i += 1

        startmin = [x[:x.index(':')] for x in starts]
        startsec = [x[x.index(':') + 1:] for x in starts]
        starttimes = [
            1200 * (p - 1) + 60 * int(m) + int(s) + 1
            for p, m, s in zip(periods, startmin, startsec)
        ]
        # starttimes = [0 if x == 1 else x for x in starttimes]
        endmin = [x[:x.index(':')] for x in ends]
        endsec = [x[x.index(':') + 1:] for x in ends]
        # There is an extra -1 in endtimes to avoid overlapping start/end
        endtimes = [
            1200 * (p - 1) + 60 * int(m) + int(s)
            for p, m, s in zip(periods, endmin, endsec)
        ]

        durationtime = [e - s for s, e in zip(starttimes, endtimes)]

        df = pd.DataFrame({
            'PlayerID': ids,
            'Period': periods,
            'Start': starttimes,
            'End': endtimes,
            'Team': teams,
            'Duration': durationtime
        })
        dflst.append(df)

    return _finish_toidf_manipulations(pd.concat(dflst), season, game)
Example #24
def parse_html_table(html):
    extractor = Extractor(html)
    extractor.parse()
    return extractor.return_list()
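A quick usage sketch for the helper above; the sample table is illustrative, and the expected output follows the default string behaviour shown in the simple test examples:

sample_html = """
<table>
    <tr><td>1</td><td>2</td></tr>
    <tr><td>3</td><td>4</td></tr>
</table>
"""
rows = parse_html_table(sample_html)
print(rows)  # expected: [['1', '2'], ['3', '4']]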
Example #25
    def set_indicator_value(self, day: datetime, indicator: str) -> None:
        """
        Populate the specified indicator for the provinces.
        Fetches data from `http://www.cartografiarl.regione.liguria.it/SiraQualAria/script/Pub3AccessoDatiAria.asp?Tipo=DatiGiorno`

        :param day: The day of interest
        :param indicator: The indicator of interest
        """
        if indicator not in self.indicator_map: return    
        
        data = {
            'Giorni': day.strftime('%d'),
            'Mesi': day.strftime('%m'),
            'Anni': day.strftime('%Y'),
            'TipoTema': 'SENSORI',
            'Tipo': 'DatiGiorno',
            'Anno': day.strftime('%Y'),
            'Mese': day.strftime('%m'),
            'Giorno': day.strftime('%d'),
            'DataIniz': day.strftime('%d/%m/%Y'),
            'CodTema': 'SENSORI'
        }

        res = requests.post('http://www.cartografiarl.regione.liguria.it/SiraQualAria/script/Pub3AccessoDatiAria13.asp',
                            data=data)
        
        soup = BeautifulSoup(res.text, 'html.parser')
        # a unique request id needs to be provided when a request is made; it is sent to the user in the form of a hidden field
        try:
            id_richiesta = soup.find_all('input', {'name': 'Id_Richiesta'})[0]['value']
        except:
            # data for the selected day not available
            return

        map_data = self.indicator_map[indicator]
        res = requests.get('http://www.cartografiarl.regione.liguria.it/SiraQualAria/script/Pub3AccessoDatiAria131.asp',
            params = (
                ('Anno', day.strftime('%Y')),
                ('CodParam', map_data['CodParam']),
                ('SiglaParam', map_data['SiglaParam']),
                ('Azione', 'LISTA_STAZIONI'),
                ('CodTema', 'SENSORI'),
                ('DataIniz', day.strftime('%d/%m/%Y')),
                ('Id_Richiesta', id_richiesta)
            )
        )

        t = '</TR><TR>'.join(res.text.split('</TR>'))
        soup = BeautifulSoup(t, 'html.parser')
        table = soup.select('table')[0]

        extractor = Extractor(table)
        extractor.parse()
        # remove header
        table_data = extractor.return_list()[1:]
        
        if len(table_data) > 0:
            # remove any row after the first blank row (keep everything when no blank row is present)
            table_data = table_data[:next((idx for idx, y in enumerate(table_data) if len(y) == 0), len(table_data))]

            for province in self.provinces:
                values = list()
                for x in table_data:
                    if province.short_name in x[1]:
                        try:
                            values.append(float(x[map_data['table_idx']].strip()))
                        except:
                            pass 
                
                if len(values) != 0:
                    setattr(province.quality, indicator, round(float(sum(values) / len(values)), 2))
Example #26
    def _fetch_air_quality_routine(self, day: datetime):
        """
        Populate the air quality of the provinces.
        Data is fetched from http://www.arpa.umbria.it/monitoraggi/aria/Default.aspx
        
        :param day: The day for which the air quality is to be fetched (instance of `~datetime`)
        """
        super()._fetch_air_quality_routine(day)

        date_fmt = day.strftime('%d/%m/%Y')
        data = {
            '__EVENTTARGET': 'ctl00$Content$txtData',
            '__EVENTARGUMENT': '',
            '__LASTFOCUS': '',
            '__VIEWSTATE':
            '/wEPDwUKMTUzNjEyNDUzNw9kFgJmD2QWAgIBD2QWAmYPZBYEAgsPZBYEAgEPFgIeC18hSXRlbUNvdW50AgMWBmYPZBYEAgEPDxYCHgdWaXNpYmxlaGQWAmYPFQEIMDkvMDQvMThkAgIPFQEZJm5ic3A7PC9wPg0KPHA+Jm5ic3A7PC9wPmQCAQ9kFgQCAQ9kFgJmDxUBCDA1LzA1LzE5ZAICDxUBwgFOZWxsYSBnaW9ybmF0YSBvZGllcm5hIGNpIHNvbm8gc3RhdGUgZGVsbGUgZGlmZmljb2x0JmFncmF2ZTsgdGVjbmljaGUgaW4gbWVyaXRvIGFsbGEgcHViYmxpY2F6aW9uZSBhdXRvbWF0aWNhIGRlaSBkYXRpIGRpIHNhYmF0byA0LiBMJ2luY29udmVuaWVudGUgdmVyciZhZ3JhdmU7IHJpc29sdG8gYWwgcGkmdWdyYXZlOyBwcmVzdG8uPC9wPmQCAg9kFgQCAQ9kFgJmDxUBCDE5LzAyLzE5ZAICDxUBhwM8c3Ryb25nPk1hbnV0ZW56aW9uZSBzdHJ1bWVudGF6aW9uZSAyMDE5PC9zdHJvbmc+PGJyIC8+RGFsIDE4IGZlYmJyYWlvIGFsIHByaW1vIG1hcnpvIHNvbm8gcHJldmlzdGUgbGUgb3BlcmF6aW9uaSBkaSBtYW51dGVuemlvbmUgcGVyaW9kaWNoZSAoYW5udWFsaSkgZGVsbGEgc3RydW1lbnRhemlvbmUgaW5zdGFsbGF0YSBuZWxsYSByZXRlIGRpIG1vbml0b3JhZ2dpby4gUGVyIHF1ZXN0byBtb3Rpdm8gcG90cmViYmVybyB2ZXJpZmljYXJzaSBkZWxsZSBpbnRlcnJ1emlvbmkgbmVsIHJpbGV2YW1lbnRvIGRlaSBkYXRpIHJlbGF0aXZpIGFnbGkgc3RydW1lbnRpIGluIG1hbnV0ZW56aW9uZS4mbmJzcDs8L3A+DQo8cD4mbmJzcDs8L3A+DQo8cD4mbmJzcDs8L3A+DQo8cD4mbmJzcDs8L3A+ZAIDDw8WBB4LUG9zdEJhY2tVcmwFK2FyY2hpdmlvTm90aXppZS5hc3B4P2NvZGljZVBhZ2luYT1SUk0mem9uYT0fAWdkZAIPD2QWAmYPZBYCAgEPEA8WBh4NRGF0YVRleHRGaWVsZAUETm9tZR4ORGF0YVZhbHVlRmllbGQFAklkHgtfIURhdGFCb3VuZGdkEBUPGVBlcnVnaWEgLSBQYXJjbyBDb3J0b25lc2UcUGVydWdpYSAtIFBvbnRlIFNhbiBHaW92YW5uaRRQZXJ1Z2lhIC0gRm9udGl2ZWdnZSBDaXR0w6AgZGkgQ2FzdGVsbG8gLSBDLiBDYXN0ZWxsbxpHdWJiaW8gLSBQaWF6emEgNDAgTWFydGlyaRFNYWdpb25lIC0gTWFnaW9uZRZGb2xpZ25vIC0gUG9ydGEgUm9tYW5hEFRvcmdpYW5vIC0gQnJ1ZmEZU3BvbGV0byAtIFBpYXp6YSBWaXR0b3JpYRJUZXJuaSAtIEJvcmdvIFJpdm8PVGVybmkgLSBDYXJyYXJhEVRlcm5pIC0gTGUgR3JhemllD0FtZWxpYSAtIEFtZWxpYRNOYXJuaSAtIE5hcm5pIFNjYWxvE09ydmlldG8gLSBDaWNvbmlhIDIVDwMzXzEDM18yBDNfNjkDM183AzNfMwMzXzYDM180AzNfNQUzXzIwNQM3XzEDN18yAzdfMwM3XzUDN180AzdfNhQrAw9nZ2dnZ2dnZ2dnZ2dnZ2dkZGT1g28Bzs2KuJM0nGhoW/nLrR4W/HpnjtjYCY1FCtl6eA==',
            '__VIEWSTATEGENERATOR': 'A373F38E',
            '__PREVIOUSPAGE':
            '5rDzdOLdhSojgNkWU0aySKgUcCP-WXzqaXaRNPbAb-Ekcs1vVl_yJf9liwnKWXEk15jl_Z8YIAJ86zswapmkHfDz2MMg9vQnDDQypfObingUmLuVVTMztw73FN9-55lI0',
            '__EVENTVALIDATION':
            '/wEdABshO2HSLC4Irl9HO+xCVg8wb8C3weGBaOLrENr46Y99cTPW5fmNeTa451MZa8LXyblcbg/Uqmez9yXP+xSTfXC/S9OqRU0oWDv+cbRkqcKtAqcsJFHEnZTzh0X+kVeLa7e4rr9jBld/uVqJpfp464tKRYmvyX4i1bjLFIfxIkw0G+o0YQNlnq4u76x5pwotKnDgEO4xErwMzPYvPwScdqOGIUgWeFC3y966dlr8RsY+JYzWFz2lgCufNhmaoE94Y/QiRS7TDGhtA/xOb3OYxEB522qpZQfWwl21Nv1xVarGgMm6hUuJGOA6Q4Ko1E4M+sQ9CZ53jxit2DF58lu5QFtr6x1PlqI+jgkEbNYTNUujYRbbFs2N4TjG5zEZ4xduFBkrD27kcj09V7bJX/igStyEnNJs5SuXPSKM2cTNsffB6XcH17ma9zwqai6CNsf9Og0ZPzjdX2zFoASErgXLJvie8NzsH8t7duXHZk9hbS9Vs21a/4yX1BpSDSioiW1gxr+tUHjFeS1m0yjnOD9kwBYX4jCmBywb7GNFZX8+9J5ux+74SyM4niEhJdJF38T+LG4OdFP/T/wCCiwNou/IvjveW95PGaK16TIOdZz/XYSt3Q==',
            'ctl00$Content$txtData': date_fmt,
            'ctl00$Content$Grafico1$cboStazioni': '3_1',
            'ctl00$Content$Grafico1$cboInquinante': 'SO224H'
        }
        res = requests.post(
            'http://www.arpa.umbria.it/monitoraggi/aria/Default.aspx',
            data=data)

        if res.status_code == 200:
            soup = BeautifulSoup(res.text, 'html.parser')

            html_table = soup.select_one('#ctl00_Content_TabellaDati')
            extractor = Extractor(html_table)
            extractor.parse()
            table = extractor.return_list()[2:]

            html_table = soup.select_one(
                '#ctl00_Content_TabellaDatiAltreStazioni')
            extractor = Extractor(html_table)
            extractor.parse()
            table.extend(extractor.return_list()[2:])

            for province in self.provinces:
                province_rows = [
                    x for x in table
                    if x[0].split(' - ')[0].lower() == province.name.lower()
                ]

                so2 = [
                    self.extract_float(x[1]) for x in province_rows
                    if self.extract_float(x[1]) is not None
                ]
                no2 = [
                    self.extract_float(x[3]) for x in province_rows
                    if self.extract_float(x[3]) is not None
                ]
                co = [
                    self.extract_float(x[4]) for x in province_rows
                    if self.extract_float(x[4]) is not None
                ]
                pm10 = [
                    self.extract_float(x[7]) for x in province_rows
                    if self.extract_float(x[7]) is not None
                ]
                pm25 = [
                    self.extract_float(x[9]) for x in province_rows
                    if self.extract_float(x[9]) is not None
                ]
                o3 = [
                    self.extract_float(x[5]) for x in province_rows
                    if self.extract_float(x[5]) is not None
                ]
                c6h6 = [
                    self.extract_float(x[7]) for x in province_rows
                    if self.extract_float(x[7]) is not None
                ]

                if len(so2) > 0: province.quality.so2 = round(mean(so2), 2)
                if len(no2) > 0: province.quality.no2 = round(mean(no2), 2)
                if len(co) > 0: province.quality.co = round(mean(co), 2)
                if len(pm10) > 0: province.quality.pm10 = round(mean(pm10), 2)
                if len(pm25) > 0: province.quality.pm25 = round(mean(pm25), 2)
                if len(o3) > 0: province.quality.o3 = round(mean(o3), 2)
                if len(c6h6) > 0: province.quality.c6h6 = round(mean(c6h6), 2)

        if self.on_quality_fetched is not None: self.on_quality_fetched(self)
Example #27
def sync_conference(conf_id=3):
    conf = ConferenceInterface.objects.get(pk=conf_id)

    route = "forms/1/entries"
    expires = arrow.utcnow().replace(minutes=+10).timestamp

    string_to_sign = str("{}:{}:{}:{}".format(conf.api_key, "GET", route, expires))
    sig = _calculate_signature(string_to_sign, conf.private_key)
    req = requests.get(
        conf.url, params={"api_key": conf.api_key, "signature": sig, "expires": expires}
    )

    data = json.loads(req.content)

    for entry in data.get("response", {}).get("entries", []):
        html = requests.get(
            "{}/wp-json/conference/v1/entry/{}/{}".format(
                settings.WP_URL, entry.get("form_id"), entry.get("id")
            ),
            auth=(settings.WP_BASIC_AUTH_USER, settings.WP_BASIC_AUTH_PASS),
        )

        html = html.json().get("html")

        table = BeautifulSoup(html, "html.parser").find_all(
            "table", class_="entry-products"
        )
        extractor = Extractor(table[0], transformer=unicode)
        extractor.parse()
        table_data = extractor.return_list()

        products = []
        total_amount = ""
        for item in table_data:
            product_name, amount, price, total = item

            if price == "Total":
                total_amount = total
                continue

            product_name = product_name.strip()
            amount = int(amount)
            price = float(price.replace(u"$", "").strip().replace(",", "."))

            products.append({"name": product_name, "amount": amount, "price": price})

        doc = pq(html)
        billing_html = ""
        for val in doc(".entry-view-field-name"):
            if val.text == "Billing address details":
                doc(val).parents("tr").next_all().find("a").remove()
                billing_html = doc(val).parents("tr").next_all().find("td").html()
                if entry.get("34"):
                    billing_html += "<br/>" + entry.get("34")

        billing_html = (
            billing_html.replace("<br/>", "\n").replace("<p>", "").replace("</p>", "")
        )

        if "wire" in entry.get("21"):
            payment_type = "wire"
        else:
            payment_type = "group"

        registration, is_created = ConferenceRegistration.objects.get_or_create(
            interface=conf,
            form_id=entry.get("form_id"),
            entry_id=entry.get("id"),
            defaults={
                "ticket_type": entry.get("9"),
                "payment_type": payment_type,
                "source_url": entry.get("source_url"),
                "entry_created": arrow.get(
                    entry.get("date_created").replace(" ", "T")
                ).datetime,
                "name": u"{} {}".format(entry.get("1.3"), entry.get("1.6")),
                "email": entry.get("2"),
                "organization": entry.get("6"),
                "billing_html": billing_html,
                "total_amount": total_amount,
                "products": products,
            },
        )
Example #28
def table2lists(table):
    extractor = Extractor(table)
    extractor.parse()
    return extractor.return_list()
Example #29
def sync_conference(conf_id=3):
    conf = ConferenceInterface.objects.get(pk=conf_id)

    route = "forms/1/entries"
    expires = arrow.utcnow().replace(minute=+10).timestamp

    string_to_sign = str("{}:{}:{}:{}".format(conf.api_key, "GET", route,
                                              expires))
    sig = _calculate_signature(string_to_sign, conf.private_key)
    req = requests.get(
        conf.url,
        params={
            "api_key": conf.api_key,
            "signature": sig,
            "expires": expires,
            "paging[page_size]": 20,
        },
    )

    data = json.loads(req.content)

    for entry in data.get("response", {}).get("entries", []):
        if entry["status"] == "trash":
            continue

        html = requests.get(
            "{}/wp-json/conference/v1/entry/{}/{}".format(
                settings.WP_URL, entry.get("form_id"), entry.get("id")),
            auth=(settings.WP_BASIC_AUTH_USER, settings.WP_BASIC_AUTH_PASS),
        )

        html = html.json().get("html")

        table = BeautifulSoup(html, "html.parser").find_all(
            "table", class_="entry-products"
        )
        extractor = Extractor(table[0], transformer=unicode)
        extractor.parse()
        table_data = extractor.return_list()

        products = []
        total_amount = ""
        for item in table_data:
            product_name, amount, price, total = item

            if price == "Total":
                total_amount = total
                continue

            product_name = product_name.strip()
            amount = int(amount)
            price = float(price.replace(u"$", "").strip().replace(",", "."))

            products.append({
                "name": product_name,
                "amount": amount,
                "price": price
            })

        doc = pq(html)
        billing_html = ""
        for val in doc(".entry-view-field-name"):
            if val.text == "Billing address details":
                doc(val).parents("tr").next_all().find("a").remove()
                billing_html = doc(val).parents("tr").next_all().find(
                    "td").html()
                if entry.get("34"):
                    billing_html += "<br/>" + entry.get("34")

        billing_html = (
            billing_html.replace("<br/>", "\n").replace("<p>", "").replace("</p>", "")
        )

        if "wire" in entry.get("21"):
            payment_type = "wire"
        else:
            payment_type = "group"

        if entry.get("53"):
            is_group = True
        else:
            is_group = False

        registration, is_created = ConferenceRegistration.objects.get_or_create(
            interface=conf,
            form_id=entry.get("form_id"),
            entry_id=entry.get("id"),
            defaults={
                "ticket_type": entry.get("9"),
                "payment_type": payment_type,
                "source_url": entry.get("source_url"),
                "entry_created": arrow.get(entry.get("date_created").replace(" ", "T")).datetime,
                "name": u"{} {}".format(entry.get("1.3"), entry.get("1.6")),
                "email": entry.get("2"),
                "organization": entry.get("6"),
                "billing_html": billing_html,
                "total_amount": total_amount,
                "products": products,
                "is_group": is_group,
            },
        )