コード例 #1
0
ファイル: renfe.py プロジェクト: helcerion/Renfe
def schedule(destination, now):
    """Fetch the Rodalies timetable from Mataró (station 79500) to *destination*.

    Queries the gencat.cat timetable search page for a one-hour window
    around *now* and scrapes the departure/arrival/trip-time cells.

    :param destination: destination station code (e.g. '71801' for Sants)
    :param now: hour the search window is based on (int or numeric string)
    :return: dict mapping result-row id to a dict with 'departureTime',
             'arrivalTime' and 'tripTime' strings
    """
    # NOTE(review): `now` is shifted back one hour, so `later` ends up equal
    # to the caller's original value — confirm this one-hour-back window is
    # intentional.
    now = int(now) - 1
    later = int(now) + 1
    day = time.strftime("%d",  time.localtime())
    month = time.strftime("%m",  time.localtime())
    year = time.strftime("%Y",  time.localtime())
    # Station codes: pl catalunya:78805, mataro:79500, sants:71801,
    # sant adria:79403 (horariDesde, horariFins, hour only)
    values = {'day': day, 'month': month, 'year': year, 'sourceCode': '79500', 'destinationCode': destination,
              'fromtime': now, 'totime': later}

    data = urlencode(values)
    url = "http://www14.gencat.cat/mobi_rodalies/AppJava/pages/horaris/ResultatCerca.htm"

    page = get_page(url, data)

    p = WebParser()
    p.feed(page)

    timetables = p.dom.get_element_by_id('timetablesTable')
    schedules = timetables.get_elements_by_tag('li')

    time_table = {}

    for schedule_item in schedules:
        item_id = schedule_item.get_id()
        departure = schedule_item.get_elements_by_class('departureTime')
        arrival = schedule_item.get_elements_by_class('arrivalTime')
        triptime = schedule_item.get_element_by_id('tripTimeText')
        # BUG FIX: the original used `or`, which raised IndexError whenever
        # only one of the two lists was non-empty; both lists are indexed
        # below, so both must be present.
        if departure and arrival:
            time_table[item_id] = {'departureTime': departure[0].get_text(), 'arrivalTime': arrival[0].get_text(),
                                   'tripTime': triptime.get_text()}
    return time_table
コード例 #2
0
ファイル: input_handler.py プロジェクト: ZaoLahma/py_devrant
 def __init__(self):
     """Set up the UI front-end, the devRant client and paging defaults."""
     # Console front-end and the HTTP backend talking to devrant.io.
     self.gui = GUI()
     self.web_parser = WebParser("www.devrant.io")
     # History of commands, replayed by the repeat/back/next shortcuts.
     self.executed_commands = []
     # Initial state and listing defaults.
     self.state = "NO_STATE"
     self.sort = "recent"
     self.page = 0
     self.limit = 20
コード例 #3
0
ファイル: renfe.py プロジェクト: helcerion/Renfe
def stations(line_id='R1'):
    """Return the stations of a Rodalies line as ``{station_id: name}``.

    Generalised from a hard-coded ``'R1'``: the line is now a parameter
    whose default preserves the original behaviour.

    :param line_id: Rodalies line identifier (e.g. 'R1')
    :return: dict mapping station id (str) to station name (str)
    """
    values = {'lineId': line_id}
    data = urlencode(values)
    url = "http://www14.gencat.cat/mobi_rodalies/AppJava/pages/linies/Detall.htm"
    page = get_page(url, data)
    p = WebParser()
    p.feed(page)
    station_list = p.dom.get_elements_by_class('RodaliesList')[0]
    station_list = station_list.get_elements_by_tag('li')
    result = {}
    for sl in station_list:
        # The element id carries a fixed 9-character prefix before the
        # numeric station id — TODO(review): confirm against live markup.
        station_id = sl.get_id()[9:]
        station_name_elem = sl.get_elements_by_class('stationName')[0]
        station_name_elem = station_name_elem.get_elements_by_tag('xml-fragment')[0]
        station_name = station_name_elem.get_text()
        result[station_id] = station_name
    return result
コード例 #4
0
    def collect_links_and_data(self, page_url):
        """Download *page_url*, parse it and return the page's URLs.

        Side effects: stores the parsed data list on ``self.data_list``
        and may globally relax HTTPS verification (see below). Exits the
        program on any error.

        :param page_url: URL to fetch and parse
        :return: list of URLs found on the page (``parser.get_page_urls()``)
        """
        # Fixes ssl issue for some mac users: fall back to an unverified
        # HTTPS context unless PYTHONHTTPSVERIFY is set. NOTE: this
        # disables certificate checking process-wide.
        if (not os.environ.get('PYTHONHTTPSVERIFY', '')
                and getattr(ssl, '_create_unverified_context', None)):
            ssl._create_default_https_context = ssl._create_unverified_context
        try:
            html_string = ""
            # BUG FIX: use the response as a context manager so the
            # underlying socket is always closed (it previously leaked).
            with urlopen(page_url) as response:
                # Only decode responses that are actually HTML.
                if "text/html" in response.getheader("Content-Type"):
                    html_bytes = response.read()
                    html_string = html_bytes.decode("utf-8")

            # Initialise the custom web parser and run it over the page.
            parser = WebParser(self.base_url)
            parser.feed(html_string)
            # Retrieve the collected (tag, data) list from the parser.
            self.data_list = parser.get_data_with_tags()
        except Exception as e:
            print("Error: " + str(e))
            print("Program will terminate")
            sys.exit()
        return parser.get_page_urls()
コード例 #5
0
    def collect_links_and_data(self, page_url):
        """Download *page_url*, parse it and return the page's URLs.

        Side effects: stores the parsed data list on ``self.data_list``.
        Exits the program on any error.

        :param page_url: URL to fetch and parse
        :return: list of URLs found on the page (``parser.get_page_urls()``)
        """
        try:
            html_string = ""
            # BUG FIX: use the response as a context manager so the
            # underlying socket is always closed (it previously leaked).
            with urlopen(page_url) as response:
                # Only decode responses that are actually HTML.
                if "text/html" in response.getheader("Content-Type"):
                    html_bytes = response.read()
                    html_string = html_bytes.decode("utf-8")

            # Initialise the custom web parser and run it over the page.
            parser = WebParser(self.base_url)
            parser.feed(html_string)
            # Retrieve the collected (tag, data) list from the parser.
            self.data_list = parser.get_data_with_tags()
        except Exception as e:
            print("Error: " + str(e))
            print("Program will terminate")
            sys.exit()
        return parser.get_page_urls()
コード例 #6
0
from fastapi import FastAPI
from fastapi import BackgroundTasks
from models import AvitoPair
from db_handler import DataBaseHandler
from web_parser import WebParser
import datetime
import asyncio

app = FastAPI()  # ASGI application instance served by FastAPI

data_base_handler = DataBaseHandler(
    path="Avito")  # handles all database operations
web_parser = WebParser()  # handles all operations with the web site scraping
# NOTE(review): calling get_event_loop() outside a running loop is
# deprecated since Python 3.10 — consider asyncio.new_event_loop().
loop = asyncio.get_event_loop()  # event loop of asyncio


async def update_timestamps(pair: AvitoPair):
    """Periodically refresh timestamp/counter data and top posts for *pair*.

    Loops forever, repeating once per hour.
    :param pair: pair of phrase and region
    """
    refresh_interval = 3600  # one hour, in seconds
    while True:
        # Record the current timestamp/counter data for the pair.
        add_timestamp(pair=pair)
        # Refresh the stored top-5 post links for the pair.
        add_top_posts(pair=pair)
        # Sleep until the next refresh cycle.
        await asyncio.sleep(refresh_interval)

コード例 #7
0
ファイル: input_handler.py プロジェクト: ZaoLahma/py_devrant
class InputHandler:
    """Translate console input into devRant API requests.

    Tracks paging state (``page``/``limit``/``sort``) and keeps a history
    of executed commands so the 'r' (repeat), 'b' (back) and 'n' (next)
    shortcuts can replay the previous request with adjusted paging.
    """

    __ADDRESS_BASE = "/api/devrant/"

    def __init__(self):
        self.gui = GUI()
        self.web_parser = WebParser("www.devrant.io")
        # Commands stored for the 'r'/'b'/'n' replay shortcuts.
        self.executed_commands = []
        # Paging/sorting defaults for listing requests.
        self.page = 0
        self.limit = 20
        self.sort = "recent"
        self.state = "NO_STATE"

    def handle_input(self):
        """Read and execute user commands until 'exit' is entered."""
        running = True
        while running:
            user_input = self.gui.show_menu("NO_STATE")
            if 'exit' == user_input:
                return
            command = self.get_command(user_input)
            if command[0]:
                # Paging shortcuts replay history and are not stored.
                if 'r' != user_input and 'b' != user_input and 'n' != user_input:
                    self.executed_commands.append(user_input)
                rants = self.web_parser.execute_command(command[1], command[2])
                if rants is not None:
                    self.gui.print_rants(rants,
                                         int((self.page / self.limit) + 1))

    def get_command(self, user_input):
        """Parse *user_input* into a ``[valid, address, params]`` triple.

        :param user_input: raw command string from the menu
        :return: list ``[bool, address or None, params dict or None]``
        """
        retval = [False, None, None]
        if 'r' == user_input:
            # Repeat: re-issue the most recent stored command unchanged.
            if len(self.executed_commands) > 0:
                return self.get_command(self.executed_commands[-1])
        if 'b' == user_input:
            # Back: step one page back, or drop the last history entry.
            if len(self.executed_commands) > 0:
                if self.page - self.limit >= 0 and self.__pageState(
                        self.state):
                    self.page -= self.limit
                elif len(self.executed_commands) > 1:
                    self.executed_commands.pop()
                return self.get_command(self.executed_commands[-1])
        if 'n' == user_input:
            # Next: advance one page and replay the last command.
            if len(self.executed_commands) > 0:
                self.page += self.limit
                return self.get_command(self.executed_commands[-1])
        if 'surprise' == user_input:
            self.__set_state("SURPRISE")
            retval[1] = InputHandler.__ADDRESS_BASE + "rants/" + user_input
            retval[2] = {'app': 3}
            retval[0] = True
        # BUG FIX: all regex patterns below are now raw strings; the
        # originals relied on '\s'/'\d' surviving as invalid escape
        # sequences, which is a DeprecationWarning (and an error in newer
        # Python versions).
        res = re.match(r'sort(\s)(algo|recent)', user_input)
        if res:
            self.sort = res.group(2)
            self.page = 0
        res = re.match(r'view(\s)(\d+)', user_input)
        if res:
            self.__set_state("VIEW")
            self.limit = int(res.group(2))
            retval[1] = InputHandler.__ADDRESS_BASE + "rants"
            retval[2] = {
                'app': 3,
                'sort': self.sort,
                'limit': self.limit,
                'skip': self.page
            }
            retval[0] = True
        else:
            # Bare 'view' falls back to the default page size.
            res = re.match(r'view', user_input)
            if res:
                self.__set_state("VIEW")
                self.limit = 20
                retval[1] = InputHandler.__ADDRESS_BASE + "rants"
                retval[2] = {
                    'app': 3,
                    'sort': self.sort,
                    'limit': self.limit,
                    'skip': self.page
                }
                retval[0] = True
        res = re.match(r'search(\s)(.*)', user_input)
        if res:
            self.__set_state("SEARCH")
            retval[1] = InputHandler.__ADDRESS_BASE + "search"
            retval[2] = {'app': 3, 'term': res.group(2), 'skip': self.page}
            retval[0] = True
        res = re.match(r'top(\s)(\d+)', user_input)
        if res:
            self.__set_state("TOP")
            self.limit = int(res.group(2))
            retval[1] = InputHandler.__ADDRESS_BASE + "rants"
            retval[2] = {
                'app': 3,
                'sort': 'top',
                'limit': self.limit,
                'skip': self.page
            }
            retval[0] = True
        res = re.match(r'get(\s)(\d+)', user_input)
        if res:
            self.__set_state("GET")
            retval[1] = InputHandler.__ADDRESS_BASE + "rants/" + res.group(2)
            retval[2] = {'app': 3}
            retval[0] = True
        return retval

    def __set_state(self, state):
        """Record *state*; reset paging when moving between paged states."""
        if self.state != state:
            if self.__pageState(state) and self.__pageState(self.state):
                self.page = 0
                self.limit = 20
        self.state = state

    def __pageState(self, state):
        """Return True for states whose results are paged."""
        if state == "VIEW" or state == "TOP" or state == "SEARCH":
            return True

        return False
コード例 #8
0
def main():
    #check if there is another instance
    pid_file = 'program.pid'
    fp = open(pid_file, 'w')
    try:
        fcntl.lockf(fp, fcntl.LOCK_EX | fcntl.LOCK_NB)
    except IOError:
        sys.exit('Another instance is running')

    # get settings from cinfig.ini.sample file
    time_format = "%a %b %d %H:%M:%S %Y"
    action_info = {}
    start = datetime.now()
    action_info['start_time'] = start.strftime(time_format)
    action_info['machine_ip'] = socket.gethostbyname(socket.gethostname())
    action_info['directory_name'] = os.getcwd()
    link = ' '
    command_line = link.join(sys.argv)
    action_info['command_line'] = command_line
    try:
        action_info['version'] = pkg_resources.get_distribution(
            'nrc_ngs_dl').version
    except:
        pass
    try:
        args = parse_input_args(sys.argv[1:])
    except:
        sys.exit('Usage: lims_downloader -c /path/to/configuation.ini')

    if not args.config_file:
        sys.exit('Usage: lims_downloader -c /path/to/configuation.ini')

    config_file = args.config_file
    if os.path.exists(config_file) == False:
        sys.exit('Error: config_file %s not exist' % config_file)
    try:
        config_setting = ConfigSetting(config_file)
    except IOError:
        sys.exit('Cannot open file:' + config_file +
                 '; Cannot get the configuration settings')

    #set up logging
    try:
        set_up_logging(config_setting.log_file_name, config_setting.log_name,
                       config_setting.log_level)
    except:
        sys.exit('Cannot locate the log file ' + config_setting.log_file_name)

    logger = logging.getLogger('nrc_ngs_dl.lims_downloader')

    if os.path.exists(config_setting.destination_folder) == False:
        logger.info('folder %s not exist, create the folder' %
                    config_setting.destination_folder)
        try:
            os.makedirs(config_setting.destination_folder)
        except:
            logger.error('Cannot create the destination folder %' %
                         config_setting.destination_folder)
            sys.exit(1)

    if os.access(config_setting.destination_folder, os.R_OK
                 or os.W_OK) == False:
        logger.error('Do not have permission to access the %s' %
                     config_setting.destination_folder)
        sys.exit(1)
    #connect to database if the database exist
    #otherwise create tables for this database
    try:
        lims_database = LimsDatabase(config_setting.db_name)
    except:
        #if lims_database is None:
        logger.error('Cannot access the database %s' % config_setting.db_name)
        sys.exit('Cannot access the database ' + config_setting.db_name)

    action_id = lims_database.insert_action_info(action_info)
    #login to LIMS webpage
    try:
        logger.info('Logging into NRC-LIMS web page ')
        web_parser = WebParser(config_setting.login_url,
                               config_setting.runlist_url,
                               config_setting.username,
                               config_setting.password)
    except:
        logger.error('Failed to log in')
        sys.exit(1)
    #get a list of all the completed sequence runs
    #information for each run : url_for_the_run, run_name, plate_name,
    #Plateform, Operator, Creation Date, Description, status
    try:
        logger.info('Getting run list')
        run_list = web_parser.get_runlist(config_setting.table_run_list,
                                          config_setting.column_run_link,
                                          config_setting.column_run_status)

    except:
        logger.error('Cannot get the list of sequence runs')
        sys.exit(1)

    mapping_file = config_setting.mapping_file_name
    if not os.path.exists(mapping_file):
        mapping_backup = open(mapping_file, 'w')
        mapping_backup.write('run_name\trun_description\n')
        mapping_backup.flush()
        mapping_backup.close()

    #for each sequence run in the list,
    #1. check if it is a new data or re-processed data
    #2. in the case of reprocessed data: remove the data and related information in the sqlite database
    #3. in the case of new/reprocessed data: download the data, insert the information of the data into database tables
    package_downloaded = 0
    number_tries = int(config_setting.number_retries) + 1
    while number_tries > 0:
        logger.info('==== number of tries: %s ' %
                    (int(config_setting.number_retries) + 2 - number_tries))
        number_tries -= 1
        retry_list = []
        for run_url in run_list:
            try:
                run_info = web_parser.get_runinfo(run_url)
            except:
                logger.warn('Cannot get run_info for run_url ( %s )' %
                            (run_url))
                retry_list.append(run_url)
                continue
            try:
                lane_list, file_list = web_parser.get_laneinfo(
                    run_url, config_setting.table_file_list,
                    config_setting.column_lane,
                    config_setting.column_file_link)
            except:
                logger.warn(
                    'Cannot get lane_list and file_list for run_name %s)' %
                    (run_info['run_name']))
                retry_list.append(run_url)
                continue

            multiple_lane = len(lane_list)
            for a_lane in lane_list:
                folder_name = run_info['run_name']
                if multiple_lane > 1:
                    folder_name = run_info['run_name'] + '_lane' + str(
                        a_lane['lane_index'])
                #if int(a_lane['http_content_length']) > 10700000000:
                #   logger.warn('Data size %s > 10GB, skip the data' % (a_lane['http_content_length']))
                #   continue
                case = lims_database.get_run_case(run_info, a_lane)
                if case == lims_database.RUN_OLD:
                    logger.info(
                        'Data already downloaded (run_name %s, lane_index %s)'
                        % (run_info['run_name'], a_lane['lane_index']))
                if case == lims_database.RUN_REPROCESSED:
                    logger.info(
                        'Deleting records in database for re-processed data (run_name %s, lane_index %s)'
                        % (run_info['run_name'], a_lane['lane_index']))
                    lims_database.delete_old_run(run_info, a_lane)

                if case == lims_database.RUN_REPROCESSED or case == lims_database.RUN_NEW:
                    logger.info(
                        'Downloading new/re-processed data (run_name %s, lane_index %s)'
                        % (run_info['run_name'], a_lane['lane_index']))
                    output_path = os.path.join(
                        config_setting.destination_folder,
                        a_lane['package_name'])
                    time_and_size = web_parser.download_zipfile(
                        a_lane['pack_data_url'], output_path)
                    if a_lane['http_content_length'] != time_and_size[2]:
                        logger.warn(
                            'Downloaded file size %s is different with the http_content_length %s'
                            %
                            (time_and_size[2], a_lane['http_content_length']))
                        os.unlink(output_path)
                        retry_list.append(run_url)
                    else:
                        sequence_run = SequenceRun(
                            a_lane, folder_name, file_list,
                            config_setting.destination_folder,
                            config_setting.folder_mode,
                            config_setting.file_mode)
                        if sequence_run.unzip_package(
                                time_and_size[2],
                                a_lane['http_content_length']):
                            sequence_run.rename_files()
                            package_downloaded += 1

                            mapping_backup = open(mapping_file, 'a')
                            a_string = run_info['run_name'] + '\t' + run_info[
                                'description'] + '\n'
                            mapping_backup.write(a_string)
                            mapping_backup.flush()
                            mapping_backup.close()

                            rowid = lims_database.insert_run_info(
                                run_info, action_id)
                            lims_database.insert_file_info(
                                rowid, sequence_run.file_info,
                                a_lane['lane_index'])
                            lims_database.insert_package_info(
                                rowid, time_and_size)
                            lims_database.insert_lane_info(
                                rowid, run_url, a_lane)
                            lims_database.update_package_downloaded(
                                package_downloaded, action_id)