Code Example #1
 def start_crawler(self):
     res_gzh_list, sogou_request_flag = self.__get_first_gzh_from_result_list(
     )
     if sogou_request_flag == CrawlerConst.program_output_code.REQUEST_BLOCKED:
         return sogou_request_flag
     url_profile, first_gzh_name = HtmlParser.parse_gzh_list_html(
         res_gzh_list)
     if first_gzh_name != self.gzh_name:
         # print("找不到指定的公众号!请重新确认此公众号是否存在。")
         return CrawlerConst.program_output_code.OTHER_ERROR
     res_gzh_profile, wechat_return_flag = self.__get_gzh_articles_dict(
         url_profile)
     if wechat_return_flag == CrawlerConst.program_output_code.REQUEST_BLOCKED:
         return wechat_return_flag
     articles = HtmlParser.parse_history_article_list_html(res_gzh_profile)
     for article in articles:
         url_content = article['content_url']
         print(url_content)
         res_gzh_article, wechat_request_flag = self.__get_gzh_article(
             url_content)
         if wechat_request_flag == CrawlerConst.program_output_code.REQUEST_BLOCKED:
             return wechat_request_flag
         save_res = HtmlParser.parse_history_article_html(
             res_gzh_article, self.gzh_name)
         if save_res == CrawlerConst.program_output_code.SAVE_FAILURE:
             return save_res
     return CrawlerConst.program_output_code.SUCCESS
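For context, a caller would typically branch on the status code returned above. The sketch below is a hypothetical driver, assuming a `crawler` instance of this class has already been constructed elsewhere (the constructor is not shown in this example):

# Hypothetical driver, not part of the original project code.
result = crawler.start_crawler()
if result == CrawlerConst.program_output_code.SUCCESS:
    print("crawl finished")
elif result == CrawlerConst.program_output_code.REQUEST_BLOCKED:
    print("request was blocked, retry later or rotate proxies")
elif result == CrawlerConst.program_output_code.SAVE_FAILURE:
    print("an article could not be saved")
else:
    print("crawl failed with code", result)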
Code Example #2
 def __init__(self):
     # Initialize the program
     self.download = Downloader()
     self.parser = HtmlParser()
     self.save = SaveData()
     self.workbook = Workbook()
     self.ch = Choice()
     print('Initialization complete...')
Code Example #3
def updateBlockTable():
    for blockId in query.getIdBlocks():
        print(blockId)
        htmlParser = HtmlParser(str(blockId))
        blockInfo = htmlParser.getBlock()
        query.insertBlock(
            Block(id=blockId,
                  hash='',
                  timestamp=blockInfo['timestamp'],
                  minedIn=blockInfo['minedIn']))
Code Example #4
class SpiderMain(object):
    def __init__(self):
        # Initialize the program
        self.download = Downloader()
        self.parser = HtmlParser()
        self.mysql = Mysqldb()

    def run(self, url, database):
        response = self.download.download(url)
        self.parser.parser(response, database)
Code Example #5
 def getTxsData(self):
     hash = self._getNotConfirmedTx()
     print(hash)
     htmlParser = HtmlParser(hash)
     if hasattr(htmlParser, 'tableText'):
         return {
             'hash': hash,
             'blockId': htmlParser.getBlockNumber(),
             'gasPrice': htmlParser.getGasPrice(),
             'gasLimit': htmlParser.getGasLimit()
         }
     return self._getFakeDataTx(hash)
Code Example #6
class SpiderMain(object):
    def __init__(self):
        # Initialize the program
        self.download = Downloader()
        self.parser = HtmlParser()
        self.save = SaveData()
        self.workbook = Workbook()
        self.ch = Choice()
        print('Initialization complete...')

    def run(self):
        while True:
            try:
                p = int(input('How many pages do you want to crawl?' + '\n'))
                break
            except ValueError:
                print('Invalid input! Please enter a number')
        page = p + 1
        print("================================")
        print(' A. Original Releases      B. Quality Software     ')
        print(' C. Unpacking & Cracking   D. Mobile Security      ')
        print(' E. Virus Analysis         F. Programming Languages')
        print(' G. Software Debugging     H. Tutorial Releases    ')
        print(' I. Reversing Resources    J. Security Tools       ')
        print(' K. Jobs & Recruitment                             ')
        print("================================")
        while True:
            choice = input("选择爬取的专区,输入 Q 退出程序(输入的字母必须大写):")
            half_url, name = self.ch.make_the_arrg(choice)
            if name != 'Error':
                break
        print(half_url + '\n' + name)
        self.save.createfile(name)
        for i in range(1, page):
            url = half_url + str(i) + '.html'
            response = self.download.download(url)
            self.parser.parser(response, name)
            sleep = random.randint(2, 10)
            print('Finished crawling page ' + str(i) + ', sleeping for ' + str(sleep) + ' seconds')
            time.sleep(sleep)  # pause between requests
            if i != page - 1:
                print('-----------------------------')
                print('          Next page          ')
                print('-----------------------------')
        print('Data written, removing duplicate records...')
        self.save.delete_same_data()
        try:
            self.workbook.save('将csv的数据导入此表.xlsx')
        except Exception:
            print('Failed to create the xlsx file, please create it manually')
        print('Program finished')
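A likely entry point for this spider, assuming the module is run directly (the original script's startup code is not shown above), would be:

# Assumed entry point; the original __main__ block is not included in the example.
if __name__ == '__main__':
    SpiderMain().run()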
Code Example #7
    def fit(input_dir_non_reg='train_pages/non_reg/',
            input_dir_reg='train_pages/reg',
            output_dir='ml/cb'):
        train_htmls = []
        train_classes = []
        for file in os.scandir(input_dir_non_reg):
            filename = os.fsdecode(file)
            with open(filename) as file1:
                train_htmls.append(file1.read())
                train_classes.append(0)
        for file in os.scandir(input_dir_reg):
            filename = os.fsdecode(file)
            with open(filename) as file1:
                train_htmls.append(file1.read())
                train_classes.append(1)

        train_texts = []
        for html in train_htmls:
            train_texts.append(HtmlParser.htmlToText(html))

        vectorizer = CountVectorizer()
        train_set = vectorizer.fit_transform(train_texts)

        clfCB = CatBoostClassifier(iterations=100, learning_rate=3, depth=7)
        clfCB.fit(X=train_set.toarray(), y=train_classes)
        saved_vect = open(output_dir + '/saved_vect_cb', 'wb')
        pickle.dump(vectorizer, saved_vect)
        saved_vect.close()
        saved_model = open(output_dir + '/saved_clf_cb', 'wb')
        pickle.dump(clfCB, saved_model)
        saved_model.close()
Code Example #8
File: spider.py Project: harry-fan/spider
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        self.manager.add_new_url(root_url)

        while (self.manager.has_new_urls()
               and self.manager.old_url_size() < 100):
            #try:
            # fetch a new url
            new_url = self.manager.get_new_url()
            # download the page
            html = self.downloader.download(new_url)
            # parse the page and extract its data
            new_urls, data = self.parser.parser(new_url, html)
            # feed the new urls back into the URL manager
            self.manager.add_new_urls(new_urls)
            # store the extracted data
            self.output.store_data(data)
            print("Crawled %s links so far" % self.manager.old_url_size())
            #except Exception as e:
            #    print("crawl failed", e)
        self.output.out_put_html()
Code Example #9
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.parser = HtmlParser()
        self.downloader = HtmlDownloader()
        self.output = DataOutput()

    def crawl(self, root_url):
        """
        程序主逻辑
        :param root_url: 入口 url
        :return:
        """
        self.manager.add_new_url(root_url)
        while self.manager.has_new_url() and self.manager.old_url_size() < 20:
            try:
                new_url = self.manager.get_new_url()
                html = self.downloader.downloader(new_url)
                new_urls, data = self.parser.parser(new_url, html)
                self.manager.add_new_urls(new_urls)
                self.output.output_txt(data)
                print(data)
                print("爬取了{}条链接".format(self.manager.old_url_size()))
            except Exception as e:
                print("爬取失败", e)
Code Example #10
File: spiderMan.py Project: luc99hen/spider
 def __init__(self):
     # number of running worker threads
     self.pcount = 1
     # queue for crawl results
     self.dqueue = queue.Queue()
     # queue for error messages
     self.equeue = queue.Queue()
     self.manager = UrlManager()
     self.downloader = HtmlDownloader()
     self.parser = HtmlParser()
     self.output = DataOutput()
     # self.proxies = getProxy()
     self.proxies = getFromPool2()
     self.inactivepro = []
     self.count = 0
     self.sumSuccess = 0
     self.sumFail = 0
     self.updating = False
Code Example #11
 def getPendingTxsHashes(self):
     pendingTxs = []
     htmlParser = HtmlParser()
     for pendingTx in htmlParser.getPendingTxs():
         hash = self._getHashFromHtml(pendingTx)
         htmlParser = HtmlParser(hash)
         print(hash)
         if hasattr(htmlParser, 'tableText'):
             pendingTxs.append([hash, htmlParser.getTimestamp()])
     return pendingTxs
Code Example #12
File: spiderMan.py Project: xieys/webSpider
class SpiderMan(object):
    def __init__(self):
        self.manger = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = OutputData()

    def crawl(self, root_url):
        """
        主程序
        :param root_url: 入口 URL
        :return:
        """
        self.manger.add_new_url(root_url)
        while self.manger.has_new_url() and self.manger.old_urls_size() < 5:
            new_url = self.manger.get_new_url()
            html = self.downloader.downloader(new_url)
            next_url, data = self.parser.parser(new_url, html)
            self.manger.add_new_url(next_url)
            self.output.outputTxt(data)
Code Example #13
 def predict(input_dir, clf_dir="ml/cb"):
     saved_model = open(clf_dir + "/saved_clf_cb", 'rb')
     clfCB = pickle.load(saved_model)
     saved_vect = open(clf_dir + "/saved_vect_cb", 'rb')
     vectorizer = pickle.load(saved_vect)
     htmls = []
     filenames = []
     for file in os.scandir(input_dir):
         filename = os.fsdecode(file)
         filenames.append(filename)
         with open(file) as file1:
             htmls.append(file1.read())
     texts = []
     for html in htmls:
         texts.append(HtmlParser.htmlToText(html))
     test_set = vectorizer.transform(texts)
     predict = clfCB.predict(test_set.toarray())
     for file, p in zip(filenames, predict):
         print(file)
         if p:
             print('Yes')
         else:
             print("No")
     return
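Examples #7 and #13 appear to be the fit and predict halves of the CatBoost-based classifier that Example #18 below invokes as CatBoostHtmlClassifier. A hypothetical end-to-end invocation, mirroring that call pattern, might look like this (the predict input directory is a placeholder):

# Hypothetical usage based on the call pattern in Example #18;
# 'downloaded_pages/' is a placeholder directory.
CatBoostHtmlClassifier.fit(input_dir_non_reg='train_pages/non_reg/',
                           input_dir_reg='train_pages/reg',
                           output_dir='ml/cb')
CatBoostHtmlClassifier.predict(input_dir='downloaded_pages/', clf_dir='ml/cb')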
Code Example #14
 def __init__(self):
     # Initialize the program
     self.download = Downloader()
     self.parser = HtmlParser()
     self.mysql = Mysqldb()
Code Example #15
File: spiderMan.py Project: luc99hen/spider
class SpiderMan(object):
    def __init__(self):
        # number of running worker threads
        self.pcount = 1
        # queue for crawl results
        self.dqueue = queue.Queue()
        # queue for error messages
        self.equeue = queue.Queue()
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()
        # self.proxies = getProxy()
        self.proxies = getFromPool2()
        self.inactivepro = []
        self.count = 0
        self.sumSuccess = 0
        self.sumFail = 0
        self.updating = False
        #self.proxies = ['http://127.0.0.1:2740']

    def doCrawl(self, new_url):
        try:
            self.pcount += 1
            count = 1
            # pick a proxy IP at random
            pro = random.choice(self.proxies)
            #pro = 'http://127.0.0.1:2740'
            while (True):
                # download the page through the HTML downloader
                html = self.downloader.download(new_url, pro)
                # parse the page and extract its data
                data = self.parser.parser(new_url, html)
                ## storing the data here caused multi-threaded write conflicts, so it was dropped
                # self.output.store_data(data)
                # if we ran into the robot check
                if data == "robot":
                    if count < 6:
                        count = count + 1
                        # elimination scheme:
                        # in self.proxies only: the proxy performs well
                        # in both self.proxies and self.inactivepro: blocked once, under observation
                        # in self.inactivepro only: blocked twice, retired for now
                        # in neither list: failed to revive, removed for good
                        if (count == 5 and len(self.proxies) > 100):
                            # membership tests instead of list.index(), which
                            # raises ValueError when the item is missing
                            if pro not in self.inactivepro:  # add to the watch list
                                self.inactivepro.append(pro)
                                pro = random.choice(self.proxies)
                            else:  # retire it for now
                                print(str(pro) + " out\n")
                                if pro in self.proxies:
                                    self.proxies.remove(pro)
                        continue
                    else:
                        raise Exception("robot check")
                else:
                    break
            # buffer the result in the output queue
            self.dqueue.put(data)
        except Exception as e:
            self.sumFail = self.sumFail + 1
            print(
                "Fail: link %d fail %d times : %s\n" %
                (self.count, self.sumFail, new_url), e.args)
            # start the revival plan for retired proxies
            if (len(self.proxies) < 200 or len(self.inactivepro) > 500):
                pro = random.choice(self.inactivepro)
                if (pro is not None and pro not in self.proxies
                        and self.testIP(pro)):
                    self.proxies.append(pro)
                    print(str(pro) + " in!!!\n")
                # remove pro regardless of the outcome; the membership check
                # guards against races between concurrent threads
                if pro in self.inactivepro:
                    self.inactivepro.remove(pro)
            self.equeue.put([new_url, e.args])
        else:
            self.sumSuccess = self.sumSuccess + 1
            print("Success: link %d success %d times : %s" %
                  (self.count, self.sumSuccess, new_url))
        finally:
            self.pcount -= 1

    def setProxy(self):
        #self.proxies = getProxy()
        self.proxies = getFromPool2()
        self.updating = False

    # write out results and error messages
    def outPutData(self):
        while (not self.dqueue.empty()):
            data = self.dqueue.get()
            self.output.store_data(data)
        while (not self.equeue.empty()):
            err = self.equeue.get()
            self.output.store_err(err)

    def testIP(self, pro):
        url = 'https://www.douban.com'
        # requests expects the proxy mapping to be keyed by scheme
        res = requests.get(url, proxies={'http': pro, 'https': pro}, timeout=20)
        if (res.status_code == 200):
            return True
        else:
            return False

    def crawl(self):
        threads = []
        preFail = 0
        # skip urls that were already handled in a previous run
        for i in range(22350):
            self.manager.has_new_url()
        while (self.manager.has_new_url()):
            try:
                self.count = self.count + 1
                # start the proxy-refresh plan
                if self.sumFail - preFail > 46 and not self.updating:
                    self.updating = True
                    print("\n\nstart refreshing proxies\n\n")
                    t = threading.Thread(target=SpiderMan.setProxy,
                                         args=[
                                             self,
                                         ])
                    t.start()
                    threads.append(t)
                    # p = Pool()
                    # result = p.apply_async(getFromPool2, args=())
                    # p.close()
                    #self.proxies = result.get()
                # every 50 records, flush the buffers and report the success rate
                if (self.count % 50 == 0 and self.count != 0):
                    preFail = self.sumFail
                    rate = float(self.sumSuccess) / float(self.count - 1)
                    print("Success Rate: %f" % rate)
                    self.output.store_err([str(self.count), str(rate)])
                    self.output.flush()
                # fetch a new url from the URL manager
                new_url = self.manager.get_new_url()
                # main crawl step (run in a worker thread)
                if self.pcount < 0:
                    pcount = 0
                else:
                    pcount = self.pcount
                time.sleep(random.random() + pcount / 10)  # random delay, scaled by the number of active threads
                t = threading.Thread(target=SpiderMan.doCrawl,
                                     args=[
                                         self,
                                         new_url,
                                     ])
                t.start()
                threads.append(t)
                # write out results and error messages
                self.outPutData()
            except Exception as e:
                print("wired fail")
        [t.join() for t in threads]
        self.outPutData()
Code Example #16
File: main.py Project: S1r0hub/vrvis-thesis
def main():

    # create argument parser
    parser = argparse.ArgumentParser(
        description='Convert ConfigCrusher program measurement results.',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # add arguments to parser and parse
    prepareParser(parser)
    args = parser.parse_args()

    # set up logger
    global LOGGER
    LOGGER = logging.getLogger('crusherToJSONLogger')
    LOGGER.setLevel(logging.DEBUG)

    # check if debug should be enabled
    logLevel = logging.INFO
    if args.verbose: logLevel = logging.DEBUG

    # channel to stream log events to console
    ch = logging.StreamHandler()
    ch.setLevel(logLevel)
    formatter = logging.Formatter('[%(levelname)s] (%(asctime)s): %(message)s')
    ch.setFormatter(formatter)
    LOGGER.addHandler(ch)

    # log to file if enabled
    logPath = args.logfile
    if len(logPath) > 0:
        if not logPath.endswith(".log"):
            logPath += ".log"
        fileHandler = logging.FileHandler(logPath)
        fileHandler.setFormatter(formatter)
        LOGGER.addHandler(fileHandler)
    LOGGER.info('Logger ready.')

    # validate output folder
    outFolder = args.outpath
    if not (outFolder.endswith("/") or outFolder.endswith("\\")):
        outFolder += "/"

    if not os.path.exists(outFolder):
        LOGGER.warning('The output folder does not exist! Creating it...')

        try:
            os.makedirs(outFolder)
        except Exception as ex:
            LOGGER.exception('Failed to create output folder!')
            return

        outFolder = os.path.normcase(outFolder)
        LOGGER.info('Output folder created: {}'.format(outFolder))

    else:

        # check that path leads to a folder
        if not os.path.isdir(outFolder):
            LOGGER.error('The output folder path does not lead to a folder!')
            return

    # validate color schema file
    schemaPath = args.colorschema
    if not os.path.isfile(schemaPath):
        LOGGER.error(
            'The given schema path is no valid file: {}'.format(schemaPath))
        return

    # check if recursive export is desired
    recursive = True if args.recursive else False

    # check if user wants to overwrite existing files
    overwrite = True if args.overwrite else False

    # export the highlighted HTML code as well if desired
    exportHTML = True if args.exporthtml else False
    if exportHTML: LOGGER.info('Additional HTML export enabled.')

    # try to read JSON color schema
    jsonSchema = None
    with open(schemaPath, "r") as file:
        try:
            jsonSchema = json.loads(file.read())
        except Exception as ex:
            LOGGER.error(ex)
    if jsonSchema is None: return

    # check if path exists
    filePath = args.path
    if not os.path.exists(filePath):
        LOGGER.error('Failed to convert! Given path does not exist: {}'.format(
            filePath))
        return None

    # check if path leads to file or folder
    if os.path.isfile(filePath):

        # parses html code to unity rt format
        parser = HtmlParser(colorSchema=jsonSchema)

        # convert a file and export the result
        LOGGER.info('Converting the file...')
        resultPath = convertFile(htmlParser=parser,
                                 filePath=filePath,
                                 outputFolder=outFolder,
                                 exportHTML=exportHTML,
                                 overwrite=overwrite)

    elif os.path.isdir(filePath):

        # convert all files of the folder
        LOGGER.info('Converting the files{}...'.format(
            ' recursively' if recursive else ''))
        resultPath = convertFiles(folderPath=filePath,
                                  outputFolder=outFolder,
                                  jsonSchema=jsonSchema,
                                  exportHTML=exportHTML,
                                  overwrite=overwrite,
                                  recursive=recursive)

    # print result path
    if resultPath is not None:
        LOGGER.info('Result path: ' + os.path.abspath(resultPath))
Code Example #17
File: main.py Project: S1r0hub/vrvis-thesis
def convertFiles(folderPath,
                 outputFolder,
                 jsonSchema,
                 exportHTML=False,
                 overwrite=False,
                 recursive=False):
    '''
    Converts the source code of all files to a syntax-highlighted rich text format.
    This method does not check whether the given path is valid!
    Returns None on errors, otherwise the path to the exported files.
    '''

    firstOutPath = None
    if folderPath.endswith('/') or folderPath.endswith('\\'):
        folderPath = folderPath[:-1]
    # length of the root prefix (including the separator), computed after the
    # trailing separator is stripped so that curDir[pathLength:] is always relative
    pathLength = len(folderPath) + 1
    srcDirName = os.path.normcase(os.path.basename(folderPath))
    outputFolder = os.path.normcase(os.path.normpath(outputFolder))

    for curDir, subDirs, files in os.walk(folderPath, topdown=True):
        curDir_relative = os.path.normpath(
            os.path.join(srcDirName, curDir[pathLength:]))
        LOGGER.info('Entering directory: {}'.format(curDir_relative))

        # create export path
        #LOGGER.debug('Joining paths "{}" and "{}"'.format(outputFolder, curDir_relative))
        curOutFolder = os.path.normcase(
            os.path.join(outputFolder, curDir_relative))
        LOGGER.debug('Current output folder: {}'.format(curOutFolder))
        if os.path.exists(curOutFolder):
            if os.path.isfile(curOutFolder):
                LOGGER.error(
                    'Failed to export to: {} (is a file instead of a folder)'.
                    format(os.path.abspath(curOutFolder)))
                return None
        else:
            # create the output folder
            LOGGER.info('Creating folder: {}'.format(curOutFolder))
            try:
                os.mkdir(curOutFolder)
            except Exception as ex:
                LOGGER.exception(
                    'Failed to create an output folder: {}'.format(
                        curOutFolder))
                return None

        if firstOutPath is None: firstOutPath = curOutFolder

        # convert and export all the files of this folder
        for file in files:

            # parses html code to unity rt format
            parser = HtmlParser(colorSchema=jsonSchema)

            LOGGER.info('Converting file: {}'.format(file))
            path = convertFile(htmlParser=parser,
                               filePath=os.path.join(curDir, file),
                               outputFolder=curOutFolder,
                               exportHTML=exportHTML,
                               overwrite=overwrite)

            if path is not None: LOGGER.info('File exported: {}'.format(path))

        # do not take sub-folders into account if recursion is disabled
        if not recursive: break

    return firstOutPath
Code Example #18
File: mailstorm.py Project: scdt/AccountsProver
def main():
    motd = open('./motd', 'r')
    banner = motd.read()
    motd.close()
    style = colored.fg(random.choice(['red', 'blue', 'green']))
    print(colored.stylize(banner, style))
    try:
        opts, args = getopt.getopt(sys.argv[1:], "hfdao:e:u:1:2:i:m:c:", [
            "help", "full", "download", "analyze", "output-dir=", "email=",
            "urls=", "input-dir1=", "input-dir2=", "input-dir=", "method=",
            "clf_dir"
        ])
    except getopt.GetoptError as err:
        print(err)
        sys.exit(2)

    output_dir = None
    email = None
    nr_dir1 = None
    nr_dir2 = None
    input_dir = None
    input_dir1 = None
    method = None
    clf_dir = None

    if not opts:
        print('Options required, try -h or --help for more information')
        return

    o, a = opts[0]
    if o in ("-h", "--help"):
        print("Usefull information")
    elif o in ("-f", "--fit"):
        for o, a in opts[1:]:
            if o in ("-1", "--input-dir1"):
                input_dir = a
            elif o in ("-2", "--input-dir2"):
                input_dir1 = a
            elif o in ("-o", "--output_dir"):
                output_dir = a
            elif o in ("-m", "--method"):
                method = a
            else:
                assert False, "unhandled option for fit"
        if method == "cb":
            CatBoostHtmlClassifier.fit(input_dir, input_dir1, output_dir)
        elif method == "nb":
            KNeighboursHtmlClassifier.fit(input_dir, input_dir1, output_dir)
        else:
            assert False, "method not specified"

    elif o in ("-d", "--download"):
        for o, a in opts[1:]:
            if o in ("-o", "--output-dir"):
                output_dir = a
            elif o in ("-e", "--email"):
                email = a
            elif o in ("-u", "--urls"):
                urls = a
            else:
                assert False, "unhandled option for download"
        WebSelenium.saveHtmls(email, urls, output_dir)
    elif o in ("-a", "--analyze"):
        for o, a in opts[1:]:
            if o in ("-1", "--input-dir1"):
                nr_dir1 = a
            elif o in ("-2", "--input-dir2"):
                nr_dir2 = a
            elif o in ("-i", "--input-dir"):
                input_dir = a
            elif o in ("-c", "--clf_dir"):
                clf_dir = a
            elif o in ("-m", "--method"):
                method = a
            else:
                assert False, "unhandled option for analyze"
        if method == "1":
            HtmlParser.htmlCmp(nr_dir1, nr_dir2, input_dir)
        elif method == "cb":
            CatBoostHtmlClassifier.predict(input_dir, clf_dir)
        elif method == "nb":
            KNeighboursHtmlClassifier.predict(input_dir, clf_dir)
        else:
            assert False, "method not specified"
    else:
        assert False, "unhandled option"
Code Example #19
 def __init__(self):
     self.manager = UrlManager()
     self.parser = HtmlParser()
     self.downloader = HtmlDownloader()
     self.output = DataOutput()
Code Example #20
import pandas as pd
from requestUtil import *
from htmlParser import HtmlParser

id=1
#Column names
COLUMN_NAMES = ["HTML_ID", "TAG_NAME", "ATTRIBUTE_ID", "ATTRIBUTE_NAME", "ATTRIBUTE_CLASS", "ATTRIBUTE_PLACEHOLDER", "IN_FORM", "TAG_DEPTH", "TAG_STRING", "LABEL"]
#Initializing dataframe
df = pd.DataFrame(columns=COLUMN_NAMES)


#Read urls from the csv file
loginurls = pd.read_csv("loginurls.csv")

#Creating parser object
htmlParser=HtmlParser()

#Iterating over all login urls
for loginurl in loginurls["LOGIN_URL"]:
   try:
      print("Requesting : " + loginurl)
      src = getHtmlString(loginurl)
      df = HtmlParser.parseHtml(src, id, df, loginurl)
      print("finished parsing html num " + str(id))
   except Exception as e:
      print("Could not load: " + loginurl)
   id = id + 1
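The script above accumulates every parsed tag row into df but never writes it anywhere. A natural follow-up step, with an assumed output filename, would be:

# Assumed persistence step; the filename is a placeholder.
df.to_csv("parsed_login_tags.csv", index=False)
print("Saved " + str(len(df)) + " parsed rows")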