def test_to_buy_goods(self):
        headers = cookie
        self.http.set_header(headers)

        self.params['serial'] = global_serial

        logger.info('Sending GET request...')
        # params is a dict here; urlencode turns it into a URL-encoded query string
        self.params = urllib.parse.urlencode(self.params)
        response = self.http.get(self.url, self.params)
        status_code = response[2]
        response_body = response[0].decode('utf-8')

        logger.info('Parsing response body: %s' % response_body)

        # Parse the HTML document
        parser = MyHTMLParser(strict=False)
        parser.feed(response_body)
        starttag_data = parser.get_starttag_data()

        tab_page_title = ''
        for data in starttag_data:
            if data[1] == self.expected_result['tab_page_title']:
                tab_page_title = data[1]
                break

        # Assertions
        self.assertEqual(status_code,
                         self.expected_result['status'],
                         msg='HTTP status code is not 200')
        self.assertEqual(tab_page_title,
                         self.expected_result['tab_page_title'],
                         msg='Failed to open the goods detail page')

    def test_visit_webtours(self):
        # Add HTTP headers here as appropriate for the interface under test
        # header = {'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        #    'User-Agent':'Mozilla/5.0 (Windows NT 6.1; rv:29.0) Gecko/20100101 Firefox/29.0'
        #    }
        logger.info('Sending GET request...')
        response = self.http.get(self.url, self.params)
        response_body = response[0].decode('utf-8')

        logger.info('Parsing response...')

        # Parse the HTML document
        parser = MyHTMLParser(strict=False)
        parser.feed(response_body)

        # Compare results
        starttag_data = parser.get_starttag_data()
        i = 0
        for data_list in starttag_data:
            if data_list[0] == 'title' and data_list[1] == 'Web Tours':
                i = i + 1

        self.assertNotEqual(str(i),
                            self.expected_result['result'],
                            msg='Failed to visit WebTours')
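Both tests above depend on a MyHTMLParser helper that exposes get_starttag_data() as a list of (tag, text) entries. That helper is not shown on this page; the following is only a minimal sketch of what it might look like, assuming the (tag, text) layout implied by the assertions and a strict keyword kept purely for call-site compatibility.

from html.parser import HTMLParser


class MyHTMLParser(HTMLParser):
    """Minimal sketch: collect [tag, text] pairs for later assertions."""

    def __init__(self, strict=False):
        # html.parser dropped the strict argument in Python 3.5; it is
        # accepted here only so the call sites above keep working.
        super().__init__(convert_charrefs=True)
        self._starttag_data = []
        self._current_tag = None

    def handle_starttag(self, tag, attrs):
        self._current_tag = tag

    def handle_data(self, data):
        text = data.strip()
        if self._current_tag and text:
            self._starttag_data.append([self._current_tag, text])

    def get_starttag_data(self):
        return self._starttag_data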
Example n. 3
class DataSetParser(MyHTMLParser):
    def __init__(self, filename):
        self.parser = MyHTMLParser()

        # Read the file as text and drop newlines so the parser sees one long string
        with open(filename, "r") as f:
            self.HTMLdata = f.read().replace('\n', '')

    def parse_HTML(self):
        self.parser.feed(self.HTMLdata)
        self.dataset = self.parser.getdataset()

    def getdataset(self):
        return self.dataset

    def gettags(self):
        return self.parser.gettags()

    def classifydata(self, filename):
        self.buildsubjectset()

        with open(filename, "w", newline='') as f:
            writer = csv.writer(f, delimiter=',')
            writer.writerow(['subject', 'dokid'])
            for subject in self.subjectset:
                for data in self.filtered_dataset:
                    temp = [x.strip() for x in data['asca']]
                    if subject in temp:
                        writer.writerow([subject, data['dokid'][0]])


    def buildsubjectset(self):
        self.subjectset = [x.strip() for x in self.parser.getsubject()]
        self.subjectset.sort()


    def writesubject(self, filename):
        self.buildsubjectset()
        with open(filename, "w") as text_file:
            for x in self.subjectset:
                text_file.write(x + '\n')

    def filtertags(self, tagname = 'refunifids'):
        self.filtered_dataset = [data for data in self.dataset if tagname in data]
        return self.filtered_dataset

    def writeCsv(self, attlist, output_filename):
        
        with open(output_filename, 'w', newline='') as csvfile:
            writer = csv.writer(csvfile, delimiter=',')
            writer.writerow(attlist)
            for line in self.filtered_dataset:
                data = []
                for att in attlist:
                    if att in line:
                        data.append(line[att])
                    else:
                        data.append('')
                writer.writerow(data)
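A possible way to drive the class above, assuming hypothetical input and output file names; the call order (parse, filter, then write) follows from how the methods use each other's results.

if __name__ == '__main__':
    # File names below are placeholders for illustration only.
    ds = DataSetParser('dataset.html')
    ds.parse_HTML()                          # feed the HTML through MyHTMLParser
    ds.filtertags('refunifids')              # keep only records that carry the tag
    ds.writeCsv(['dokid', 'asca'], 'filtered.csv')
    ds.writesubject('subjects.txt')          # one subject per line
    ds.classifydata('subject_to_dokid.csv')  # (subject, dokid) pairs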
Example n. 4
def from_html_to_dict(text, vars):
    if (text.startswith("<a") and text.endswith("</a>")):
        parser = MyHTMLParser()
        parser.feed(text)
        return parser.value
    elif text.replace("[[", "").replace("]]", "") in vars.keys():
        return text
    else:
        return int(text)
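from_html_to_dict expects the parser to expose a value attribute after being fed an <a> element. The snippet does not show that parser; one plausible minimal reading, assuming value is meant to hold the link's href (it could just as well be the anchor text), is:

from html.parser import HTMLParser


class MyHTMLParser(HTMLParser):
    """Sketch: after feeding '<a href="...">label</a>', .value holds the href."""

    def __init__(self):
        super().__init__()
        self.value = None

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            # Assumption: the value of interest is the href attribute.
            self.value = dict(attrs).get('href')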
def search_google(term):
    quoted_term = '"%s"' % term
    parser = MyHTMLParser()
    (html, user_agent_google) = google.get_search_results(quoted_term)
    parser.feed(html)
    # remove the last result which is the google location based search about
    # page
    del parser.get_results()[-1]
    return parser.get_results()
Example n. 6
    def test_visit_taobaoapi(self):
        # Add HTTP headers here as appropriate for the interface under test
        # header = {'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        #    'User-Agent':'Mozilla/5.0 (Windows NT 6.1; rv:29.0) Gecko/20100101 Firefox/29.0'
        #    }
        logger.info('Sending GET request...')
        response = self.http.get(self.url, self.params)

        logger.info('Parsing response...')

        # Parse the HTML document; feed() expects a str, so decode the response body first
        parser = MyHTMLParser(strict=False)
        parser.feed(response[0].decode('utf-8'))
Example n. 7
    def __retrieve_from_mailman(self, mailing_list):
        """Download mboxes from mailman interface"""
        # Get all the links listed in the URL
        #
        # The archives are usually retrieved in descending
        # chronological order (newest archives are always
        # shown on the top of the archives). Reverse the list
        # to analyze in chronological order.
        htmlparser = MyHTMLParser(mailing_list.location, self.web_user,
                                  self.web_password)
        links = htmlparser.get_mboxes_links(self.force)

        archives = []

        for link in links:
            basename = os.path.basename(link)
            destfilename = os.path.join(mailing_list.compressed_dir, basename)

            try:
                # If the URL is for the current month, always retrieve.
                # Otherwise, check visited status & local files first
                this_month = find_current_month(link)

                if this_month:
                    self.__print_output('Current month detected: '
                                        'Found substring %s in URL %s...' %
                                        (this_month, link))
                    self.__print_output('Retrieving %s...' % link)
                    retrieve_remote_file(link, destfilename, self.web_user,
                                         self.web_password)
                elif os.path.exists(destfilename):
                    self.__print_output('Already downloaded %s' % link)
                else:
                    self.__print_output('Retrieving %s...' % link)
                    retrieve_remote_file(link, destfilename, self.web_user,
                                         self.web_password)
            except IOError:
                self.__print_output("Unknown URL: " + link + ". Skipping.")
                continue

            archives.append(MBoxArchive(destfilename, link))
        return archives
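The download loop relies on a find_current_month helper that is not part of this snippet. A minimal sketch, assuming Mailman's usual 'YYYY-MonthName' archive naming (the exact format is an assumption):

import datetime


def find_current_month(link):
    """Sketch: return the 'YYYY-MonthName' substring (e.g. '2016-April') if the
    link points at the current month's archive, otherwise None."""
    this_month = datetime.date.today().strftime('%Y-%B')
    return this_month if this_month in link else None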
Example n. 8
    def __retrieve_from_mailman(self, mailing_list):
        """Download mboxes from mailman interface"""
        # Get all the links listed in the URL
        #
        # The archives are usually retrieved in descending
        # chronological order (newest archives are always
        # shown on the top of the archives). Reverse the list
        # to analyze in chronological order.
        htmlparser = MyHTMLParser(mailing_list.location,
                                  self.web_user, self.web_password)
        links = htmlparser.get_mboxes_links(self.force)

        archives = []

        for link in links:
            basename = os.path.basename(link)
            destfilename = os.path.join(mailing_list.compressed_dir, basename)

            try:
                # If the URL is for the current month, always retrieve.
                # Otherwise, check visited status & local files first
                this_month = find_current_month(link)

                if this_month:
                    self.__print_output(
                        'Current month detected: '
                        'Found substring %s in URL %s...' % (this_month, link))
                    self.__print_output('Retrieving %s...' % link)
                    retrieve_remote_file(link, destfilename,
                                         self.web_user, self.web_password)
                elif os.path.exists(destfilename):
                    self.__print_output('Already downloaded %s' % link)
                else:
                    self.__print_output('Retrieving %s...' % link)
                    retrieve_remote_file(link, destfilename,
                                         self.web_user, self.web_password)
            except IOError:
                self.__print_output("Unknown URL: " + link + ". Skipping.")
                continue

            archives.append(MBoxArchive(destfilename, link))
        return archives
Example n. 9
    def fetch(self):
        """Get all the links listed in the Mailing List's URL.

        The archives are usually retrieved in descending chronological
        order (newest archives are always shown on the top of the archives).
        Reverse the list to analyze in chronological order.
        """

        mailing_list = self.mailing_list

        htmlparser = MyHTMLParser(mailing_list.location,
                                  self.web_user, self.web_password)
        # links = htmlparser.get_mboxes_links(self.force)
        links = self.filter_links(htmlparser.get_links())

        for link in links:
            basename = os.path.basename(link)
            destfilename = os.path.join(mailing_list.compressed_dir, basename)

            try:
                # If the URL is for the current month, always retrieve.
                # Otherwise, check visited status & local files first
                this_month = find_current_month(link)

                if this_month:
                    self._print_output(
                        'Current month detected: '
                        'Found substring %s in URL %s...' % (this_month, link))
                    self._print_output('Retrieving %s...' % link)
                    self._retrieve_remote_file(link, destfilename)
                elif os.path.exists(destfilename) and not self.force:
                    self._print_output('Already downloaded %s' % link)
                else:
                    self._print_output('Retrieving %s...' % link)
                    self._retrieve_remote_file(link, destfilename)
            except IOError:
                self._print_output("Unknown URL: " + link + ". Skipping.")
                continue

            yield MBoxArchive(destfilename, link)
Example n. 10
    def fetch(self):
        """Get all the links listed in the Mailing List's URL.

        The archives are usually retrieved in descending chronological
        order (newest archives are always shown on the top of the archives).
        Reverse the list to analyze in chronological order.
        """

        mailing_list = self.mailing_list

        htmlparser = MyHTMLParser(mailing_list.location, self.web_user,
                                  self.web_password)
        # links = htmlparser.get_mboxes_links(self.force)
        links = self.filter_links(htmlparser.get_links())

        for link in links:
            basename = os.path.basename(link)
            destfilename = os.path.join(mailing_list.compressed_dir, basename)

            try:
                # If the URL is for the current month, always retrieve.
                # Otherwise, check visited status & local files first
                this_month = find_current_month(link)

                if this_month:
                    self._print_output('Current month detected: '
                                       'Found substring %s in URL %s...' %
                                       (this_month, link))
                    self._print_output('Retrieving %s...' % link)
                    self._retrieve_remote_file(link, destfilename)
                elif os.path.exists(destfilename) and not self.force:
                    self._print_output('Already downloaded %s' % link)
                else:
                    self._print_output('Retrieving %s...' % link)
                    self._retrieve_remote_file(link, destfilename)
            except IOError:
                self._print_output("Unknown URL: " + link + ". Skipping.")
                continue

            yield MBoxArchive(destfilename, link)
    def test_click_goods(self):
        headers = cookie
        self.http.set_header(headers)

        mall_goods_id = self.params['id']

        logger.info('Sending GET request...')
        self.params['serial'] = global_serial
        # params is a dict here; urlencode turns it into a URL-encoded query string
        self.params = urllib.parse.urlencode(self.params)
        response = self.http.get(self.url, self.params)
        status_code = response[2]
        # logger.info('Parsing response body: %s' % response[0].decode('utf-8'))

        # Parse the HTML document
        parser = MyHTMLParser(strict=False)
        parser.feed(response[0].decode('utf-8'))
        starttag_data = parser.get_starttag_data()

        query = 'SELECT name FROM mall_goods WHERE id=%s'
        data = (mall_goods_id, )
        mall_goods_name = saofudb.select_one_record(query, data)
        mall_goods_name = mall_goods_name[0]
        self.expected_result['goods_name'] = mall_goods_name

        goods_name = ''
        for data in starttag_data:
            if data[1].find(mall_goods_name) != -1:
                goods_name = data[1].replace('\r', '')
                goods_name = goods_name.replace('\n', '')
                goods_name = goods_name.replace('\t', '')

        # Assertions
        self.assertEqual(status_code,
                         self.expected_result['status'],
                         msg='HTTP status code is not 200')
        self.assertEqual(goods_name,
                         self.expected_result['goods_name'],
                         msg='Failed to open the goods detail page')
Example n. 12
def search_google(term):
    quoted_term = '"%s"' % term
    parser = MyHTMLParser()
    (html, user_agent_google) = google.get_search_results(quoted_term)
    parser.feed(html)
    # remove the last result which is the google location based search about
    # page
    del parser.get_results()[-1]
    return parser.get_results()
    def test_pay5(self):
        headers = cookie
        self.http.set_header(headers)

        self.params['orderId'] = CMOrder.attach
        self.params['serial'] = global_serial

        logger.info('Sending GET request...')
        # params is a dict here; urlencode turns it into a URL-encoded query string
        self.params = urllib.parse.urlencode(self.params)
        response = self.http.get(self.url, self.params)
        response_headers = response[1]
        response_body = response[0].decode('utf-8')
        # logger.info(response_body)

        # Parse the HTML document
        parser = MyHTMLParser(strict=False)
        parser.feed(response_body)
        starttag_data = parser.get_starttag_data()

        page_title = ''
        button_name = ''
        for data in starttag_data:
            if data[1] == self.expected_result['page_title']:
                page_title = data[1]
            if data[1] == self.expected_result['button_name']:
                button_name = data[1]
                break

        # Assertions
        self.assertEqual(page_title,
                         self.expected_result['page_title'],
                         msg='The page opened is not the stored-value card payment page')
        self.assertEqual(button_name,
                         self.expected_result['button_name'],
                         msg='Failed to open the payment confirmation page')
Example n. 14
    def __init__(self, str_id: str, str_name: str, str_content: str,
                 str_link: str, str_type: str):
        # Class vars
        self.id = str_id
        # parse url decoding
        self.name = translate_url(str_name)
        self.name_small_list = list()
        self.content = ""
        self.link = str_link  # Item link / slug
        self.type = str_type  # Item type

        # Strip unneeded html tags from post content
        html_remover = MyHTMLParser()
        html_remover.feed(str_content)

        # Strip remaining non HTML tags (such as [h5p id="#"])
        self.content = sub(r" ?\[[^)]+\]", " " + settings_dict["h5p"] + " ",
                           html_remover.html_text)

        html_remover.close()
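The constructor above assumes a MyHTMLParser that accumulates the markup-free text of whatever it is fed in an html_text attribute. A minimal sketch of such a tag stripper (only the attribute name comes from the call site; the rest is an assumption):

from html.parser import HTMLParser


class MyHTMLParser(HTMLParser):
    """Sketch: strip tags and keep only the text content in .html_text."""

    def __init__(self):
        super().__init__(convert_charrefs=True)
        self.html_text = ""

    def handle_data(self, data):
        self.html_text += data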
Example n. 15
# -*- coding: utf-8 -*-
"""
This is a program that scrapes web pages for
emails, urls, and phone numbers.
"""

__author__ = "Kyle Meiklejohn"

import argparse
import requests
import re
import pprint
from htmlparser import MyHTMLParser


parser = MyHTMLParser()


def request_func(url):
    """
    This function does a GET request on a command line url.
    """
    r = requests.get(url)
    html_string = r.text
    with open('nook.html', 'w') as file:
        file.write(html_string)
    return html_string
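The html_parse_data function below is truncated in the source. Going by the module docstring, it is meant to pull emails, URLs, and phone numbers out of the fetched page; the hypothetical extract_contacts sketch below shows that kind of extraction with the re module already imported above (the regexes are illustrative, not the author's).

# Hypothetical helper sketching the extraction the truncated function implies.
EMAIL_RE = re.compile(r'[\w.+-]+@[\w-]+\.[\w.-]+')
URL_RE = re.compile(r'https?://[^\s<>"]+')
PHONE_RE = re.compile(r'\(?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}')


def extract_contacts(html_string):
    """Return de-duplicated emails, urls, and phone numbers found in the page."""
    return {
        'emails': sorted(set(EMAIL_RE.findall(html_string))),
        'urls': sorted(set(URL_RE.findall(html_string))),
        'phones': sorted(set(PHONE_RE.findall(html_string))),
    }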


def html_parse_data(html_string):
    """
import time
import os
import json
from htmlparser import MyHTMLParser


for tag in ['INTJ', 'INTP', 'ISFJ', 'ISFP', 'ISTJ', 'ISTP', 'ENFJ', 'ENFP', 'ENTJ', 'ENTP', 'ESFJ', 'ESFP', 'ESTJ', 'INFJ', 'INFP', 'ESTP']:
  ts=2612
  curl_str = "curl 'https://twitter.com/i/search/timeline?f=tweets&vertical=default&q=%23" + tag + "%20lang%3Aar&src=typd&include_available_features=1&include_entities=1&last_note_ts=2516&max_position=TWEET-711340624943906817-718124708412203008-BD1UO2FFu9QAAAAAAAAETAAAAAcAAAASAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA&reset_error_state=false' -H 'accept-encoding: gzip, deflate, sdch' -H 'x-requested-with: XMLHttpRequest' -H 'accept-language: en-US,en;q=0.8' -H 'user-agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36' -H 'accept: application/json, text/javascript, */*; q=0.01' -H 'referer: https://twitter.com/search?f=tweets&vertical=default&q=%23ISFP%20lang%3Aar&src=typd' -H 'authority: twitter.com' -H 'cookie: guest_id=v1%3A143863298058930111; mp_c3de24deb6a3f73fba73a616bb625130_mixpanel=%7B%22distinct_id%22%3A%20%227851b24a2bf2c7756fe7a387d4a02f3c71ef869d436e1e29099f9a08eefb812c%22%2C%22isAdmin%22%3A%20false%2C%22isAccountSpending%22%3A%20false%2C%22serviceLevel%22%3A%20%22null%22%2C%22goalBased%22%3A%20true%7D; eu_cn=1; kdt=6CB6J5Euwi4vSd6f87cx1xUNoW5QFzRSRJalTKFv; remember_checked_on=1; auth_token=43da469332b0c6afff154a20904cf4b538412d9b; pid=\"v3:1458750681895591692855842\"; __utma=43838368.234037132.1438633240.1459465162.1459465162.1; __utmz=43838368.1459465162.1.1.utmcsr=t.co|utmccn=(referral)|utmcmd=referral|utmcct=/Hs5aCk1uqi; lang=en; twitter_ads_id=v1_716178566988214277; external_referer=padhuUp37zjgzgv1mFWxJ12Ozwit7owX|0; _ga=GA1.2.234037132.1438633240; _gat=1; _twitter_sess=BAh7CSIKZmxhc2hJQzonQWN0aW9uQ29udHJvbGxlcjo6Rmxhc2g6OkZsYXNo%250ASGFzaHsABjoKQHVzZWR7ADoPY3JlYXRlZF9hdGwrCJXH09NTAToMY3NyZl9p%250AZCIlNWIxYjdjZjJjY2FmMGQ3Yzc0ZDE4MmNlMmU1OTA1ODE6B2lkIiVkOGIz%250AY2UwNWVjMTczYzUxMzUwYzc5ZGEzMTU2YmI4Yg%253D%253D--516ce1de0aff3a3c1e0181febb7482f25f4aa224; ua=\"f5,m2,m5,rweb,msw\"' --compressed -o a.txt"
  os.system(curl_str)
  c = 0
  while True:
    c += 1
    with open('a.txt') as f:
      out = json.loads(f.read())
      last = out["inner"]["min_position"]
      html = out["inner"]["items_html"]
      if len(html.replace('\n', '')) == 0 or c > 1000:
        break
      #print html
      with open(tag+'.txt', 'ab') as file:
        parser = MyHTMLParser(file)
        parser.feed(html)
      ts += 1
      next_curl_str = "curl 'https://twitter.com/i/search/timeline?f=tweets&vertical=default&q=%23ISFP%20lang%3Aar&src=typd&include_available_features=1&include_entities=1&last_note_ts="+str(ts)+"&max_position="+last+"&reset_error_state=false' -H 'accept-encoding: gzip, deflate, sdch' -H 'x-requested-with: XMLHttpRequest' -H 'accept-language: en-US,en;q=0.8' -H 'user-agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36' -H 'accept: application/json, text/javascript, */*; q=0.01' -H 'referer: https://twitter.com/search?f=tweets&vertical=default&q=%23ISFP%20lang%3Aar&src=typd' -H 'authority: twitter.com' -H 'cookie: guest_id=v1%3A143863298058930111; mp_c3de24deb6a3f73fba73a616bb625130_mixpanel=%7B%22distinct_id%22%3A%20%227851b24a2bf2c7756fe7a387d4a02f3c71ef869d436e1e29099f9a08eefb812c%22%2C%22isAdmin%22%3A%20false%2C%22isAccountSpending%22%3A%20false%2C%22serviceLevel%22%3A%20%22null%22%2C%22goalBased%22%3A%20true%7D; eu_cn=1; kdt=6CB6J5Euwi4vSd6f87cx1xUNoW5QFzRSRJalTKFv; remember_checked_on=1; auth_token=43da469332b0c6afff154a20904cf4b538412d9b; pid=\"v3:1458750681895591692855842\"; __utma=43838368.234037132.1438633240.1459465162.1459465162.1; __utmz=43838368.1459465162.1.1.utmcsr=t.co|utmccn=(referral)|utmcmd=referral|utmcct=/Hs5aCk1uqi; lang=en; twitter_ads_id=v1_716178566988214277; external_referer=padhuUp37zjgzgv1mFWxJ12Ozwit7owX|0; _ga=GA1.2.234037132.1438633240; _gat=1; _twitter_sess=BAh7CSIKZmxhc2hJQzonQWN0aW9uQ29udHJvbGxlcjo6Rmxhc2g6OkZsYXNo%250ASGFzaHsABjoKQHVzZWR7ADoPY3JlYXRlZF9hdGwrCJXH09NTAToMY3NyZl9p%250AZCIlNWIxYjdjZjJjY2FmMGQ3Yzc0ZDE4MmNlMmU1OTA1ODE6B2lkIiVkOGIz%250AY2UwNWVjMTczYzUxMzUwYzc5ZGEzMTU2YmI4Yg%253D%253D--516ce1de0aff3a3c1e0181febb7482f25f4aa224; ua=\"f5,m2,m5,rweb,msw\"' --compressed -o a.txt"
      os.system(next_curl_str)
      time.sleep(1)
      

Example n. 17
    def __init__(self, filename):
        self.parser = MyHTMLParser()

        # Read the file as text and drop newlines so the parser sees one long string
        with open(filename, "r") as f:
            self.HTMLdata = f.read().replace('\n', '')

import time
import os
import json
from htmlparser import MyHTMLParser

for tag in [
        'INTJ', 'INTP', 'ISFJ', 'ISFP', 'ISTJ', 'ISTP', 'ENFJ', 'ENFP', 'ENTJ',
        'ENTP', 'ESFJ', 'ESFP', 'ESTJ', 'INFJ', 'INFP', 'ESTP'
]:
    ts = 2612
    curl_str = "curl 'https://twitter.com/i/search/timeline?f=tweets&vertical=default&q=%23" + tag + "%20lang%3Aar&src=typd&include_available_features=1&include_entities=1&last_note_ts=2516&max_position=TWEET-711340624943906817-718124708412203008-BD1UO2FFu9QAAAAAAAAETAAAAAcAAAASAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA&reset_error_state=false' -H 'accept-encoding: gzip, deflate, sdch' -H 'x-requested-with: XMLHttpRequest' -H 'accept-language: en-US,en;q=0.8' -H 'user-agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36' -H 'accept: application/json, text/javascript, */*; q=0.01' -H 'referer: https://twitter.com/search?f=tweets&vertical=default&q=%23ISFP%20lang%3Aar&src=typd' -H 'authority: twitter.com' -H 'cookie: guest_id=v1%3A143863298058930111; mp_c3de24deb6a3f73fba73a616bb625130_mixpanel=%7B%22distinct_id%22%3A%20%227851b24a2bf2c7756fe7a387d4a02f3c71ef869d436e1e29099f9a08eefb812c%22%2C%22isAdmin%22%3A%20false%2C%22isAccountSpending%22%3A%20false%2C%22serviceLevel%22%3A%20%22null%22%2C%22goalBased%22%3A%20true%7D; eu_cn=1; kdt=6CB6J5Euwi4vSd6f87cx1xUNoW5QFzRSRJalTKFv; remember_checked_on=1; auth_token=43da469332b0c6afff154a20904cf4b538412d9b; pid=\"v3:1458750681895591692855842\"; __utma=43838368.234037132.1438633240.1459465162.1459465162.1; __utmz=43838368.1459465162.1.1.utmcsr=t.co|utmccn=(referral)|utmcmd=referral|utmcct=/Hs5aCk1uqi; lang=en; twitter_ads_id=v1_716178566988214277; external_referer=padhuUp37zjgzgv1mFWxJ12Ozwit7owX|0; _ga=GA1.2.234037132.1438633240; _gat=1; _twitter_sess=BAh7CSIKZmxhc2hJQzonQWN0aW9uQ29udHJvbGxlcjo6Rmxhc2g6OkZsYXNo%250ASGFzaHsABjoKQHVzZWR7ADoPY3JlYXRlZF9hdGwrCJXH09NTAToMY3NyZl9p%250AZCIlNWIxYjdjZjJjY2FmMGQ3Yzc0ZDE4MmNlMmU1OTA1ODE6B2lkIiVkOGIz%250AY2UwNWVjMTczYzUxMzUwYzc5ZGEzMTU2YmI4Yg%253D%253D--516ce1de0aff3a3c1e0181febb7482f25f4aa224; ua=\"f5,m2,m5,rweb,msw\"' --compressed -o a.txt"
    os.system(curl_str)
    c = 0
    while True:
        c += 1
        with open('a.txt') as f:
            out = json.loads(f.read())
            last = out["inner"]["min_position"]
            html = out["inner"]["items_html"]
            if len(html.replace('\n', '')) == 0 or c > 1000:
                break
            #print html
            with open(tag + '.txt', 'ab') as file:
                parser = MyHTMLParser(file)
                parser.feed(html)
            ts += 1
            next_curl_str = "curl 'https://twitter.com/i/search/timeline?f=tweets&vertical=default&q=%23ISFP%20lang%3Aar&src=typd&include_available_features=1&include_entities=1&last_note_ts=" + str(
                ts
            ) + "&max_position=" + last + "&reset_error_state=false' -H 'accept-encoding: gzip, deflate, sdch' -H 'x-requested-with: XMLHttpRequest' -H 'accept-language: en-US,en;q=0.8' -H 'user-agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36' -H 'accept: application/json, text/javascript, */*; q=0.01' -H 'referer: https://twitter.com/search?f=tweets&vertical=default&q=%23ISFP%20lang%3Aar&src=typd' -H 'authority: twitter.com' -H 'cookie: guest_id=v1%3A143863298058930111; mp_c3de24deb6a3f73fba73a616bb625130_mixpanel=%7B%22distinct_id%22%3A%20%227851b24a2bf2c7756fe7a387d4a02f3c71ef869d436e1e29099f9a08eefb812c%22%2C%22isAdmin%22%3A%20false%2C%22isAccountSpending%22%3A%20false%2C%22serviceLevel%22%3A%20%22null%22%2C%22goalBased%22%3A%20true%7D; eu_cn=1; kdt=6CB6J5Euwi4vSd6f87cx1xUNoW5QFzRSRJalTKFv; remember_checked_on=1; auth_token=43da469332b0c6afff154a20904cf4b538412d9b; pid=\"v3:1458750681895591692855842\"; __utma=43838368.234037132.1438633240.1459465162.1459465162.1; __utmz=43838368.1459465162.1.1.utmcsr=t.co|utmccn=(referral)|utmcmd=referral|utmcct=/Hs5aCk1uqi; lang=en; twitter_ads_id=v1_716178566988214277; external_referer=padhuUp37zjgzgv1mFWxJ12Ozwit7owX|0; _ga=GA1.2.234037132.1438633240; _gat=1; _twitter_sess=BAh7CSIKZmxhc2hJQzonQWN0aW9uQ29udHJvbGxlcjo6Rmxhc2g6OkZsYXNo%250ASGFzaHsABjoKQHVzZWR7ADoPY3JlYXRlZF9hdGwrCJXH09NTAToMY3NyZl9p%250AZCIlNWIxYjdjZjJjY2FmMGQ3Yzc0ZDE4MmNlMmU1OTA1ODE6B2lkIiVkOGIz%250AY2UwNWVjMTczYzUxMzUwYzc5ZGEzMTU2YmI4Yg%253D%253D--516ce1de0aff3a3c1e0181febb7482f25f4aa224; ua=\"f5,m2,m5,rweb,msw\"' --compressed -o a.txt"
            os.system(next_curl_str)
            time.sleep(1)
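The script passes an open file handle to MyHTMLParser and expects the parser to write the extracted tweet text into it. A minimal sketch under that assumption (the 'tweet-text' class filter reflects the Twitter markup of that era and is a guess; since the file is opened in binary append mode, the text is encoded before writing):

from html.parser import HTMLParser


class MyHTMLParser(HTMLParser):
    """Sketch: write the text of tweet paragraphs to the supplied file handle."""

    def __init__(self, outfile):
        super().__init__(convert_charrefs=True)
        self.outfile = outfile
        self._in_tweet_text = False

    def handle_starttag(self, tag, attrs):
        classes = dict(attrs).get('class') or ''
        if tag == 'p' and 'tweet-text' in classes:
            self._in_tweet_text = True

    def handle_endtag(self, tag):
        if tag == 'p':
            self._in_tweet_text = False

    def handle_data(self, data):
        if self._in_tweet_text and data.strip():
            # The caller opens the file in 'ab' mode, so encode before writing.
            self.outfile.write((data.strip() + '\n').encode('utf-8'))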