def reset(self):
     """Reset the base parser and clear this parser's own state."""
     HTMLParser.reset(self)
     # Both position flags start out False.
     self.in_feed = self.in_entry = False
     self.curr_data = ''
     self.feed_entries = []
 def reset(self):
     """Reset the base parser, then rebuild this parser's private state."""
     HTMLParser.reset(self)
     # Fresh input buffer and document node.
     self.__input = LineNumberedBuffer()
     self.__currentNode = Document()
     # Output stack starts with a single False entry.
     self.__shouldOutputStack = [False]
     self.__unsupportedElementDepth = 0
     self.__unsupportedBlockStart = None
 def reset(self):
     """Reset the base parser and clear the town list, area string and tag flags."""
     HTMLParser.reset(self)
     self.towns = []
     self.pcarea = ''
     # All three tracking flags start out False.
     for flag in ('towntable', 'areatag', 'towntag'):
         setattr(self, flag, False)
Beispiel #4
0
 def reset(self):
     """Reset the base parser, create a fresh TarifHolder and clear the other fields."""
     HTMLParser.reset(self)
     self.result = TarifHolder()
     # Everything except the result holder starts out unset.
     self.resultKey = self.reconnect = self.secretKey = self.method = None
Beispiel #5
0
	def getFriends(self):
		print "get friends of %s"%(self.personid)
		for i in range(10,50000,10):
			print "%d,%s"%(i,"\n" if i%100==0 else ""),
			
			for try_i in range(10):
				try:
					res=self.browser.open("http://m.facebook.com/friends/?id=%s&q&f=%d&refid=5"%(self.personid,i),timeout=5)
					restext=res.read();
					break
				except urllib2.URLError:
					print "url error retry=%d"%(try_i)
				except socket.timeout:
					print "connection problems retry=%d"%(try_i)
				#except urllib2.HTTPError as e:
				#	print "xxxxx"
				#	if e.code == 404:
				#		print "page not found -> skip!"
				#		break
						
			self.feed(restext)
			HTMLParser.reset(self)
			if "See More Friends" not in restext:
				break
		print "\n%d friends found!"%(len(self.friendslist))
		return self.friendslist
Beispiel #6
0
 def reset(self):
     """Reset the base parser, the tag set and every record field.

     String fields are restored to the "NULL" placeholder, the line counter
     to 0 and the content buffer to the empty string.
     """
     HTMLParser.reset(self)
     self.tags = set((
         "data", "query", "time_period", "concept", "number_of_result",
         "resource", "document", "link", "title", "abstract", "inventor",
         "assignee", "filed",
     ))
     # Fields that use the "NULL" placeholder string.
     for field in (
             "filed_tag", "assignee_tag", "current_tag", "end_tag",
             "document", "full_text", "publication_date", "authors",
             "abstract", "title", "subject", "link", "concept",
             "file_name", "companies", "patent_no"):
         setattr(self, field, "NULL")
     self.line_num = 0
     self.content = ""
Beispiel #7
0
    def reset(self):
        """Reset the base parser and clear the path-search state."""
        HTMLParser.reset(self)

        # Tag stack maintained while the document is being parsed,
        # plus a count of the items on it.
        self.stack = []
        self.stack_index = 0

        # (tag, condition) tuples to search for; each condition is a
        # list of (attribute, value) tuples.
        self.path = []
        # Index of the next path item to be searched.
        self.path_index = 0
        # Depth at which each path tag was found; used to match
        # closing tags.
        self.found_path_depth = []

        # found_target: publish data items seen once the path has been found.
        # quit_on_done: quit once processing has completed.
        self.found_target = self.quit_on_done = False
 def reset(self):
     """Initialise the post-collection state, then reset the base parser."""
     self.http_prefix = 'http:'
     self.posts = []
     self.add_flag = False
     # Both depth counters start at zero.
     self.div_depth = self.look_depth = 0
     HTMLParser.reset(self)
Beispiel #9
0
 def reset(self):
     """Reset the instance, discarding all data.

     This is called implicitly at instantiation time. Attributes provided
     by the set_* methods are preserved.
     """
     HTMLParser.reset(self)
     self.__text, self.__open_tags = [], []
 def reset(self):
     """Reset the base parser and clear this parser's own state."""
     HTMLParser.reset(self)
     self.feed_entries = []
     self.curr_data = ''
     # Both position flags start out False.
     for flag in ('in_feed', 'in_entry'):
         setattr(self, flag, False)
Beispiel #11
0
    def reset(self):
        """Reset the base parser and clear the path-search state."""
        HTMLParser.reset(self)

        # stack: tag stack used while the document is being parsed
        # path: (tag, condition) tuples to search for, where a condition
        #       is a list of (attribute, value) tuples
        # found_path_depth: depth at which each path tag was found, used
        #       for matching closing tags
        self.stack, self.path, self.found_path_depth = [], [], []

        # path_index: next item in the path to be searched
        # stack_index: count of the items in the stack
        self.path_index = self.stack_index = 0

        # Publish data items found after the path has been found.
        self.found_target = False
        # Quit once the processing has been completed.
        self.quit_on_done = False
Beispiel #12
0
 def reset(self):
     """Reset the base parser and re-create the private conversion state."""
     HTMLParser.reset(self)
     # Output stack starts with a single False entry.
     self.__shouldOutputStack = [False]
     self.__unsupportedElementDepth, self.__unsupportedBlockStart = 0, None
     # New buffer and document node.
     self.__input = LineNumberedBuffer()
     self.__currentNode = Document()
 def _reset(self):
     HTMLParser.reset(self)
     self.currentIndentLevel = 0
     self.parsedData = []
     self._inTag = []
     self.root = None
     self.doctype = None
     self.inPreformatted = 0
Beispiel #14
0
    def reset(self):
        """Reset the base parser and empty the token and parameter lists."""
        HTMLParser.reset(self)
        self._tokens, self._params = [], []
Beispiel #15
0
 def reset(self):
     """Reset the base parser and clear the extracted fields."""
     HTMLParser.reset(self)
     self._in_h3 = False
     # Every extracted value starts out unset.
     for field in ('number', 'title', 'img_name', 'img_type', 'message'):
         setattr(self, field, None)
Beispiel #16
0
 def reset(self):
     """
     Reset the parser.

     Must be called if the parser is used for more than one HTML page.
     """
     HTMLParser.reset(self)
     # Start with one empty row in the data list.
     first_row = []
     self.data = [first_row]
     self.record = False
Beispiel #17
0
	def reset(self):
		"""Clear the output and parse-tracking state, then reset the base parser."""
		self.__output = self._data = ''
		self.__parse = self.__parse_tag = None
		self.__parse_depth = 0

		HTMLParser.reset(self)
 def _reset(self):
     HTMLParser.reset(self)
     self.currentIndentLevel = 0
     self.parsedData = []
     self.inTag = []
     self.root = None
     self.doctype = None
     self.inPreformatted = 0
Beispiel #19
0
 def reset(self):
     """
     Reset the parser.

     Must be called if the parser is used for more than one HTML page.
     """
     HTMLParser.reset(self)
     self.record, self.expirations = False, []
Beispiel #20
0
    def reset(self):
        """Reset the base parser, clear the output state and rebuild the handler list."""
        HTMLParser.reset(self)
        self.result = ""
        self.list_stack, self.verbatim = [], 0

        self.links = []
        # One LinkHandler followed by one ImageHandler, created fresh.
        link_handler = LinkHandler()
        image_handler = ImageHandler()
        self.mime_handlers = [link_handler, image_handler]
Beispiel #21
0
 def reset(self):
     """Reset the base parser, create a fresh AlbumData and clear the tracking state."""
     HTMLParser.reset(self)
     self.dat = AlbumData()
     self.lastTagOpen = False
     self.blockStack, self.tagStack = [], []
     self.profileTag = self.column = ""
Beispiel #22
0
    def reset(self):
        """Reset the superclass parser, then empty the token and parameter lists."""
        HTMLParser.reset(self)

        for list_name in ('_tokens', '_params'):
            setattr(self, list_name, [])
Beispiel #23
0
    def reset(self):
        """Reset the base parser, clear the output state and rebuild the handler list."""
        HTMLParser.reset(self)
        self.result, self.list_stack, self.verbatim = "", [], 0

        self.links = []
        # Handlers are created fresh: LinkHandler first, then ImageHandler.
        handlers = [LinkHandler()]
        handlers.append(ImageHandler())
        self.mime_handlers = handlers
Beispiel #24
0
 def reset(self):
     """Reset the base parser and clear the recording state."""
     HTMLParser.reset(self)
     self.recording = False
     # -1 is the initial value for both the current time and the current day.
     self.curtime = self.curday = -1
     self.curdata = []
     self.startDate = self.initWeek = None
Beispiel #25
0
    def _reset(self):
        '''
            _reset - reset this object. Assigned to .reset after __init__ call.
        '''
        HTMLParser.reset(self)

        self.root = None
        self.doctype = None
        self._inTag = []
 def reset(self):
     """
     Reset the base parser and make our data ready.
     """
     HTMLParser.reset(self)
     self.result = []
     self.inGap = False
     # Both text buffers start out empty.
     self.lastGap = self.lastText = ''
Beispiel #27
0
 def reset(self):
     """Reset the base parser and clear the paragraph-collection state."""
     HTMLParser.reset(self)
     self.paragraphs = []
     # All position/content trackers start out unset.
     for tracker in ('in_paragraph', 'last_href', 'href_content',
                     'in_table', 'cell_content'):
         setattr(self, tracker, None)
     self.list_type = []
Beispiel #28
0
 def reset(self):
     """Reset the base parser, empty the data lists and point ``current`` at ``coveldata``."""
     HTMLParser.reset(self)
     self.text = []
     self.coveldata, self.denevedata, self.feastdata = [], [], []
     # ``current`` aliases the covel list (same object, not a copy).
     self.current = self.coveldata
     self.isinTag = self.amp = False
Beispiel #29
0
 def reset(self):
     """
     Reset the base parser and make our data ready.
     """
     HTMLParser.reset(self)
     self.result, self.inGap = [], False
     for buffer_name in ('lastGap', 'lastText'):
         setattr(self, buffer_name, '')
    def reset(self):
        """Clear the collected e-mail list, name and flags, then reset the base parser."""
        self.email = []

        self.name = ""
        # All four state flags start out False.
        for flag in ('isPersonName', 'isDivInfo', 'isDivInfoTag', 'isEmailInfo'):
            setattr(self, flag, False)
        HTMLParser.reset(self)
Beispiel #31
0
 def reset(self):
     """Reset the base parser and clear the paragraph-collection state."""
     HTMLParser.reset(self)
     self.paragraphs = []
     self.in_paragraph = self.last_href = self.href_content = None
     self.in_table = self.cell_content = None
     self.list_type = []
Beispiel #32
0
    def reset(self, query):
        """Rebuild the query pattern from *query* and reset the parser state.

        query -- iterable of words; each word is passed through escape() and
                 the results are joined with '|' into one case-insensitive
                 regular expression with a named group "query".
        """
        alternation = '|'.join([escape(term) for term in query])
        self.Q_PATTERN = re.compile(r'(?P<query>%s)' % alternation, re.I)
        self.highlight = 0
        self.pieces, self.node_text = [], []
        HTMLParser.reset(self)
Beispiel #33
0
	def reset(self,query):
		"""Rebuild the query pattern from *query* and reset the parser state.

		query -- iterable of words; each word is passed through escape() and
		         the results are joined with '|' into one case-insensitive
		         regular expression with a named group "query".
		"""
		escaped_words = map(escape, query)
		pattern_source = r'(?P<query>' + '|'.join(escaped_words) + r')'
		self.Q_PATTERN = re.compile(pattern_source, re.I)
		self.highlight = 0
		self.pieces = []
		self.node_text = []
		HTMLParser.reset(self)
Beispiel #34
0
    def _reset(self):
        '''
            _reset - reset this object. Assigned to .reset after __init__ call.
        '''
        HTMLParser.reset(self)

        self.root = None
        self.doctype = None
        self._inTag = []
	def reset(self):
		"""Reset the base parser and clear the recording flags, counters and link lists."""
		HTMLParser.reset(self)
		self.Record = self.RecordSub = self.RecordLink = False
		self.Depth = self.Subscribers = 0
		self.Links, self.LinkNames = [], []
Beispiel #36
0
 def reset(self):
   """Reset the base parser, clear the extraction flags and empty the st_* lists."""
   HTMLParser.reset(self)
   for flag in ('extracting', 'extract_date', 'getdetails', 'getlast'):
     setattr(self, flag, False)
   # Each list gets its own fresh object.
   self.st_value, self.st_date, self.st_id, self.st_title = [], [], [], []
Beispiel #37
0
	def getFriends(self):
		for i in range(ord('A'),ord('Z')+1):
			start=chr(i)
			end=start
			print "get own friends start=%s end=%s"%(start,end)
			res=self.browser.open("http://m.facebook.com/friends.php?pa&start=%s&end=%s&refid=5&ref=mfl"%(start,end))
			self.feed(res.read())
			HTMLParser.reset(self)
		print "%d own friends found!"%(len(self.friendslist))
		return self.friendslist
Beispiel #38
0
    def reset(self):
        """Reset the base parser and restore the text-buffer state to its defaults."""
        HTMLParser.reset(self)

        self._buffer, self._tag_stack = [], []
        self._remove_begining_ws = True
        self._preserve, self._last_text_idx = 0, -1

        self._backup = []
 def reset(self):
     """Reset the base parser and clear the token and annotation state."""
     # The base class is called directly rather than through super().
     HTMLParser.reset(self)
     self.stack = []
     # Missing keys yield a new empty list.
     self.annotated_data = defaultdict(list)
     self.tokens = []
     self.first_pass = True
     self.tag_types = {}
     self.tag_idx = self.token_idx = 0
 def reset(self):
     """Reset the base parser and clear the token and annotation state."""
     # The base class is called directly rather than through super().
     HTMLParser.reset(self)
     self.stack, self.annotated_data, self.tokens = [], defaultdict(list), []
     self.first_pass = True
     self.tag_types = dict()
     # Both running indices start at zero.
     for counter in ('tag_idx', 'token_idx'):
         setattr(self, counter, 0)
 def reset(self):
     """Reset the base parser and clear the block/verse tracking state."""
     HTMLParser.reset(self)
     self.depth = 0
     # All position markers, recorded depths and the verse number start unset.
     for marker in ('in_block', 'in_verse', 'in_verse_content',
                    'block_depth', 'verse_depth', 'content_depth',
                    'verse_num'):
         setattr(self, marker, None)
     self.content_parts = []
Beispiel #42
0
 def reset(self):
     """Reset the base parser and clear the collected fields and get* counters."""
     HTMLParser.reset(self)
     self.uid, self.imgs = [], []
     # Text fields start out as empty strings.
     self.album = self.info = self.intro = self.artist = ""
     # The get* values start at zero.
     self.getIntro = self.getTitle = self.getInfo = 0
Beispiel #43
0
 def reset(self):
     """Reset the base parser and clear the collected fields and get* counters."""
     HTMLParser.reset(self)
     self.uid = []
     self.imgs = []
     for text_field in ('album', 'info', 'intro', 'artist'):
         setattr(self, text_field, "")
     for counter in ('getIntro', 'getTitle', 'getInfo'):
         setattr(self, counter, 0)
Beispiel #44
0
 def reset(self):
     """
     Reset the base parser and initialize the per-document parser state.
     """
     HTMLParser.reset(self)
     self.curr_doc, self.curr_attrs, self.curr_val = None, {}, ''
     self.in_head = self.in_body = False
     # parse_body is the only flag that defaults to True.
     self.parse_body = True
Beispiel #45
0
 def reset(self):
   """Reset the base parser and clear the table-extraction state."""
   HTMLParser.reset(self)
   self._elemStack, self._nextUrls, self._tables = [], [], []
   self._table_id = ""
   self._table = None
   self._cell = ""
   # All three position flags start out False.
   self._intable = self._inrow = self._incell = False
Beispiel #46
0
 def reset(self):
     """Reset the base parser and clear the table-extraction state."""
     HTMLParser.reset(self)
     for list_name in ('_elemStack', '_nextUrls', '_tables'):
         setattr(self, list_name, [])
     self._table_id, self._table, self._cell = "", None, ""
     for flag in ('_intable', '_inrow', '_incell'):
         setattr(self, flag, False)
 def reset(self):
     """
     Reset the base parser and initialize the per-document parser state.
     """
     HTMLParser.reset(self)
     self.curr_doc = None
     self.curr_attrs = dict()
     self.curr_val = ''
     for flag, initial in (('in_head', False), ('in_body', False),
                           ('parse_body', True)):
         setattr(self, flag, initial)
    def reset(self):
        """Empty the word and character sequences, clear the current-tag state, then reset the base parser."""
        self.__word_seq, self.__word_tag_seq = [], []

        self.__chr_seq, self.__chr_tag_seq, self.__chr_space_seq = [], [], []

        self.__curr_bio = self.__curr_tag = self.__curr_attrs = None
        HTMLParser.reset(self)
Beispiel #49
0
 def reset(self):
     """Reset the base parser and clear the per-document flags and result lists.

     After the call every ``*_flag`` attribute is False and ``records_list``,
     ``topics_list`` and ``places_list`` are new empty lists.
     """
     HTMLParser.reset(self)
     self.topics_flag = False
     # This line was indented with a space and a tab, which does not match
     # the surrounding 5-space indentation and stopped the method compiling.
     self.places_flag = False
     self.title_flag = False
     self.body_flag = False
     self.topic_d_flag = False
     self.place_d_flag = False
     # List of records: each record represents an article
     self.records_list = []
     self.topics_list = []
     self.places_list = []
Beispiel #50
0
    def reset(self):
        '''Resets the parser and seeds the tag stack with a fake root entry.'''

        HTMLParser.reset(self)

        # Fake root tag: it has no name and uses the root handlers.
        fake_root = dict(
            name=None,
            new_tag_handler=self.handle_root,
            data_handler=self.handle_root_data,
            end_tag_handler=self.handle_root_end
        )
        self.__tag_stack = [fake_root]
Beispiel #51
0
    def reset(self):
        """Empty the word and character sequences, clear the current-tag state, then reset the base parser."""
        # Word-level sequences.
        self.__word_seq = []
        self.__word_tag_seq = []

        # Character-level sequences.
        self.__chr_seq = []
        self.__chr_tag_seq = []
        self.__chr_space_seq = []

        # Nothing is current after a reset.
        self.__curr_bio, self.__curr_tag, self.__curr_attrs = None, None, None
        HTMLParser.reset(self)
Beispiel #52
0
    def reset(self):
        """Reset the base parser and clear the per-document flags and result lists."""

        HTMLParser.reset(self)
        for flag in ('topics_flag', 'places_flag', 'title_flag',
                     'body_flag', 'topic_d_flag', 'place_d_flag'):
            setattr(self, flag, False)
        # records_list holds the records; each record represents an article.
        self.records_list, self.topics_list, self.places_list = [], [], []
 def reset(self):
     """Clear the collected text, section flags and counters, then reset the base parser."""
     self.fed, self.fed_in_section = [], []
     self.fed_text = None
     # section_name is initialised to False, like the other flags.
     for flag in ('section_found', 'section_name', 'lead_found',
                  'tracking_link', 'tracking_see_also', 'in_section'):
         setattr(self, flag, False)
     self.table_counter = self.navbox_counter = 0
     # paragraph_found / paragraph_counter are currently not initialised here.
     HTMLParser.reset(self)
Beispiel #54
0
 def reset(self):
     """Reset the base parser and clear the image list, element flags and reference state."""
     HTMLParser.reset(self)
     self.images = []
     #TODO Alignment not working.
     # One boolean per flag name below; all start out False.
     for flag in ('h1', 'h2', 'li', 'p', 'a', 'ul', 'ol', 'span',
                  'reference', 'sup'):
         setattr(self, flag, False)
     self.ref_counter = 0
     self.buffer = None
Beispiel #55
0
	def reset(self):
		"""Reset the base parser, the tag set and every record field.

		String fields are restored to the "NULL" placeholder, the content
		buffer to the empty string and both counters to 0.
		"""
		HTMLParser.reset(self)
		self.tags = set((
			"publisher", "fulltext", "document", "pub_title", "full_text",
			"pub_date", "author", "abstract", "title", "document_url",
			"pq_doc_id", "database", "query", "document_id", "source",
			"script", "p",
		))
		# Fields that use the "NULL" placeholder string.
		for field in (
				"current_tag", "end_tag", "document_id", "publisher",
				"pub_title", "document_url", "abstract", "title",
				"database", "query", "pub_date", "source"):
			setattr(self, field, "NULL")
		self.content = ""
		self.counter = self.p_tag = 0
Beispiel #56
0
    def feed(self, data, did):
        """Parse *data* as HTML, index its tokens under *did*, then reset.

        data -- HTML text handed to ``HTMLParser.feed``.
        did  -- document identifier passed to ``self.ht.put_data`` and used
                as the key into ``term_count``.

        Parsing is best effort: an ordinary parse error is discarded and
        whatever ended up in ``self.text`` is still tokenized (``self.text``
        is presumably filled by this class's handlers -- verify). The number
        of tokens stored is written to ``term_count[did]``. The per-document
        state (``self.text``, ``self.N``, base parser) is always cleared
        afterwards, even if tokenizing raises.
        """
        #extracting html text
        try:
            HTMLParser.feed(self, data)
        except Exception:
            # Only ordinary errors are swallowed; KeyboardInterrupt and
            # SystemExit now propagate instead of being eaten by a bare except.
            if hasattr(sys, 'exc_clear'):
                # exc_clear only exists on Python 2.
                sys.exc_clear()

        try:
            #tokenizing extracted data
            for line in self.text.splitlines():
                tokens = parse(line, self.config)
                if not tokens:
                    continue
                for token in tokens:
                    self.ht.put_data(token, did)
                    self.N += 1
            term_count[did] = self.N
        finally:
            #cleaning for next feed
            self.text = ''
            self.N = 0
            HTMLParser.reset(self)
Beispiel #57
0
 def reset(self):
     """Reset the base parser and restore all post fields, counters and tag markers to their defaults."""
     HTMLParser.reset(self)
     # Text fields start out empty.
     for text_field in ('content', 'signature', 'reference', 'picture',
                        'title', 'author', 'date'):
         setattr(self, text_field, '')
     # Counters, ids and tag markers start at zero.
     for numeric_field in ('btag', 'count', 'acount', 'hasTag', 'topid',
                           'aid', 'space', 'contentTag', 'signTag',
                           'refTag', 'pcount'):
         setattr(self, numeric_field, 0)
Beispiel #58
0
    def feed(self, data, did):
        """Parse *data* as HTML, count its tokens and adjacent token pairs, then reset.

        data -- HTML text handed to ``HTMLParser.feed``.
        did  -- document identifier; accepted for interface compatibility but
                not used by this method.

        Parsing is best effort: an ordinary parse error is discarded and
        whatever ended up in ``self.text`` is still tokenized (``self.text``
        is presumably filled by this class's handlers -- verify). For each
        line, every token increments ``self.toks[token]`` and every adjacent
        pair increments ``self.ht[(previous, token)]`` (both are presumably
        defaultdict/Counter-like -- verify). The per-document state
        (``self.text``, ``self.N``, base parser) is always cleared
        afterwards, even if tokenizing raises.
        """
        #extracting html text
        try:
            HTMLParser.feed(self, data)
        except Exception:
            # Only ordinary errors are swallowed; KeyboardInterrupt and
            # SystemExit now propagate instead of being eaten by a bare except.
            if hasattr(sys, 'exc_clear'):
                # exc_clear only exists on Python 2.
                sys.exc_clear()

        try:
            #tokenizing extracted data
            for line in self.text.splitlines():
                tokens = parse(line, self.config)
                if not tokens:
                    continue
                previous = None
                for position, token in enumerate(tokens):
                    self.toks[token] += 1
                    if position > 0:
                        # Pairs are counted within a line only.
                        self.ht[(previous, token)] += 1
                        self.N += 1
                    previous = token
        finally:
            #cleaning for next feed
            self.text = ''
            self.N = 0
            HTMLParser.reset(self)