Python ParserHelper Examples

Programming Language: Python

Namespace/Package Name: parserhelper

Class/Type: ParserHelper

Examples at hotexamples.com: 12

Python ParserHelper - 12 examples found. These are the top-rated real-world Python examples of parserhelper.ParserHelper, extracted from open source projects. You can rate the examples to help us improve their quality.

Frequently Used Methods

Show Hide

match_first_pattern(6)

extract_tag_text(4)

ParserHelper(3)

clean_data(2)

rectext(2)

date_format_helper(1)

Example #1

Show file

File: tagparser.py Project: bmiller1009/linkedin-miner

    def _get_url(self):
        """Build an absolute URL from the first ``self._REGEX`` match in ``self._html``.

        The relative path is the slice of the matched tag that lies between
        the ``href='/`` marker and the ``trk=`` marker; it is passed through
        ``ph.clean_data`` and appended to ``LINKEDIN_URL``.

        Raises ``ValueError`` (from ``str.index``) when either marker is
        absent from the matched tag.
        """
        anchor = ph.match_first_pattern(self._html, self._REGEX)

        # Skip past the marker itself so the slice starts right after "href='/".
        href_marker = "href='/"
        path_begin = anchor.index(href_marker) + len(href_marker)
        path_end = anchor.index("trk=")

        # TODO (carried over from the original author): clean this up
        return LINKEDIN_URL + ph.clean_data(anchor[path_begin:path_end])

Example #2

Show file

File: parser.py Project: patphy/pubmed-patent-parser

 def body(self):
     """Return the concatenated text of every <body> element in ``self.dom``.

     author : ajbharani
     Path walked: article -> body -> sec(recursive) -> p

     A fresh ParserHelper is used per <body> element; its ``rectext`` is
     called with the element and the tag name 'p', and whatever it leaves
     in ``rtext`` is appended to the result.  Returns '' when the document
     has no <body> element.
     """
     chunks = []
     for body_node in self.dom.getElementsByTagName('body'):
         helper = ParserHelper()
         helper.rectext(body_node, 'p')
         chunks.append(helper.rtext)
     return ''.join(chunks)

Example #3

Show file

File: parser.py Project: patphy/pubmed-patent-parser

 def abstract(self):
     """Return the concatenated text of every <abstract> element in ``self.dom``.

     author : ajbharani
     Path walked: article -> front -> abstract

     A fresh ParserHelper is used per <abstract> element; its ``rectext``
     is called with the element and the tag name 'p', and whatever it
     leaves in ``rtext`` is appended to the result.  Returns '' when the
     document has no <abstract> element.
     """
     chunks = []
     for abstract_node in self.dom.getElementsByTagName('abstract'):
         helper = ParserHelper()
         helper.rectext(abstract_node, 'p')
         chunks.append(helper.rtext)
     return ''.join(chunks)

Example #4

Show file

File: parser.py Project: srajasekar/pubmed-patent-parser

	def body(self):
		"""Return the concatenated text of every <body> element in ``self.dom``.

		author : ajbharani
		Path walked: article -> body -> sec(recursive) -> p

		A fresh ParserHelper is used per <body> element; its ``rectext`` is
		called with the element and the tag name 'p', and whatever it leaves
		in ``rtext`` is appended to the result.  Returns '' when the document
		has no <body> element.
		"""
		chunks = []
		for body_node in self.dom.getElementsByTagName('body'):
			helper = ParserHelper()
			helper.rectext(body_node, 'p')
			chunks.append(helper.rtext)
		return ''.join(chunks)

Example #5

Show file

File: parser.py Project: srajasekar/pubmed-patent-parser

	def abstract(self):
		"""Return the concatenated text of every <abstract> element in ``self.dom``.

		author : ajbharani
		Path walked: article -> front -> abstract

		A fresh ParserHelper is used per <abstract> element; its ``rectext``
		is called with the element and the tag name 'p', and whatever it
		leaves in ``rtext`` is appended to the result.  Returns '' when the
		document has no <abstract> element.
		"""
		chunks = []
		for abstract_node in self.dom.getElementsByTagName('abstract'):
			helper = ParserHelper()
			helper.rectext(abstract_node, 'p')
			chunks.append(helper.rtext)
		return ''.join(chunks)

Example #6

Show file

File: parser.py Project: patphy/pubmed-patent-parser

 def pubdates(self):
     """Collect one entry per <pub-date> found under front/article-meta.

     author : saranya
     Path walked: article -> front -> article-meta -> pub-date

     Returns a list of dicts shaped like
     {'pub-type': 'val', 'pub-date': 'yyyy-mm-dd'}.
     The year, month and day default to '1900', '01' and '01' when the
     corresponding child element is missing or has no first child.  Month
     and day values are passed through ParserHelper.date_format_helper;
     the year is used as read.
     """
     helper = ParserHelper()
     collected = []
     for front in self.dom.getElementsByTagName('front'):
         for meta in front.childNodes:
             if meta.nodeName != 'article-meta':
                 continue
             for pub_date in meta.childNodes:
                 if pub_date.nodeName != 'pub-date':
                     continue
                 year, month, day = '1900', '01', '01'
                 entry = {'pub-type': pub_date.getAttribute('pub-type')}
                 for part in pub_date.childNodes:
                     kind = part.nodeName
                     if kind == 'month':
                         try:
                             month = part.firstChild.data
                         except AttributeError:
                             # Empty element: firstChild is None.
                             month = '01'
                         month = helper.date_format_helper(month)
                     elif kind == 'day':
                         try:
                             day = part.firstChild.data
                         except AttributeError:
                             day = '01'
                         day = helper.date_format_helper(day)
                     elif kind == 'year':
                         try:
                             year = part.firstChild.data
                         except AttributeError:
                             year = '1900'
                 entry['pub-date'] = '-'.join((year, month, day))
                 collected.append(entry)
     return collected

Example #7

Show file

File: parser.py Project: srajasekar/pubmed-patent-parser

	def pubdates(self):
		"""Collect one entry per <pub-date> found under front/article-meta.

		author : saranya
		Path walked: article -> front -> article-meta -> pub-date

		Returns a list of dicts shaped like
		{'pub-type': 'val', 'pub-date': 'yyyy-mm-dd'}.
		The year, month and day default to '1900', '01' and '01' when the
		corresponding child element is missing or has no first child.  Month
		and day values are passed through ParserHelper.date_format_helper;
		the year is used as read.
		"""
		helper = ParserHelper()
		collected = []
		for front in self.dom.getElementsByTagName('front'):
			for meta in front.childNodes:
				if meta.nodeName != 'article-meta':
					continue
				for pub_date in meta.childNodes:
					if pub_date.nodeName != 'pub-date':
						continue
					year, month, day = '1900', '01', '01'
					entry = {'pub-type': pub_date.getAttribute('pub-type')}
					for part in pub_date.childNodes:
						kind = part.nodeName
						if kind == 'month':
							try:
								month = part.firstChild.data
							except AttributeError:
								# Empty element: firstChild is None.
								month = '01'
							month = helper.date_format_helper(month)
						elif kind == 'day':
							try:
								day = part.firstChild.data
							except AttributeError:
								day = '01'
							day = helper.date_format_helper(day)
						elif kind == 'year':
							try:
								year = part.firstChild.data
							except AttributeError:
								year = '1900'
					entry['pub-date'] = '-'.join((year, month, day))
					collected.append(entry)
		return collected

Example #8

Show file

File: tagparser.py Project: bmiller1009/linkedin-miner

    def _profile_url(self):
        """Build the profile URL from the first profile anchor in ``self._html``.

        The first match of the profile-anchor pattern is obtained through
        ``ph.match_first_pattern``.  Two link forms are recognised:

        * a tag containing ``view?id``: the value sits between ``id=`` and
          ``&amp;authType``;
        * a tag containing ``viewProfile=&amp``: the value sits between
          ``key=`` and ``&amp;authToken``.

        The extracted value is passed through ``ph.clean_data`` and appended
        to ``<LINKEDIN_URL>profile/view?id=``.

        Raises:
            ValueError: if the tag matches neither link form, or if the
                expected delimiters are missing from the tag (``str.index``).
        """
        pattern = "<a href=(.*)/profile[^>]*>"
        url = "{0}profile/view?id=".format(LINKEDIN_URL)
        tag = ph.match_first_pattern(self._html, pattern)

        # str.find() returns -1 (truthy) on a miss and 0 (falsy) on a hit at
        # index 0, so it must not be used as a boolean.  Previously the first
        # branch was taken for practically every tag and the second branch
        # was unreachable in practice; membership tests express the intent.
        if "view?id" in tag:
            index_of_string = "id="
            title_string = "&amp;authType"
        elif "viewProfile=&amp" in tag:
            index_of_string = "key="
            title_string = "&amp;authToken"
        else:
            # Same exception type that the str.index() calls below raise, so
            # callers handling a malformed tag keep working.
            raise ValueError(
                "unrecognised profile link format: {0!r}".format(tag))

        # Start right after the key marker, stop at the auth delimiter.
        start_index = tag.index(index_of_string) + len(index_of_string)
        title_index = tag.index(title_string)

        return url + ph.clean_data(tag[start_index:title_index])

Example #9

Show file

File: tagparser.py Project: bmiller1009/linkedin-miner

 def _name(self):
     """Return the text of the first 'View profile' anchor in ``self._html``.

     The first match of the anchor pattern is obtained through
     ``ph.match_first_pattern`` and handed to ``ph.extract_tag_text``,
     whose result is returned unchanged.
     """
     view_profile_regex = " title='View profile'>.*</a>"
     matched = ph.match_first_pattern(self._html, view_profile_regex)
     return ph.extract_tag_text(matched)

Example #10

Show file

File: tagparser.py Project: bmiller1009/linkedin-miner

    def _get_description(self):
        """Return the tag text of the first ``self._REGEX`` match in ``self._html``.

        The match comes from ``ph.match_first_pattern``; the result of
        ``ph.extract_tag_text`` on that match is returned unchanged.
        """
        matched = ph.match_first_pattern(self._html, self._REGEX)
        description = ph.extract_tag_text(matched)
        return description

Example #11

Show file

File: tagparser.py Project: bmiller1009/linkedin-miner

 def _extract_metric(self, pattern_value):
     """Return the tag text of the first <span> whose class is ``pattern_value``.

     ``pattern_value`` is interpolated verbatim into the span pattern that
     is matched against ``self._html`` via ``ph.match_first_pattern``; the
     result of ``ph.extract_tag_text`` on that match is returned unchanged.
     """
     span_template = "<span class='{0}'>.*</span>"
     span_regex = span_template.format(pattern_value)
     matched = ph.match_first_pattern(self._html, span_regex)
     return ph.extract_tag_text(matched)

Example #12

Show file

File: tagparser.py Project: bmiller1009/linkedin-miner

 def _job_title(self):
     """Return the tag text of the first ``<dd class='title'>`` in ``self._html``.

     The first match of the pattern is obtained through
     ``ph.match_first_pattern`` and handed to ``ph.extract_tag_text``,
     whose result is returned unchanged.
     """
     title_regex = "<dd class='title'>.*</dd>"
     matched = ph.match_first_pattern(self._html, title_regex)
     return ph.extract_tag_text(matched)