Example #1
 def doWeb(self, doc, url):
     if isinstance(doc, basestring):  # then it's not BeautifulSoup yet
         tree = lxml.etree.fromstring(doc, lxml.etree.HTMLParser())
         links = tree.xpath(
             "/html/body/div[1]/div/div[@id='sdBody']/div/div[@id='rightCol']/div/div[@id='searchResults']/div[@id='bodyMainResults']"
         )
         document = BSXPathEvaluator(doc)
     else:
         document = doc
     result_sets = []  # so the return at the bottom always has a value
     if document.evaluate("//*[contains(@src, \"exportarticle_a.gif\")]",
                          document, None, XPathResult.ANY_TYPE, None):
         articles = []
         if self.detectWeb(doc, url) == "multiple":
             # search page
             items = {}
             if url.count("_ob=PublicationURL") > 0:
                 xpath = '//table[@class="resultRow"]/tbody/tr/td[2]/a'
             else:
                 xpath = '//div[@class="font3"][@id="bodyMainResults"]/table/tbody/tr/td[2]/a'
             rows = document.evaluate(xpath, document, None,
                                      XPathResult.ANY_TYPE, None)
             while True:
                 try:
                     next_row = rows.iterateNext()
                 except IndexError:  # BSXPath raises IndexError when the iterator is exhausted
                     break
                 title = "some title here"  # next_row.text
                 link = "some href here"  # next_row.href
                 if not re.match(r"PDF \(", title) and not re.match(
                         r"Related Articles", title):
                     items[link] = title
             # items = zotero.SelectItems(items)
             # let's assume we want all of them
             articles.extend(items)
             result_sets = [{'article': article} for article in articles]
         else:
             articles = [url]
             result_sets = [{"currentdoc": doc}]
         if len(articles) == 0:
             print "ERROR: no items were found"
             return
         print "articles = ", articles
         print "result_sets = ", result_sets
     return result_sets  # all articles, or the current doc, wrapped in dicts for the data we want to grab
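The fragile part of Example #1 is the iterateNext loop, so here it is in isolation: a minimal sketch against invented HTML, using only the evaluate/iterateNext calls that appear above. BSXPath raises IndexError when the iterator runs out, which is what the example's except clause relies on.

from BSXPath import BSXPathEvaluator, XPathResult

html = '<html><body><a href="/a">First</a><a href="/b">Second</a></body></html>'
document = BSXPathEvaluator(html)
result = document.evaluate('//a', document, None, XPathResult.ANY_TYPE, None)

while True:
    try:
        node = result.iterateNext()  # raises IndexError when no nodes remain
    except IndexError:
        break
    print node.get('href'), node.string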
Example #2
def getinfo(url, html):
  document = BSXPathEvaluator(html)
  setting = {}
  setting['next_xpath'] = u"//a[contains(text(),'下章') or contains(text(),'下一章') or contains(text(),'下一页') or contains(text(),'下页')]"
  setting['title_xpath'] = "//title"
  next_link = document.getFirstItem(setting['next_xpath'])['href']  # grab the next-page link
  next_url = urlparse.urljoin(url, next_link)  # resolve it to an absolute URL
  title = document.getFirstItem(setting['title_xpath']).string
  #site=root=urlparse.urlparse(url).netloc
  return title, next_url
Example #3
 def get(self):
     from google.appengine.api import urlfetch
     from BeautifulSoup import BeautifulSoup
     from BSXPath import BSXPathEvaluator, XPathResult
     result = urlfetch.fetch(url="http://www.u17.com/comic_list/le!_th99_gr99_ca99_ss99_ob0_m0_p1.html",
                             headers={'dd': 'dd'})
     if result.status_code == 200:
         doc = BSXPathEvaluator(result.content)  # deeper nodes look like .../OL[20]/DIV[1]/A[1]/IMG[1]
         r = doc.getFirstItem('/html[1]/BODY[1]/DIV[8]/DIV[3]/DIV[2]/DIV[12]')
         self.response.out.write(r)
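The absolute, index-based path in Example #3 breaks as soon as u17.com moves a wrapper div. A minimal sketch of the same getFirstItem call anchored on an attribute instead; the id and markup below are invented for illustration, not u17.com's real structure.

from BSXPath import BSXPathEvaluator

html = '<html><body><div id="comic_list"><a href="/comic/1">One</a></div></body></html>'
doc = BSXPathEvaluator(html)
# anchoring on @id survives layout changes that break paths like /html[1]/BODY[1]/DIV[8]/...
print doc.getFirstItem('//div[@id="comic_list"]/a')['href']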
Example #4
def getinfo(url, html):
  document = BSXPathEvaluator(html)
  setting = {}
  setting['next_xpath'] = u"//a[contains(text(),'下章') or contains(text(),'下一章') or contains(text(),'下一页') or contains(text(),'下页') or contains(text(),'下一节')]"
  setting['title_xpath'] = "//title"
  title = '' + document.getFirstItem(setting['title_xpath']).string
  next_link = document.getItemList(setting['next_xpath'])
  if len(next_link) == 0:
    return title, None
  next_url = urlparse.urljoin(url, next_link[0]['href'])  # resolve to an absolute URL
  #site=root=urlparse.urlparse(url).netloc
  return title, next_url
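Because this variant returns None for next_url when no next-chapter link is found, a crawl loop can use that as its stop condition. A hypothetical driver; the start URL is a placeholder.

import urllib2

url = 'http://example.com/book/1.html'  # placeholder start page
while url:
    html = urllib2.urlopen(url).read()
    title, url = getinfo(url, html)  # url becomes None after the last chapter
    print title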
Example #5
 def get_error_apache(self, document):
     ### init
     apache_error = ''
     ### xpath terms
     xpath_apache_error = './/title/text()'
     ### get the title and description
     BSXdocument = BSXPathEvaluator(document)
     err = BSXdocument.getItemList(xpath_apache_error)
     if len(err) > 0:
         apache_error = '%s' % err[0]
     ### cleanup
     del err
     del BSXdocument
     ### return error title and description
     return apache_error
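A quick sanity check of get_error_apache() against a canned page; `checker` stands in for an instance of whatever class the method lives on, and the HTML is a minimal stand-in, not a real server response.

page = ('<html><head><title>404 Not Found</title></head>'
        '<body><h1>Not Found</h1></body></html>')
print checker.get_error_apache(page)  # hypothetical instance; prints "404 Not Found"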
Example #7
 def get_error_from_django(self, document):
     ### init
     error_title = error_description = ''
     ### xpath terms
     xpath_title = './/div[@id="summary"]/h1/text()'
     xpath_description = './/div[@id="summary"]/pre/text()'
     ### get the title and description
     BSXdocument = BSXPathEvaluator(document)
     title = BSXdocument.getItemList(xpath_title)
     if len(title) > 0:
         error_title = '%s' % title[0]
     description = BSXdocument.getItemList(xpath_description)
     if len(description) > 0:
         error_description = '%s' % description[0]
     ### cleanup
     del title
     del description
     del BSXdocument
     ### return error title and description
     return (error_title, error_description)
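The two XPaths target the summary block of Django's debug page; a minimal stand-in document is enough to exercise them. `handler` is a hypothetical instance and the markup is invented.

page = ('<html><body><div id="summary">'
        '<h1>TypeError at /orders/</h1>'
        '<pre>unsupported operand type(s)</pre>'
        '</div></body></html>')
title, description = handler.get_error_from_django(page)
print title        # TypeError at /orders/
print description  # unsupported operand type(s)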
Example #8
 def detectWeb(self, doc, url):
     if type(doc) == type(""):
         doc = BSXPathEvaluator(doc)
     if url.count(
             "_ob=DownloadURL") != 0 or doc.title == "ScienceDirect Login":
         return False
     if ((not re.match("pdf", url)) and url.count("_ob=ArticleURL") == 0 and
             url.count("/article/") == 0) or url.count("/journal/") != 0:
         return "multiple"
     elif not re.match("pdf", url):
         return "journalArticle"
     return False
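Illustrative calls, assuming `translator` is an instance of the class that defines detectWeb; the instance and both URLs are made-up examples, not real fixtures.

print translator.detectWeb('<html></html>', 'http://www.sciencedirect.com/science/journal/00000000')
# -> "multiple" (a journal table-of-contents URL)
print translator.detectWeb('<html></html>', 'http://www.sciencedirect.com/science/article/pii/X0')
# -> "journalArticle"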
Example #9
def parse_catalog(catalog_url, parser):

    fetch_result = urlfetch.fetch(catalog_url, allow_truncated=True)
    html = fetch_result.content.decode(parser.site_coding, "ignore")
    document = BSXPathEvaluator(html, convertEntities=BeautifulSoup.HTML_ENTITIES)  # convert HTML entities to characters

    parse_result = {}

    vol_list = document.getItemList(parser.vol_and_chapter_xpath)

    chapter_url_list = []
    chapter_title_list = []

    if parser.url_remove_prefix_re:  # compile once up front; reused on every chapter link below
        url_remove_prefix_re = re.compile(parser.url_remove_prefix_re)

    for i in vol_list:
        if i.name != "a":
            # check whether we've reached the VIP-volume marker
            if not parser.vol_vip_string or unicode(i).find(parser.vol_vip_string) == -1:
                chapter_url_list.append("")  # the list stored in the database cannot hold None
                chapter_title_list.append(get_all_contents(i))
            else:
                chapter_url_list.append("")  # the list stored in the database cannot hold None
                chapter_title_list.append(parser.vol_vip_string)
                break
        else:
            url = i["href"]
            if parser.url_remove_prefix_re:
                url = url_remove_prefix_re.sub("", url)
            chapter_url_list.append(url)
            chapter_title_list.append(get_all_contents(i))

    put_into_dict(parse_result, "chapter_url_list", chapter_url_list)
    put_into_dict(parse_result, "chapter_title_list", chapter_title_list)

    return parse_result
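parse_catalog() only reads attributes off its `parser` argument. The attribute names below come from the code above; every value is an invented placeholder, and actually running this still needs App Engine's urlfetch plus the get_all_contents/put_into_dict helpers the example assumes.

class SiteParser(object):
    site_coding = 'gbk'  # placeholder encoding
    vol_and_chapter_xpath = '//div[@class="list"]//a | //div[@class="list"]//b'
    vol_vip_string = u'VIP'  # text marking the first VIP volume
    url_remove_prefix_re = r'^http://www\.example\.com'

result = parse_catalog('http://www.example.com/book/123/', SiteParser())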
Example #10
def parse_document(document):
    BSXdocument = BSXPathEvaluator(document)
    
    XPath_table = './/*[@id="main"]/p[2]/table'
    XPath_table_body = '%s/tbody' % (XPath_table)
    XPath_table_header = '%s/tr[1]' % (XPath_table_body)
    XPath_table_lines = '%s/tr' % (XPath_table_body)
    rows = BSXdocument.getItemList(XPath_table_lines)[1:]
    
    for row_counter in xrange(len(rows)):
        row = rows[row_counter]
        # print row
        # print "======"
        rowDoc = BSXPathEvaluator('%s'%row)

        XPath_table_row = '/'

        XPath_table_row_cell_category = '%s/td[%d]/text()' % (XPath_table_row, 1)
        cell_category = rowDoc.getFirstItem(XPath_table_row_cell_category)
        
        XPath_table_row_cell_type = '%s/td[%d]/text()' % (XPath_table_row, 2)
        cell_type = rowDoc.getFirstItem(XPath_table_row_cell_type)

        XPath_table_row_cell_time = '%s/td[%d]/text()' % (XPath_table_row, 3)
        cell_time = rowDoc.getFirstItem(XPath_table_row_cell_time)

        XPath_table_row_cell_level = '%s/td[%d]/text()' % (XPath_table_row, 4)
        cell_level = rowDoc.getFirstItem(XPath_table_row_cell_level)

        XPath_table_row_cell_message = '%s/td[%d]/text()' % (XPath_table_row, 5)
        cell_message = rowDoc.getFirstItem(XPath_table_row_cell_message)
        
        print "======", row_counter, "======"
        print "Category:",cell_category
        print "Type:",cell_type
        print "Time:",cell_time
        print "Level:",cell_level
        print "Message:",cell_message
        
    return rows
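The line worth noting above is BSXPathEvaluator('%s' % row): re-serializing a row and wrapping it in a fresh evaluator makes paths like //td[1] relative to that row alone. The same trick in isolation, against invented markup:

from BSXPath import BSXPathEvaluator

html = ('<table><tbody>'
        '<tr><td>Category</td><td>Type</td></tr>'
        '<tr><td>app</td><td>info</td></tr>'
        '</tbody></table>')
doc = BSXPathEvaluator(html)
for row in doc.getItemList('//tbody/tr')[1:]:     # skip the header row
    row_doc = BSXPathEvaluator('%s' % row)        # row-scoped evaluator
    print row_doc.getFirstItem('//td[1]/text()')  # -> app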
Example #11
def parse_document(document):
    BSXdocument = BSXPathEvaluator(document)
    
    XPath_table = './/*[@id="main"]/p[2]/table'
    XPath_table_body = '%s/tbody' % (XPath_table)
    XPath_table_header = '%s/tr[1]' % (XPath_table_body)
    XPath_table_lines = '%s/tr' % (XPath_table_body)
    rows = BSXdocument.getItemList(XPath_table_lines)[1:]
    
    records = []
    
    for row_counter in xrange(len(rows)):
        record = ()
        SHIFT=0
        
        
        row = rows[row_counter]
        XPath_table_row = '%s/tr[%d]' % (XPath_table_body, row_counter+1)
        
        XPath_table_row_cell_category = '%s/td[%d]/text()' % (XPath_table_row, 1)
        cell_category = BSXdocument.getItemList(XPath_table_row_cell_category)
        if len(cell_category)>0:
            cell_category = cell_category[0]
        
        XPath_table_row_cell_type = '%s/td[%d]/text()' % (XPath_table_row, 2)
        cell_type = BSXdocument.getItemList(XPath_table_row_cell_type)
        if len(cell_type)>0:
            cell_type = cell_type[0]
        
        XPath_table_row_cell_time = '%s/td[%d]/text()' % (XPath_table_row, 3)
        cell_time = BSXdocument.getItemList(XPath_table_row_cell_time)
        if len(cell_time)>0:
            cell_time = cell_time[0]
        
        XPath_table_row_cell_level = '%s/td[%d]/text()' % (XPath_table_row, 4)
        cell_level = BSXdocument.getItemList(XPath_table_row_cell_level)
        if len(cell_level)>0:
            cell_level = cell_level[0]
        
        XPath_table_row_cell_message = '%s/td[%d]/text()' % (XPath_table_row, 5)
        cell_message = BSXdocument.getItemList(XPath_table_row_cell_message)
        if len(cell_message)>0:
            cell_message = cell_message[0]
        
        
        message_category=""
        message_date=""
        message_time=""
        message_dataset=""
        message_site="no.site"
        message_reason="no.reason"
        message_weight="no.weight"
        message_weight_val=0
        message_weight_0=0
        message_weight_1=0
        message_weight_2=0
        message_weight_3=0
        message_weight_4=0
        message_weight_5=0
        message_treshold="no.treshold"
        message_treshold_current=0
        message_treshold_expected=0
        
        ###print;print;print
        ###print u'DEBUG: ln113: cell_message=', cell_message
        
        ## SKIPPED
        if is_this_category(cell_message, ' - action=SKIPPED '):
            tmp_message=re.sub(r'\s+', ';', str( cell_message.replace('&nbsp;', ' ') ) ).split(';')
            #print  u'test', 'row:', row_counter, u'|tmp_message|=', len(tmp_message), tmp_message
            ###print u'DEBUG ln123: tmp_message=', tmp_message
            message_category="SKIPPED"
            message_date=tmp_message[0]
            message_time=tmp_message[1]
            message_dataset=tmp_message[5].split('=')[1]
            message_reason=tmp_message[4].split('=')[1]
            ### SKIPPED_REASONS=['TOO_MANY_T2_REPLICAS', 'TOO_MANY_T2_SUBSCRIPTIONS']
            if message_reason=="TOO_MANY_T2_REPLICAS":
                try:
                    #message_treshold_current=re.sub(r"[)(>]", '', re.sub("&gt", '', str(tmp_message[13])  ) ).split('=')[0]
                    #message_treshold_expected=re.sub(r"[)]", '', str(tmp_message[14]) ).split('=')[1]
                    message_treshold_current=re.sub(r"[)(>]", '', str(tmp_message[13]) ).split('=')[0]
                    message_treshold_expected=re.sub(r"[)(>]", '', str(tmp_message[13]) ).split('=')[1]
                except: 
                    message_treshold_current=-1
                    message_treshold_expected=-1
                    #print  u'test', 'row:', row_counter, u'|tmp_message|=', len(tmp_message), tmp_message
            elif message_reason=="TOO_MANY_T2_SUBSCRIPTIONS": 
                try:
                    #message_treshold_current=re.sub(r"[)(>]", '', re.sub("&gt", '', str(tmp_message[11])  ) ).split('=')[0]
                    #message_treshold_expected=re.sub(r"[)]", '', str(tmp_message[12]) ).split('=')[1]
                    message_treshold_current=re.sub(r"[)(>]", '', str(tmp_message[12]) ).split('=')[0]
                    message_treshold_expected=re.sub(r"[)]", '', str(tmp_message[12]) ).split('=')[1]
                except: 
                    message_treshold_current=-1
                    message_treshold_expected=-1
                    #print  u'test', 'row:', row_counter, u'|tmp_message|=', len(tmp_message), tmp_message
            #message_treshold=tmp_message[13]
            #print u'row:', row_counter, message_treshold, re.sub(r"[)(]", '', str(message_treshold) )
            #message_treshold_current=re.sub(r"[)(>]", '', re.sub("&gt", '', str(tmp_message[13])  ) ).split('=')[0]
            #message_treshold_expected=re.sub(r"[)]", '', str(tmp_message[13]) ).split('=')[1]
            #message_weight=0
            #message_weight_val=0.0
            #message_weight_0=0
            #message_weight_1=0
            #message_weight_2=0
            #message_weight_3=0
            #message_weight_4=0
            #message_weight_5=0
            #print u'test', message_date, message_time, message_dataset, message_reason, message_treshold_current, message_treshold_expected
            #print u'test::', tmp_message
        ## triggered
        if is_this_category(cell_message, ' - triggered '):
            tmp_message=re.sub(r'\s+', ';', str( cell_message.replace('&nbsp;', ' ') ) ).split(';')
            ###print u'DEBUG ln166: tmp_message=', tmp_message
            message_category="triggered"
            message_date=tmp_message[0]
            message_time=tmp_message[1]
            message_dataset=tmp_message[6]
            #message_weight=0
            #message_weight_val=0.0
            #message_weight_0=0
            #message_weight_1=0
            #message_weight_2=0
            #message_weight_3=0
            #message_weight_4=0
            #message_weight_5=0
            #message_treshold=""
            #message_treshold_current=0
            #message_treshold_expected=0
            ###print u'test', message_date, message_time, message_dataset
        ## UNSELECTEDT2
        if is_this_category(cell_message, ' - action=UNSELECTEDT2 '):
            tmp_message=re.sub(r'\s+', ';', str( cell_message.replace('&nbsp;', ' ') ) ).split(';')
            ###print u'DEBUG ln184: tmp_message=', tmp_message
            message_category="UNSELECTED"
            message_date=tmp_message[0]
            message_time=tmp_message[1]
            try:
                message_dataset=tmp_message[6].split('=')[1]
            except IndexError:
                dataset_field = ""
                for tmp_item in tmp_message: 
                    if re.search('^dataset=', tmp_item):
                        message_dataset=tmp_item.split('=')[1]
                        break
            message_site=tmp_message[4].split('=')[1]
            message_weight=tmp_message[5].split('=')[1]
            if message_weight == WEIGHT_NA_STRING:
                message_weight=message_weight_0=message_weight_1=message_weight_2=message_weight_3=message_weight_4=message_weight_5=WEIGHT_NA_VALUE
                message_weight_val=WEIGHT_NA_VALUE
            else:
                #message_weight_val=eval(float(message_weight)*1.0)
                message_weight_params=re.sub(r"[()]", '', re.sub(r"[+/*]", ';', str(message_weight) ) ).split(';')
                message_weight_0=message_weight_params[0]
                message_weight_1=message_weight_params[1]
                message_weight_2=message_weight_params[2]
                message_weight_3=message_weight_params[3]
                message_weight_4=message_weight_params[4]
                message_weight_5=message_weight_params[5]
                try:
                    message_weight_val=(float(message_weight_0)+float(message_weight_1)/float(message_weight_2))*float(message_weight_3)/float(message_weight_4)/float(message_weight_5)
                except:
                    message_weight_val=-1
                    #print  u'test', 'row:', row_counter, u'|tmp_message|=', len(tmp_message), tmp_message
            #message_treshold=""
            #message_treshold_current=0
            #message_treshold_expected=0
            ###print u'test', message_date, message_time, message_dataset, message_site, message_weight, message_weight_0, message_weight_1, message_weight_2, message_weight_3, message_weight_4, message_weight_5
        ## SELECTEDT1
        if is_this_category(cell_message, ' - action=SELECTEDT1 '):
            tmp_message=re.sub(r'\s+', ';', str( cell_message.replace('&nbsp;', ' ') ) ).split(';')
            ###print u'DEBUG ln213: tmp_message=', tmp_message
            message_category="SELECTEDT1"
            message_date=tmp_message[0]
            message_time=tmp_message[1]
            #message_dataset=tmp_message[6].split('=')[1]
            try:
                message_dataset=tmp_message[6].split('=')[1]
            except IndexError:
                dataset_field = ""
                for tmp_item in tmp_message: 
                    if re.search('^dataset=', tmp_item):
                        message_dataset=tmp_item.split('=')[1]
                        break
            message_site=tmp_message[4].split('=')[1]
            #message_weight=tmp_message[5].split('=')[1]
            #message_weight_params=re.sub(r"[()]", '', re.sub(r"[+/*]", ';', str(message_weight) ) ).split(';')
            #print u'DEBUG ln246: message_weight_params=', message_weight_params
            #message_weight_0=message_weight_params[0]
            #message_weight_1=message_weight_params[1]
            #message_weight_2=message_weight_params[2]
            #message_weight_3=message_weight_params[3]
            #message_weight_4=message_weight_params[4]
            #message_weight_5=message_weight_params[5]
            #try:
            #    message_weight_val=(float(message_weight_0)+float(message_weight_1)/float(message_weight_2))*float(message_weight_3)/float(message_weight_4)/float(message_weight_5)
            #except:
            #    message_weight_val=-1
            message_weight=message_weight_0=message_weight_1=message_weight_2=message_weight_3=message_weight_4=message_weight_5=message_weight_val=WEIGHT_T1_VALUE
        ## SELECTEDT2
        if is_this_category(cell_message, ' - action=SELECTEDT2 '):
            tmp_message=re.sub(r'\s+', ';', str( cell_message.replace('&nbsp;', ' ') ) ).split(';')
            ###print u'DEBUG ln213: tmp_message=', tmp_message
            message_category="SELECTEDT2"
            message_date=tmp_message[0]
            message_time=tmp_message[1]
            #message_dataset=tmp_message[6].split('=')[1]
            try:
                message_dataset=tmp_message[6].split('=')[1]
            except IndexError:
                dataset_field = ""
                for tmp_item in tmp_message: 
                    if re.search('^dataset=', tmp_item):
                        message_dataset=tmp_item.split('=')[1]
                        break
            message_site=tmp_message[4].split('=')[1]
            message_weight=tmp_message[5].split('=')[1]
            message_weight_params=re.sub(r"[()]", '', re.sub(r"[+/*]", ';', str(message_weight) ) ).split(';')
            message_weight_0=message_weight_params[0]
            message_weight_1=message_weight_params[1]
            message_weight_2=message_weight_params[2]
            message_weight_3=message_weight_params[3]
            message_weight_4=message_weight_params[4]
            message_weight_5=message_weight_params[5]
            try:
                message_weight_val=(float(message_weight_0)+float(message_weight_1)/float(message_weight_2))*float(message_weight_3)/float(message_weight_4)/float(message_weight_5)
            except:
                message_weight_val=-1
        ## SELECTEDT2_T1MOU
        if is_this_category(cell_message, ' - action=SELECTEDT2_T1MOU '):
            tmp_message=re.sub(r'\s+', ';', str( cell_message.replace('&nbsp;', ' ') ) ).split(';')
            ###print u'DEBUG ln213: tmp_message=', tmp_message
            message_category="SELECTEDT2_T1MOU"
            message_date=tmp_message[0]
            message_time=tmp_message[1]
            #message_dataset=tmp_message[6].split('=')[1]
            try:
                message_dataset=tmp_message[6].split('=')[1]
            except IndexError:
                dataset_field = ""
                for tmp_item in tmp_message: 
                    if re.search('^dataset=', tmp_item):
                        message_dataset=tmp_item.split('=')[1]
                        break
            message_site=tmp_message[4].split('=')[1]
            #message_weight=tmp_message[5].split('=')[1]
            #message_weight_params=re.sub(r"[()]", '', re.sub(r"[+/*]", ';', str(message_weight) ) ).split(';')
            #message_weight_0=message_weight_params[0]
            #message_weight_1=message_weight_params[1]
            #message_weight_2=message_weight_params[2]
            #message_weight_3=message_weight_params[3]
            #message_weight_4=message_weight_params[4]
            #message_weight_5=message_weight_params[5]
            #try:
            #    message_weight_val=(float(message_weight_0)+float(message_weight_1)/float(message_weight_2))*float(message_weight_3)/float(message_weight_4)/float(message_weight_5)
            #except:
            #    message_weight_val=-1
            ###message_weight_params=re.sub(r"[()]", '', re.sub(r"[+/*]", ';', str(message_weight) ) ).split(';')
            message_weight=message_weight_0=message_weight_1=message_weight_2=message_weight_3=message_weight_4=message_weight_5=message_weight_val=WEIGHT_T2_T1MOU_VALUE
        ## SELECTEDT2_T2MOU
        if is_this_category(cell_message, ' - action=SELECTEDT2_T2MOU '):
            tmp_message=re.sub(r'\s+', ';', str( cell_message.replace('&nbsp;', ' ') ) ).split(';')
            ###print u'DEBUG ln213: tmp_message=', tmp_message
            message_category="SELECTEDT2_T2MOU"
            message_date=tmp_message[0]
            message_time=tmp_message[1]
            #message_dataset=tmp_message[6].split('=')[1]
            try:
                message_dataset=tmp_message[6].split('=')[1]
            except IndexError:
                dataset_field = ""
                for tmp_item in tmp_message: 
                    if re.search('^dataset=', tmp_item):
                        message_dataset=tmp_item.split('=')[1]
                        break
            message_site=tmp_message[4].split('=')[1]
            #message_weight=tmp_message[5].split('=')[1]
            #message_weight_params=re.sub(r"[()]", '', re.sub(r"[+/*]", ';', str(message_weight) ) ).split(';')
            #message_weight_0=message_weight_params[0]
            #message_weight_1=message_weight_params[1]
            #message_weight_2=message_weight_params[2]
            #message_weight_3=message_weight_params[3]
            #message_weight_4=message_weight_params[4]
            #message_weight_5=message_weight_params[5]
            #try:
            #    message_weight_val=(float(message_weight_0)+float(message_weight_1)/float(message_weight_2))*float(message_weight_3)/float(message_weight_4)/float(message_weight_5)
            #except:
            #    message_weight_val=-1
            message_weight=message_weight_0=message_weight_1=message_weight_2=message_weight_3=message_weight_4=message_weight_5=message_weight_val=WEIGHT_T2_T2MOU_VALUE
        #print u'DEBUG::      message:', cell_message
        #print u'============='
        
        record = (message_date, message_time, message_category, message_dataset, message_site, \
                  message_reason, message_weight, \
                  message_weight_val, message_weight_0, message_weight_1, \
                  message_weight_2, message_weight_3, \
                  message_weight_4, message_weight_5, \
                  message_treshold_current, message_treshold_expected
                  )
        records.append(record)
        
    return records
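Both parse_document() variants lean on an is_this_category() helper that is not shown. Judging from how it is called, a plain substring test would fit; this is an assumption, not the original helper.

def is_this_category(cell_message, marker):
    # assumed behavior: does the log line contain this marker?
    return unicode(cell_message).find(marker) != -1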
Example #12
def parse_document(document):
    BSXdocument = BSXPathEvaluator(document)
    
    XPath_table = './/*[@id="main"]/p[2]/table'
    XPath_table_body = '%s/tbody' % (XPath_table)
    XPath_table_header = '%s/tr[1]' % (XPath_table_body)
    XPath_table_lines = '%s/tr' % (XPath_table_body)
    rows = BSXdocument.getItemList(XPath_table_lines)[1:]
    # get cloud name
    with open('panda_queues.json', 'r') as fjson:
        dic = json.loads(fjson.read())
    
    records = []
    ex_record = []
    exist_records = []
    in_buf_records = []
    maxId = db.get_max_id()
    last_time = db.get_last_updated_time()
    if last_time is None:
        db.first_last_updated_time()
        last_time = db.get_last_updated_time()
    this_time = None
    skip_time = None
    set_last = None
    this_year = datetime.date.today().year
    if maxId is None:
        maxId = 0
    processed_rows = 0
    
    for row_counter in xrange(len(rows)):
        record = ()
        ex_rec = ()
        SHIFT=0
        
        row = rows[row_counter]

        rowDoc = BSXPathEvaluator('%s'%row)

        #XPath_table_row = '%s/tr[%d]' % (XPath_table_body, row_counter+1)
        XPath_table_row = '/'
        """
        XPath_table_row_cell_category = '%s/td[%d]/text()' % (XPath_table_row, 1)
        cell_category = BSXdocument.getItemList(XPath_table_row_cell_category)
        if len(cell_category)>0:
            cell_category = cell_category[0]
        
        XPath_table_row_cell_type = '%s/td[%d]/text()' % (XPath_table_row, 2)
        cell_type = BSXdocument.getItemList(XPath_table_row_cell_type)
        if len(cell_type)>0:
            cell_type = cell_type[0]
        """
        XPath_table_row_cell_time = '%s/td[%d]/text()' % (XPath_table_row, 3)
        cell_time = rowDoc.getFirstItem(XPath_table_row_cell_time)
        #if len(cell_time)>0:
            #cell_time = cell_time[0]
        """
        XPath_table_row_cell_level = '%s/td[%d]/text()' % (XPath_table_row, 4)
        cell_level = BSXdocument.getItemList(XPath_table_row_cell_level)
        if len(cell_level)>0:
            cell_level = cell_level[0]
        """
        XPath_table_row_cell_message = '%s/td[%d]/text()' % (XPath_table_row, 5)
        cell_message = rowDoc.getFirstItem(XPath_table_row_cell_message)
        #if len(cell_message)>0:
            #cell_message = cell_message[0]
        
        
        message_category="no.category"
        message_date = ""
        message_time = ""
        message_dn = ""
        message_jobset ="no.jobset"
        message_jobdef = "no.jobdef"
        message_action = ""
        message_site="no.site"
        message_reason="no.reason"
        message_weight="no.weight"
        
        message_datetime = str(cell_time).split(' ')
        message_date = message_datetime[0].strip()
        message_time = message_datetime[1].strip()
        
        # Skip the leading uncompleted minute
        log_year = get_log_year(this_year, message_date, message_time)
        this_time = "%s-%s %s"%(log_year, message_date, message_time)
        
        if skip_time is None or skip_time == this_time:
            skip_time = this_time
            continue
        # set the last updated time when skip done.( Records in time DESC )
        if set_last is None:
            # save it when every thing done
            set_last = this_time
        
        # Break when reach the last_time
        if (last_time is not None) and (this_time <= last_time):
            break
               
        # print 'Debug:',message_date,message_time,row_counter,cell_message
        processed_rows += 1
        
        tmp_message = str(cell_message.replace('&nbsp;', ' ')).split(' : ')
        message_dn = tmp_message[0].split('=')[1].replace("\\\'","").strip().replace(' ','_')
        tmp_job = tmp_message[1].split(' ')
        if len(tmp_job) > 1:
            message_jobset = tmp_job[0].split('=')[1].strip() 
            message_jobdef = tmp_job[1].split('=')[1].strip()
        else:
            if is_this_category(tmp_job[0],'jobset'):
                message_jobset = tmp_job[0].split('=')[1].strip()
            if is_this_category(tmp_job[0],'jobdef'):
                message_jobdef = tmp_job[0].split('=')[1].strip()
        ###print;print;print
        #print u'DEBUG: date time=', message_date, message_time
        #print u'DEBUG: dn=', message_dn
        #print u'DEBUG: jobset=', message_jobset
        #print u'DEBUG: jobdef=', message_jobdef
        #print u'DEBUG: ln113: tmp_message[1]=', tmp_message[1]
        #print u'DEBUG: ln113: tmp_message[2]=', tmp_message[2]
        
        ## skip
        if is_this_category(cell_message, ' action=skip '):
            # continue # try to speed up
            message_category = "D"
            message_skip = tmp_message[2].split(' ')
            message_action = message_skip[0].split('=')[1].strip()
            message_site = message_skip[1].split('=')[1].strip()
            message_reason = message_skip[2].split('=')[1].strip()
            if re.search('=',message_skip[4]):
                message_weight = message_skip[4].split('=')[1].strip()
            else:
                message_reason = '_'.join(message_skip[3:]).strip('_')
        
        # exclude : add at 2011-10-26
        elif is_this_category(cell_message, ' action=exclude '):
            message_category = "E"
            message_skip = tmp_message[2].split(' ')
            message_action = message_skip[0].split('=')[1].strip()
            message_site = message_skip[1].split('=')[1].strip()
            message_reason = message_skip[2].split('=')[1].strip()
            if re.search('=',message_skip[4]):
                message_weight = message_skip[4].split('=')[1].strip()
            else:
                message_reason = '_'.join(message_skip[3:]).strip('_')
            site_name,cloud = get_sitecloud_name(dic,message_site)
            if is_excluded(ex_record,message_dn,message_jobset,site_name):
                message_category = "D" # skip if excluded by other jobdef of same jobset
            else:
                ex_rec = (message_dn, message_jobset, site_name)
                ex_record.insert(0, ex_rec)
        
        ## choose
        elif is_this_category(cell_message, ' action=choose '):
            message_category = "C"
            message_choose = tmp_message[2].split(' ')
            message_action = message_choose[0].split('=')[1].strip()
            message_site = message_choose[1].split('=')[1].strip()
            message_reason = message_choose[2].split('=')[1].strip()
            if re.search('=',message_choose[5]):
                message_weight = message_choose[5].split('=')[1].strip()
            else:
                message_reason = '_'.join(message_choose[3:]).strip('_')
        
        ## action=use: add at 2011-10-26
        elif is_this_category(cell_message, ' action=use '):
            #message_category = "C"
            message_choose = tmp_message[2].split(' ')
            message_action = message_choose[0].split('=')[1].strip()
            message_site = message_choose[1].split('=')[1].strip()
            # message_reason = message_choose[2].split('=')[1].strip()
            message_reason = '_'.join(message_choose[3:]).strip('_')
            if is_this_category(message_reason, 'site'):
                message_category = "A"
            if is_this_category(message_reason, 'cloud'):
                message_category = "B"
        
        ## use site or cloud
        elif is_this_category(cell_message, ' use '):
            message_use = tmp_message[2].split(' ')
            message_action = message_use[0].strip()
            message_site = message_use[1].strip()
            message_reason = '_'.join(message_use[3:]).strip('_')
            if is_this_category(message_reason, 'site'):
                message_category = "A"
            if is_this_category(message_reason, 'cloud'):
                message_category = "B"
                
        ## other actions
        elif is_this_category(cell_message, ' action='):
            message_buf = tmp_message[2].split(' ')
            message_action = message_buf[0].split('=')[1].strip()
            print "WARNING: action=%s is not processed!"%message_action
        
        ## append to records it belong to
        if message_category in ['A','B','C','E']:
            logDate = str("%s-%s"%(log_year, message_date))
            rec_idx = None
            site_name,cloud = get_sitecloud_name(dic,message_site)
            dailyLogId = db.is_exist_item(logDate, message_category, site_name, message_dn)
            if dailyLogId is None:
                rec_idx = is_in_buf(records, logDate, message_category, site_name, message_dn)
                
            if dailyLogId is not None:
                exist_records.append([dailyLogId])
            elif rec_idx is not None:
                record = (logDate, message_category, site_name, message_dn)
                in_buf_records.append(record)
            else:
                maxId += 1
                count = 1               
                record = (maxId, logDate, message_category, site_name, \
                  cloud, message_dn, count)
                records.append(record)
        
        if DEBUG==1:
            print "========="
            print "DEBUG:",message_category,": ",row
            print "========="

    db.set_last_updated_time(set_last) # set when all done.
    if (this_time is not None) and not (this_time <= last_time):
        print "Error: === did NOT reach the last updated time (%s -> %s) ===" % (this_time, last_time)
        
    return (processed_rows,records, exist_records, in_buf_records)
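This variant also assumes a get_log_year() helper. The timestamps evidently carry only month and day, so a plausible sketch infers the year and handles the December/January rollover; again an assumption, not the original code.

import datetime

def get_log_year(this_year, message_date, message_time):
    # assumed: message_date looks like 'MM-DD'; message_time is unused here.
    # December entries read in early January belong to the previous year.
    month = int(message_date.split('-')[0])
    if month == 12 and datetime.date.today().month == 1:
        return this_year - 1
    return this_year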
Example #13
def test():
  global document,options,DEFAULT_TESTDIR,url_data
  
  def nodesStr(nodes):
    def tagstr(node):
      try:
        strs=['<'+node.name]
        i=node.get('id')
        c=node.get('class')
        if i:
          strs.append('id='+i)
        if c:
          strs.append('class='+c)
        return escapeStr(' '.join(strs)+'>')
      except:
        return escapeStr(unicode(node))
    
    if isinstance(nodes,list):
      return ' '.join([tagstr(node) for node in nodes])
    elif getattr(nodes,'nodeType',None) or isinstance(nodes,basestring):
      return escapeStr(unicode(nodes))
    else:
      return nodes
  
  if options.web:
    fp=urllib2.urlopen(url_data)
    dirdoc=BSXPathEvaluator(fp.read())
    files=map(lambda node:node.get('href'),dirdoc.getItemList('//li/a[@href!="../"]'))
  else:
    if options.path:
      testdir=options.path
    else:
      testdir=DEFAULT_TESTDIR
    files=os.listdir(testdir)
  
  tnames=','.join(options.names).split(',') if options.names else None
  tnumbers=','.join(options.numbers).split(',') if options.numbers else None
  for name in files:
    if tnames:
      fname=re.sub(r'\..*$','',name)
      if not fname in tnames: continue
    target=url_data+'/'+name if options.web else os.path.join(testdir,name)
    data=parseTestData(target,options.web)
    print '[%s]\n%s\n' % (name,data.comment)
    document=BSXPathEvaluator(data.html)
    context=document.evaluate(data.contextExpr,document,None,XPathResult.ORDERED_NODE_SNAPSHOT_TYPE,None).snapshotItem(0)
    tests=data.tests
    cnt=0
    for test in tests:
      cnt=cnt+1
      if tnumbers:
        if not str(cnt) in tnumbers: continue
      print u'No.%d' % cnt
      expr=test.expr
      print u'expr  : %s' % (expr)
      
      (nodes,time,resultType)=document.applyXPath(context,expr)
      
      print u'time  : %d.%06d sec' % (time.seconds,time.microseconds)
      print u'result: %s' % nodesStr(nodes)
      print u'expect: %s' % (test.data)
      
      judge=testNodes(nodes,test.data)
      
      print u'judge : %s (%s)' % (judge.status,judge.detail)
      print u''
    
    print u''
Example #14
def collectFriendsEmails():
    """collectFriendsEmails()
        uses official facebook api to get list of friends
        uses list of friends to manually access info page of each
        saves each contact information in csv
    """
    global usr, debug, browser
    startTime = time.time() #save current time for calculation of elapsed time

    logger.info("%s launching CONTACT-DATA COLLECTION" % stages[2])


    try:#get access token
        res = browser.open('http://developers.facebook.com/docs/reference/api')
        html = res.read()

        if debug: print "%s fetching access token..." % stages[2]
        if debug:open('referenceAPI','w').write(BeautifulSoup(html).prettify())

        match = re.search('access_token=(.*?)"', html)
        acc = match.group(1)

        if debug: print 'access token: ' + acc

        #get friends
        res = browser.open('https://graph.facebook.com/me/friends?access_token=%s' % acc)
        html = res.read()
        friends = json.loads(html)
    except Exception as e:
        logger.error("%s could not get list of friends. Are you executing multiple instances with these credentials?: %s"%(stages[2],str(e)))
        if debug: print sys.exc_info()
        return

    #create csv writer
    f = open('%s.csv' % usr, 'ab')
    writer = UnicodeWriter(f)

    #writer = csv.writer(open('%s.csv' % usr, 'ab'), delimiter=';', quotechar='"', quoting=csv.QUOTE_MINIMAL)

    #logger.info('%s******************LIST OF CONTACTS******************' %stages[2])

    for acc in friends['data']: #for each dataset in JSON data
        friend_id = acc['id']
        friend_name = acc['name']

        #open profile url
        try:
            res = browser.open('http://m.facebook.com/profile.php?id=%s&v=info&refid=17' % friend_id,timeout=4.0)
            html = res.read()

            document = BSXPathEvaluator(html)

            #output_line=friend_id.encode('utf-8')+' | '+friend_name.encode('utf-8')
            resume=True
            i = 1
            contact_infos = [friend_id,friend_name]

            while resume: #while further contact data available
                #look for line in table of contact details and extra contact detail
                result = document.evaluate('//div[@id="contact"]//table//tr[%d]'%i,document,None,XPathResult.STRING_TYPE,None)
                contact_info = result.stringValue
                i+=1
                if len(contact_info)==0:
                    resume=False
                else:
                    contact_info=contact_info.replace('&#064;','@') #replace html character code
                    contact_info=contact_info.replace('%40', '@') #replace url encoding
                    if 'Website' not in contact_info:
                        contact_infos.append(contact_info) #append contact info to list of infos
                        #output_line+= " | "+contact_info.encode('utf-8')
            #if len(contact_infos)>2: #if contact info apart from id and name was found
            #logger.info(
                #stages[2]+'****************************************************\n'+
                #stages[2]+'** '+output_line+'\n'+
                #stages[2]+'****************************************************'
            #)
            logger.info(contact_infos)

            writer.writerow(contact_infos) #write to csv
        except URLError as e:
            logger.error('%s a URL TIMEOUT occured while fetching data for %s: %s' % (stages[2],friend_name,str(e)))
        except socket.error as e:
            logger.error('%s a SOCKET ERROR occured while fetching data for %s: %s' % (stages[2],friend_name,str(e)))
        except:
            logger.error('%s an error occured while fetching data for %s: %s' % (stages[2],friend_name,sys.exc_info()))

    endTime = time.time() #set end time for calculation of 'time elapsed'
    logger.info('%s fetched data of %d friends in %d seconds' %(stages[2],len(friends['data']),endTime-startTime))
    logger.info('%s saved collection of contact data in %s.csv! \n program will exit when crawling is finished...' % (stages[2], usr))
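collectFriendsEmails() writes unicode rows through a UnicodeWriter that is not defined here. Python 2's csv module cannot encode unicode by itself, and the standard workaround is the UnicodeWriter recipe from the csv documentation; the original most likely uses that recipe or something close to it.

import csv, codecs, cStringIO

class UnicodeWriter:
    """Write rows to a CSV file, encoding unicode as UTF-8 on the way out
    (the recipe from the Python 2 csv docs)."""
    def __init__(self, f, dialect=csv.excel, encoding='utf-8', **kwds):
        self.queue = cStringIO.StringIO()
        self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
        self.stream = f
        self.encoder = codecs.getincrementalencoder(encoding)()

    def writerow(self, row):
        self.writer.writerow([unicode(s).encode('utf-8') for s in row])
        data = self.queue.getvalue().decode('utf-8')
        self.stream.write(self.encoder.encode(data))
        self.queue.truncate(0)

    def writerows(self, rows):
        for row in rows:
            self.writerow(row)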