# -*- coding: utf-8 -*-
# Scraper fragment for the "mosclinic" section of main.ini.
# For every configured page it looks up review links and opens each
# linked review page to read its text and its <meta itemprop> elements.
import HTML
import lxml.html as html
import datetime
import os.path
import smtplib
import main
from os.path import basename
#from pandas import DataFrame
from time import gmtime, strftime

# ReadConfig is project code; here it yields the page addresses to scan
# and a list of e-mail recipients.
addresses, email_list = main.ReadConfig("main.ini", "mosclinic")

# Third "/"-separated piece of the first address
# (presumably the host name when addresses are full URLs -- confirm).
main_domain_stat = addresses[0].split("/")[2]
print(main_domain_stat)

# Current UTC time, formatted as day.month.year hour:minute.
today = strftime("%d.%m.%Y %H:%M", gmtime())

a = []
b = []
table_data = []

for page_link in addresses:
    page = html.parse(page_link)
    listing_root = page.getroot()
    # An earlier variant selected elements with find_class('noline').
    for el in listing_root.find_class('margin15 font_arial12 as_a2'):
        # Attribute values are taken by position; index 2 is presumably
        # the link target -- verify against the page markup.
        link = el.values()[2]
        if "medreview" in link:
            page1 = html.parse('%s' % link)
            review_root = page1.getroot()
            text_nodes = review_root.find_class('margin15 font_arial12')
            content = text_nodes[0].text_content()
            #imgs = page1.getroot().findall(".//img[@style]")
            dates = review_root.findall(".//meta[@itemprop]")
# -*- coding: utf-8 -*-
# Scraper fragment for the "yell" section of main.ini.
# Collects review texts into `a` and review dates into `b`, both as
# UTF-8 encoded byte strings.
import HTML
import lxml.html as html
import datetime
import os.path
import smtplib
import main
from os.path import basename
#from pandas import DataFrame
from time import gmtime, strftime

# ReadConfig is project code; here it yields the page addresses to scan
# and a list of e-mail recipients.
addresses, email_list = main.ReadConfig("main.ini", "yell")

# Third "/"-separated piece of the first address
# (presumably the host name when addresses are full URLs -- confirm).
main_domain_stat = addresses[0].split("/")[2]
print(main_domain_stat)

# Current UTC time, formatted as day.month.year hour:minute.
today = strftime("%d.%m.%Y %H:%M", gmtime())

a = []
b = []
table_data = []

for page_link in addresses:
    print(page_link)
    page = html.parse(page_link)
    root = page.getroot()

    for el in root.find_class('review_text'):
        # lxml sets .text to None when an element has no leading text;
        # fall back to an empty string so one such review does not abort
        # the whole run with a TypeError.
        review_text = el.text or u""
        a.append(unicode(review_text + "<br>" + page_link).encode("utf-8"))

    for el in root.find_class('review_date'):
        # The date is read from the first child element. A date element
        # without children (or without text) yields an empty string, so
        # an entry is still appended for it instead of raising.
        children = el.getchildren()
        date_text = children[0].text if children else None
        b.append(unicode(date_text or u"").encode("utf-8"))
# -*- coding: utf-8 -*-
# Scraper fragment for the "spr" section of main.ini.
# For every configured page it follows the 'noline' links that contain
# "id_tema" and stores the first styled paragraph of the linked page.
import HTML
import lxml.html as html
import datetime
import os.path
import smtplib
import main
from os.path import basename
#from pandas import DataFrame
from time import gmtime, strftime

# ReadConfig is project code; here it yields the page addresses to scan
# and a list of e-mail recipients.
addresses, email_list = main.ReadConfig("main.ini", "spr")

# Third "/"-separated piece of the first address
# (presumably the host name when addresses are full URLs -- confirm).
main_domain_stat = addresses[0].split("/")[2]
print(main_domain_stat)

# Current UTC time, formatted as day.month.year hour:minute.
today = strftime("%d.%m.%Y %H:%M", gmtime())

a = []
b = []
table_data = []

for page_link in addresses:
    print(page_link)
    # Disabled hard-coded test target:
    #main_domain_stat = 'http://mosclinic.ru/medcentres'
    #page_link = '%s/%s' % (main_domain_stat, "59")
    page = html.parse(page_link)
    listing_root = page.getroot()
    for el in listing_root.find_class('noline'):
        # Attribute values are taken by position; index 1 is presumably
        # the link target -- verify against the page markup.
        link = el.values()[1]
        if "id_tema" in link:
            page1 = html.parse('%s' % link)
            topic_root = page1.getroot()
            styled_paragraphs = topic_root.findall(".//p[@style]")
            content = styled_paragraphs[0].text_content()
            time = topic_root.findall(".//span[@style]")
            entry = content + "<br>" + link
            a.append(unicode(entry).encode("utf-8"))
# -*- coding: utf-8 -*- import HTML import lxml.html as html import datetime import os.path import smtplib import main from os.path import basename #from pandas import DataFrame from time import gmtime, strftime addresses, email_list = main.ReadConfig("main.ini", "med-otzyv") main_domain_stat = addresses[0].split("/")[2] today = strftime("%d.%m.%Y %H:%M", gmtime()) a = [] b = [] table_data = [] for page_link in addresses: page = html.parse(page_link) #page1.getroot().findall(".//p[@style]")[0].text_content() for el in page.getroot().find_class('comment-body'): a.append( unicode(el.text_content() + "<br>" + page_link).encode("utf-8") ) #(el.text_content().encode("ISO-8859-1"))#.decode("utf-8")) for el in page.getroot().find_class("comment-date"): #('review_date'): b.append( unicode(el.text_content()).encode("utf-8") ) #el.getchildren()[0].text.encode("ISO-8859-1"))#.decode("utf-8")) for i in range(len(a)):
# Attribute listing kept from the original file (looks like dir() output
# of an lxml HTML element -- confirm):
#['__class__', '__contains__', '__copy__', '__deepcopy__', '__delattr__',
# '__delitem__', '__dict__', '__doc__', '__format__', '__getattribute__',
# '__getitem__', '__hash__', '__init__', '__iter__', '__len__',
# '__module__', '__new__', '__nonzero__', '__reduce__', '__reduce_ex__',
# '__repr__', '__reversed__', '__setattr__', '__setitem__', '__sizeof__',
# '__str__', '__subclasshook__', '__weakref__', '_init', '_label__del',
# '_label__get', '_label__set', 'addnext', 'addprevious', 'append',
# 'attrib', 'base', 'base_url', 'body', 'clear', 'cssselect', 'drop_tag',
# 'drop_tree', 'extend', 'find', 'find_class', 'find_rel_links',
# 'findall', 'findtext', 'forms', 'get', 'get_element_by_id',
# 'getchildren', 'getiterator', 'getnext', 'getparent', 'getprevious',
# 'getroottree', 'head', 'index', 'insert', 'items', 'iter',
# 'iterancestors', 'iterchildren', 'iterdescendants', 'iterfind',
# 'iterlinks', 'itersiblings', 'itertext', 'keys', 'label',
# 'make_links_absolute', 'makeelement', 'nsmap', 'prefix', 'remove',
# 'replace', 'resolve_base_href', 'rewrite_links', 'set', 'sourceline',
# 'tag', 'tail', 'text', 'text_content', 'values', 'xpath']
# -*- coding: utf-8 -*-
# Scraper fragment for the "apoi" section of main.ini.
# Collects review texts into `a` and review dates into `b`, both encoded
# as ISO-8859-1 byte strings.
import HTML
import lxml.html as html
import datetime
import os.path
import smtplib
import main
from os.path import basename
#from pandas import DataFrame
from time import gmtime, strftime

# ReadConfig is project code; here it yields the page addresses to scan
# and a list of e-mail recipients.
addresses, email_list = main.ReadConfig("main.ini", "apoi")

# Third "/"-separated piece of the first address
# (presumably the host name when addresses are full URLs -- confirm).
main_domain_stat = addresses[0].split("/")[2]
print(main_domain_stat)

# Current UTC time, formatted as day.month.year hour:minute.
today = strftime("%d.%m.%Y %H:%M", gmtime())

a = []
b = []
table_data = []

for page_link in addresses:
    print(page_link)
    page = html.parse(page_link)
    root = page.getroot()

    # NOTE(review): encoding to ISO-8859-1 presumably undoes a page that
    # was decoded with the wrong charset (the original carried a
    # commented-out ".decode('utf-8')" after it) -- confirm.
    for el in root.find_class('review_text'):
        # lxml sets .text to None when an element has no leading text;
        # use an empty string instead of failing with AttributeError.
        a.append((el.text or u"").encode("ISO-8859-1"))

    for el in root.find_class('review_date'):
        # The date is read from the first child element. A date element
        # without children (or without text) yields an empty string, so
        # an entry is still appended for it instead of raising.
        children = el.getchildren()
        date_text = children[0].text if children else None
        b.append((date_text or u"").encode("ISO-8859-1"))
# Closing statement of the preceding script; username, passwd, tmp and
# send_email are defined outside the visible part of the file.
send_email(
    username,
    passwd,
    "*****@*****.**",
    ["*****@*****.**"],
    "test subject",
    tmp,
)
#mailer.quit()

# -*- coding: utf-8 -*-
# Scraper fragment for the "moskva.tulp" section of main.ini.
# For every configured page it follows the 'noline' links that contain
# "id_tema" and reads the first styled paragraph of the linked page.
import HTML
import lxml.html as html
import datetime
import os.path
import smtplib
import main
from os.path import basename
#from pandas import DataFrame
from time import gmtime, strftime

# ReadConfig is project code; here it yields the page addresses to scan
# and a list of e-mail recipients.
addresses, email_list = main.ReadConfig("main.ini", "moskva.tulp")

# Third "/"-separated piece of the first address
# (presumably the host name when addresses are full URLs -- confirm).
main_domain_stat = addresses[0].split("/")[2]
print(main_domain_stat)

# Current UTC time, formatted as day.month.year hour:minute.
today = strftime("%d.%m.%Y %H:%M", gmtime())

a = []
b = []
table_data = []

for page_link in addresses:
    # Disabled hard-coded test target:
    #main_domain_stat = 'http://mosclinic.ru/medcentres'
    #page_link = '%s/%s' % (main_domain_stat, "59")
    page = html.parse(page_link)
    listing_root = page.getroot()
    for el in listing_root.find_class('noline'):
        # Attribute values are taken by position; index 1 is presumably
        # the link target -- verify against the page markup.
        link = el.values()[1]
        if "id_tema" in link:
            page1 = html.parse('%s' % link)
            styled_paragraphs = page1.getroot().findall(".//p[@style]")
            content = styled_paragraphs[0].text_content()