Ejemplo n.º 1
0
# -*- coding: utf-8 -*-
import HTML
import lxml.html as html
import datetime
import os.path
import smtplib
import main

from os.path import basename
#from pandas import DataFrame
from time import gmtime, strftime

# Read the page addresses and a second value named email_list from the
# "mosclinic" section of main.ini. ReadConfig lives in the project-local
# `main` module (not shown); email_list is presumably the mail recipients and
# is not used in the visible part of this script.
addresses, email_list = main.ReadConfig("main.ini", "mosclinic")
# Third "/"-separated piece of the first address -- the host name, assuming
# the addresses look like "scheme://host/..." (TODO confirm against main.ini).
main_domain_stat = addresses[0].split("/")[2]
print main_domain_stat
# Timestamp of this run in UTC (gmtime), formatted as "DD.MM.YYYY HH:MM".
today = strftime("%d.%m.%Y %H:%M", gmtime())
# Accumulators; nothing in the visible part of the script appends to them.
a = []
b = []
table_data = []
# Visit every configured page and follow the review links found on it.
for page_link in addresses:
    page = html.parse(page_link)

    #for el in page.getroot().find_class('noline'):
    for el in page.getroot().find_class('margin15 font_arial12 as_a2'):
        # The link is picked by attribute position (third attribute value),
        # not by name -- presumably the href; this relies on the site's markup
        # keeping its attribute order.
        link = el.values()[2]
        # Only links containing "medreview" are followed.
        if "medreview" in link:
            page1 = html.parse('%s' % (link))
            # Text of the first element matching the class string on the
            # linked page; the [0] raises IndexError when nothing matches.
            content = page1.getroot().find_class(
                'margin15 font_arial12')[0].text_content()
            #imgs = page1.getroot().findall(".//img[@style]")
            # All <meta> elements carrying an itemprop attribute; judging by
            # the name these hold the dates, but their use is not visible here.
            dates = page1.getroot().findall(".//meta[@itemprop]")
Ejemplo n.º 2
0
# -*- coding: utf-8 -*-
import HTML
import lxml.html as html
import datetime
import os.path
import smtplib
import main

from os.path import basename
#from pandas import DataFrame
from time import gmtime, strftime

# Load the page addresses (and a second value, email_list, which this part of
# the script does not use) from the "yell" section of main.ini.
addresses, email_list = main.ReadConfig("main.ini", "yell")
# Third "/"-separated piece of the first address -- the host name, assuming
# the addresses look like "scheme://host/..." (TODO confirm).
first_address = addresses[0]
main_domain_stat = first_address.split("/")[2]
print(main_domain_stat)
# Timestamp of this run in UTC, formatted as "DD.MM.YYYY HH:MM".
today = strftime("%d.%m.%Y %H:%M", gmtime())
# a collects review texts, b collects review dates; table_data stays empty in
# this part of the script.
a, b, table_data = [], [], []
for page_link in addresses:
    print(page_link)
    page = html.parse(page_link)
    root = page.getroot()

    # Each review text gets a "<br>" and the page address appended, then is
    # stored as UTF-8 bytes.
    for el in root.find_class('review_text'):
        review_html = el.text + "<br>" + page_link
        a.append(unicode(review_html).encode("utf-8"))

    # The date is read from the text of the first child of each
    # 'review_date' element and stored as UTF-8 bytes.
    for el in root.find_class('review_date'):
        date_node = el.getchildren()[0]
        b.append(unicode(date_node.text).encode("utf-8"))
Ejemplo n.º 3
0
# -*- coding: utf-8 -*-
import HTML
import lxml.html as html
import datetime
import os.path
import smtplib
import main

from os.path import basename
#from pandas import DataFrame
from time import gmtime, strftime

# Read the page addresses and a second value named email_list from the "spr"
# section of main.ini. ReadConfig lives in the project-local `main` module
# (not shown); email_list is presumably the mail recipients and is not used in
# the visible part of this script.
addresses, email_list = main.ReadConfig("main.ini", "spr")
# Third "/"-separated piece of the first address -- the host name, assuming
# the addresses look like "scheme://host/..." (TODO confirm against main.ini).
main_domain_stat = addresses[0].split("/")[2]
print main_domain_stat
# Timestamp of this run in UTC (gmtime), formatted as "DD.MM.YYYY HH:MM".
today = strftime("%d.%m.%Y %H:%M", gmtime())
# a receives one UTF-8 encoded entry per followed link (see the loop below);
# b and table_data are not touched in the visible part of the script.
a = []
b = []
table_data = []
# Visit every configured page and follow the topic links found on it.
for page_link in addresses:
    print page_link
    #main_domain_stat = 'http://mosclinic.ru/medcentres'
    #page_link = '%s/%s' % (main_domain_stat, "59")
    page = html.parse(page_link)
    for el in page.getroot().find_class('noline'):
        # The link is picked by attribute position (second attribute value),
        # not by name -- presumably the href; this relies on the site's markup
        # keeping its attribute order.
        link = el.values()[1]
        # Only links containing "id_tema" are followed.
        if "id_tema" in link:
            page1 = html.parse('%s' % (link))
            # Text of the first <p> that has a style attribute; the [0] raises
            # IndexError when the linked page has no such element.
            content = page1.getroot().findall(".//p[@style]")[0].text_content()
            # All <span> elements with a style attribute; not used in the
            # visible part of the script.
            time = page1.getroot().findall(".//span[@style]")
            # Store the text, a literal "<br>" and the link as UTF-8 bytes.
            a.append(unicode(content + "<br>" + link).encode("utf-8"))
Ejemplo n.º 4
0
# -*- coding: utf-8 -*-
import HTML
import lxml.html as html
import datetime
import os.path
import smtplib
import main

from os.path import basename
#from pandas import DataFrame
from time import gmtime, strftime

# Load the page addresses (and a second value, email_list, which this part of
# the script does not use) from the "med-otzyv" section of main.ini.
addresses, email_list = main.ReadConfig("main.ini", "med-otzyv")
# Third "/"-separated piece of the first address -- the host name, assuming
# the addresses look like "scheme://host/..." (TODO confirm).
first_address = addresses[0]
main_domain_stat = first_address.split("/")[2]
# Timestamp of this run in UTC, formatted as "DD.MM.YYYY HH:MM".
today = strftime("%d.%m.%Y %H:%M", gmtime())
# a collects comment bodies, b collects comment dates; table_data stays empty
# in this part of the script.
a, b, table_data = [], [], []
for page_link in addresses:
    page = html.parse(page_link)
    root = page.getroot()

    # Each comment body gets a "<br>" and the page address appended, then is
    # stored as UTF-8 bytes.
    for el in root.find_class('comment-body'):
        comment_html = el.text_content() + "<br>" + page_link
        a.append(unicode(comment_html).encode("utf-8"))

    # The full text of every 'comment-date' element, stored as UTF-8 bytes.
    for el in root.find_class("comment-date"):
        date_text = el.text_content()
        b.append(unicode(date_text).encode("utf-8"))

for i in range(len(a)):
Ejemplo n.º 5
0
#['__class__', '__contains__', '__copy__', '__deepcopy__', '__delattr__', '__delitem__', '__dict__', '__doc__', '__format__', '__getattribute__', '__getitem__', '__hash__', '__init__', '__iter__', '__len__', '__module__', '__new__', '__nonzero__', '__reduce__', '__reduce_ex__', '__repr__', '__reversed__', '__setattr__', '__setitem__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_init', '_label__del', '_label__get', '_label__set', 'addnext', 'addprevious', 'append', 'attrib', 'base', 'base_url', 'body', 'clear', 'cssselect', 'drop_tag', 'drop_tree', 'extend', 'find', 'find_class', 'find_rel_links', 'findall', 'findtext', 'forms', 'get', 'get_element_by_id', 'getchildren', 'getiterator', 'getnext', 'getparent', 'getprevious', 'getroottree', 'head', 'index', 'insert', 'items', 'iter', 'iterancestors', 'iterchildren', 'iterdescendants', 'iterfind', 'iterlinks', 'itersiblings', 'itertext', 'keys', 'label', 'make_links_absolute', 'makeelement', 'nsmap', 'prefix', 'remove', 'replace', 'resolve_base_href', 'rewrite_links', 'set', 'sourceline', 'tag', 'tail', 'text', 'text_content', 'values', 'xpath']
  
  
  # -*- coding: utf-8 -*-
import HTML
import lxml.html as html
import datetime
import os.path
import smtplib
import main

from os.path import basename
#from pandas import DataFrame
from time import gmtime, strftime

# Load the page addresses (and a second value, email_list, which this part of
# the script does not use) from the "apoi" section of main.ini.
addresses, email_list = main.ReadConfig("main.ini", "apoi")
# Third "/"-separated piece of the first address -- the host name, assuming
# the addresses look like "scheme://host/..." (TODO confirm).
first_address = addresses[0]
main_domain_stat = first_address.split("/")[2]
print(main_domain_stat)
# Timestamp of this run in UTC, formatted as "DD.MM.YYYY HH:MM".
today = strftime("%d.%m.%Y %H:%M", gmtime())
# a collects review texts, b collects review dates; table_data stays empty in
# this part of the script.
a, b, table_data = [], [], []
for page_link in addresses:
    print(page_link)
    page = html.parse(page_link)
    root = page.getroot()

    # NOTE(review): encoding to ISO-8859-1 raises on characters outside
    # Latin-1 -- confirm this is intended (e.g. to undo a mis-decoded page).
    for el in root.find_class('review_text'):
        review_text = el.text
        a.append(review_text.encode("ISO-8859-1"))

    # The date is read from the text of the first child of each
    # 'review_date' element.
    for el in root.find_class('review_date'):
        date_node = el.getchildren()[0]
        b.append(date_node.text.encode("ISO-8859-1"))
Ejemplo n.º 6
0
    send_email(username, passwd, "*****@*****.**", ["*****@*****.**"],
               "test subject", tmp)
    #mailer.quit()
# -*- coding: utf-8 -*-
import HTML
import lxml.html as html
import datetime
import os.path
import smtplib
import main

from os.path import basename
#from pandas import DataFrame
from time import gmtime, strftime

# Read the page addresses and a second value named email_list from the
# "moskva.tulp" section of main.ini. ReadConfig lives in the project-local
# `main` module (not shown); email_list is presumably the mail recipients and
# is not used in the visible part of this script.
addresses, email_list = main.ReadConfig("main.ini", "moskva.tulp")
# Third "/"-separated piece of the first address -- the host name, assuming
# the addresses look like "scheme://host/..." (TODO confirm against main.ini).
main_domain_stat = addresses[0].split("/")[2]
print main_domain_stat
# Timestamp of this run in UTC (gmtime), formatted as "DD.MM.YYYY HH:MM".
today = strftime("%d.%m.%Y %H:%M", gmtime())
# Accumulators; nothing in the visible part of the script appends to them.
a = []
b = []
table_data = []
# Visit every configured page and follow the topic links found on it.
for page_link in addresses:
    #main_domain_stat = 'http://mosclinic.ru/medcentres'
    #page_link = '%s/%s' % (main_domain_stat, "59")
    page = html.parse(page_link)
    for el in page.getroot().find_class('noline'):
        # The link is picked by attribute position (second attribute value),
        # not by name -- presumably the href; this relies on the site's markup
        # keeping its attribute order.
        link = el.values()[1]
        # Only links containing "id_tema" are followed.
        if "id_tema" in link:
            page1 = html.parse('%s' % (link))
            # Text of the first <p> that has a style attribute; the [0] raises
            # IndexError when the linked page has no such element.
            content = page1.getroot().findall(".//p[@style]")[0].text_content()