Code Example #1
File: 爬虫1.py  Project: Yvette-lri/pyproject
def get_blog_content(file_path, result=[]):
    # If the current page returned no results, terminate the program
    if len(result) == 0:
        sys.exit()

    # Iterate over the search results and fetch each blog post's content
    for i in range(0, len(result)):
        h = get_url_html(result[i])
        h = html.fromstring(str(h))
        res_title = h.xpath(
            '//*[@id="mainBox"]/main/div[1]/div/div/div[1]/h1/text()')
        if not res_title:
            continue  # skip pages without a recognizable title
        title = res_title[0]
        res_content = h.xpath('//*[@id="content_views"]')
        if res_content:
            res_content = html.tostring(res_content[0])
            content = HTMLParser().unescape(res_content.decode())
            # Strip the most common HTML tags from the post body
            content = re.sub('<p.*?>', '', content)
            content = re.sub('<div.*?>', '', content)
            content = re.sub('<a.*?>', '', content)
            content = re.sub('</.*?>', '', content)
            content = re.sub('<h.*?>', '', content)
            content = re.sub('<strong>', '', content)
            content = re.sub('<ol>', '', content)
            content = re.sub('<li>', '', content)
            content = re.sub('<br>', '', content)
            content = re.sub('<span.*?>', '', content)
            content = content.strip()
            print(title)
            # Remove slashes so the title can be used as a file name
            title = re.sub('/', '', title)
            with open(file_path + '/' + title + '.txt',
                      mode='w',
                      encoding='utf-8') as file:
                file.write(content)
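Note on all of the examples on this page: HTMLParser().unescape() was deprecated in Python 3.4 and removed in Python 3.9. The standard replacement is html.unescape() from the html module. A minimal sketch of the substitution (in example #1 the name html appears to be bound to lxml's html module, given html.fromstring / html.tostring, so the standard-library module would need an alias there):

import html as html_std  # aliased to avoid clashing with lxml's html in example #1

def unescape_entities(text):
    # html.unescape() is the modern equivalent of HTMLParser().unescape()
    return html_std.unescape(text)

print(unescape_entities("Tom &amp; Jerry &lt;3"))  # -> Tom & Jerry <3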
Code Example #2
def transform_title(title, site):
    """Miscellaneous title transformations.

    Handle some unicode, unescape HTML, simplify hierarchical titles, ...

    """
    title = HTMLParser().unescape(title)
    title = title.strip()
    title = parse_fancy_titles(title, site)
    return title
Code Example #3
def clean_str(string):
    # Strip HTML tags
    dr = re.compile(r'<[^>]+>', re.S)
    string = dr.sub('', string)
    # Normalize English punctuation to its full-width form
    for c in en_punctuation_set:
        if c in string:
            string = string.replace(c, semi_angle_to_sbc(c))
    # Decode HTML special characters (entities)
    string = HTMLParser().unescape(string)
    # Convert all letters to lowercase
    string = string.lower()
    # Collapse repeated punctuation marks
    string = clean_redundant(string, '?')
    string = clean_redundant(string, ',')
    string = clean_redundant(string, '……')
    string = clean_redundant(string, '。')
    return string.strip()
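This snippet relies on three helpers that are not shown: en_punctuation_set, semi_angle_to_sbc (half-width to full-width conversion), and clean_redundant (collapsing repeated marks). A hypothetical sketch of what they might look like, purely as an assumption about the missing code:

import re

en_punctuation_set = set(",.!?;:")  # assumed contents; not from the original module

def semi_angle_to_sbc(char):
    # Full-width forms sit at a fixed offset of 0xFEE0 from printable ASCII;
    # the full-width space (U+3000) is the special case.
    if char == " ":
        return "\u3000"
    code = ord(char)
    return chr(code + 0xFEE0) if 0x21 <= code <= 0x7E else char

def clean_redundant(string, mark):
    # Collapse runs of the same mark ("???" -> "?", "。。" -> "。").
    return re.sub("(?:%s){2,}" % re.escape(mark), mark, string)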
Code Example #4
File: xmlformatter-gtk.py  Project: danesc87/scripts
class XMLParser(object):
    '''Class that tries to parse an XML file, using zenity to display any errors in the process'''
    def __init__(self):
        self.file_name = None
        self.xml_to_be_parsed = None

    def get_file(self):
        try:
            self.file_name = sys.argv[1]
        except IndexError:
            os.system(
                'zenity --info --title="Error" --text="You must specify a file name!"'
            )

    def parse_xml(self):
        # Guard against get_file() having failed to set a file name
        if self.file_name and os.path.isfile(self.file_name) and os.access(
                self.file_name, os.R_OK):
            with open(self.file_name, 'r') as read_only_file:
                self.xml_to_be_parsed = HTMLParser().unescape(
                    read_only_file.read())
            try:
                self.xml_to_be_parsed = minidom.parseString(
                    self.xml_to_be_parsed).toprettyxml()
            except ExpatError as bad_xml:
                os.system(
                    'zenity --info --title="Error Parsing File" --text="XML file %s"'
                    % str(bad_xml))
            # Work around minidom adding extra whitespace before/after
            # CDATA sections
            self.xml_to_be_parsed = re.sub(r'>\s+<!', '><!',
                                           self.xml_to_be_parsed)
            self.xml_to_be_parsed = re.sub(r']>\s+<', ']><',
                                           self.xml_to_be_parsed)
            with open(self.file_name, 'w') as writable_file:
                writable_file.write("".join([
                    s for s in self.xml_to_be_parsed.strip().splitlines(True)
                    if s.strip()
                ]))
        else:
            os.system(
                'zenity --info --title="Error" --text="File is missing or is not readable!"'
            )
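The snippet only defines the class; the driver code is not part of the excerpt. A plausible entry point, assuming the script is invoked as "python xmlformatter-gtk.py somefile.xml" (this main block is an assumption, not taken from the original file):

if __name__ == "__main__":
    # Hypothetical driver: read the target path from argv, then pretty-print it in place.
    formatter = XMLParser()
    formatter.get_file()
    formatter.parse_xml()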
Code Example #5
File: admin.py  Project: nathan0/falco
def zeroclick(irc, source, msgtarget, args):
    params = {"q":args[0]}
    url = "http://duckduckgo.com/lite/?"
    #try:
    data = requests.get(url, params=params).content.decode()
    search = re.findall(r"""\t<td>.\t\s+(.*?).</td>""", data, re.M | re.DOTALL)
    if search:
        answer = HTMLParser().unescape(search[-1].replace("<br>"," ").replace("<code>"," ").replace("</code>"," "))
        answer = re.sub("<[^<]+?>"," ",answer)
    out = re.sub(r"\s+", " ", answer.strip())
        if out:
            #if len(out.split(" More at")[0].split("}")[-1].strip()) < 400:
            irc.msg(msgtarget, out.split(" More at")[0].split("}")[-1].strip())
            #else:
            #    irc.msg(source.split("!")[0], out.split(" More at")[0].split("}")[-1].strip())
        else: 
            irc.msg(msgtarget, "No results")
    else:
        irc.msg(msgtarget, "No results found.")
Code Example #6
File: admin.py  Project: astravexton/falco
def zeroclick(irc, source, msgtarget, args):
    params = {"q": args[0]}
    url = "http://duckduckgo.com/lite/?"
    #try:
    data = requests.get(url, params=params).content.decode()
    search = re.findall(r"""\t<td>.\t\s+(.*?).</td>""", data, re.M | re.DOTALL)
    if search:
        answer = HTMLParser().unescape(search[-1].replace("<br>", " ").replace(
            "<code>", " ").replace("</code>", " "))
        answer = re.sub("<[^<]+?>", " ", answer)
        out = re.sub(r"\s+", " ", answer.strip())
        if out:
            #if len(out.split(" More at")[0].split("}")[-1].strip()) < 400:
            irc.msg(msgtarget, out.split(" More at")[0].split("}")[-1].strip())
            #else:
            #    irc.msg(source.split("!")[0], out.split(" More at")[0].split("}")[-1].strip())
        else:
            irc.msg(msgtarget, "No results")
    else:
        irc.msg(msgtarget, "No results found.")
Code Example #7
def get_fingerprint(torrent_name):
    """
    Tries to obtain a fingerprint from the torrent name that will uniquely
    identify its group (TV show).
    """

    # Minimize typing differences
    torrent_name = torrent_name.replace("ё", "е")

    # Unescape HTML entities
    torrent_name = HTMLParser().unescape(torrent_name)

    # Drop all tags
    torrent_name = re.sub(r"</?[a-z]+>", "", torrent_name)

    # Drop any additional info: timestamps, release versions, etc.
    # -->
    square_braces_regex = re.compile(r"^(.+(?:\s+|\)))\[[^\[\]]+?\](.*)$")
    preceding_square_braces_regex = re.compile(r"^(\s*)\[[^\[\]]+?\](.+)$")
    round_braces_regex = re.compile(r"^(.+(?:\s+|\]))\([^()]+?\)(.*)$")
    angle_braces_regex = re.compile(r"^(.+)\s+<<.*?>>(.*)$")
    date_regex = re.compile(
        r"^(.+)\s+(?:\d{1,2}\.\d{1,2}\.\d{4}|\d{4}\.\d{2}\.\d{2})(.*)$")
    # Unable to merge it into date_regex due to some strange behaviour of the
    # re module.
    additional_date_regex = re.compile(
        r"^(.+)\s+(?:по|от)\s+(?:\d{1,2}\.\d{1,2}\.\d{4}|\d{4}\.\d{2}\.\d{2})(.*)$"
    )
    release_counter_regex = re.compile(r"^(.+)\s+\d+\s*(?:в|из)\s*\d+(.*)$")

    old_torrent_name = None
    while torrent_name != old_torrent_name:
        old_torrent_name = torrent_name

        for regex in (
                additional_date_regex,
                date_regex,
                preceding_square_braces_regex,
                square_braces_regex,
                round_braces_regex,
                angle_braces_regex,
                release_counter_regex,
        ):
            torrent_name = regex.sub(r"\1\2", torrent_name.strip(" .,"))

    torrent_name = re.sub(r"\s+/.*", "", torrent_name)
    # <--

    # We need all names in lowercase for easier analysis
    torrent_name = torrent_name.lower()

    # Try to get most possible short fingerprint -->
    torrent_name = re.sub(r"^(national\s+geographic\s*:|наука\s+2\.0)\s+", "",
                          torrent_name)

    torrent_name = re.sub(r"^«([^»]{6,})»", r"\1", torrent_name)

    torrent_name = re.sub(r'^"([^»]{6,})"', r"\1", torrent_name)

    torrent_name = re.sub(
        r"^([0-9a-zабвгдеёжзийклмнопрстуфхцчшщьъыэюя., \-:]{6,}?(?:[:.?!]| - | — |\|)).*",
        r"\1", torrent_name)
    # Try to get most possible short fingerprint <--

    # Drop all punctuation and other non-alphabet characters
    characters = "abcdefghijklmnopqrstuvwxyzабвгдеёжзийклмнопрстуфхцчшщьъыэюя"
    torrent_name = torrent_name.replace(".", " ")
    torrent_name = "".join(c for c in torrent_name if c in " " + characters)

    # Drop any additional info: timestamps, release versions, etc.
    # -->
    torrent_name = torrent_name.replace("г.", "")
    while True:
        new_torrent_name = re.sub(
            r"(?:\s|\()(:?выпуск|выпуски|выпусков|обновлено|передачи за|серия из|сезон|серия|серии|премьера|эфир с|эфир от|эфиры от|satrip)(?:\s|\)|$)",
            "", torrent_name)
        if new_torrent_name == torrent_name:
            break
        torrent_name = new_torrent_name

    for month in (
            "январь",
            "января",
            "февраль",
            "февраля",
            "март",
            "марта",
            "апрель",
            "апреля",
            "май",
            "мая",
            "июнь",
            "июня",
            "июль",
            "июля",
            "август",
            "августа",
            "сентябрь",
            "сентября",
            "октябрь",
            "октября",
            "ноябрь",
            "ноября",
            "декабрь",
            "декабря",
    ):
        torrent_name = re.sub(r"\b" + month + r"\b", "", torrent_name)
    # <--

    # Drop several spaces
    torrent_name = re.sub(r"\s+", " ", torrent_name).strip()

    return torrent_name.strip()
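The docstring says the fingerprint is meant to identify a torrent's group (TV show). A hedged sketch of how such a fingerprint could be used to bucket raw names by show; the helper below is an illustration and not part of the project:

from collections import defaultdict

def group_by_show(torrent_names):
    # Hypothetical helper: releases whose names normalize to the same
    # fingerprint are treated as belonging to the same show.
    groups = defaultdict(list)
    for name in torrent_names:
        groups[get_fingerprint(name)].append(name)
    return groups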
Code Example #8
def strip_html(text):
    result = re.sub(r"\<.*?>", " ", text, 0, re.MULTILINE)
    result = HTMLParser().unescape(result)
    result = " ".join(result.split())
    return result.strip()
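A quick usage example for strip_html (the input string is illustrative):

print(strip_html("<p>Fish &amp; Chips<br>to go</p>"))  # -> Fish & Chips to go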
Code Example #9
File: torrents.py  Project: psyvisions/rutracker.rss
def get_fingerprint(torrent_name):
    """
    Tries to obtain a fingerprint from the torrent name that will uniquely
    identify its group (TV show).
    """

    # Minimize typing differences
    torrent_name = torrent_name.replace("ё", "е")

    # Unescape HTML entities
    torrent_name = HTMLParser().unescape(torrent_name)

    # Drop all tags
    torrent_name = re.sub(r"</?[a-z]+>", "", torrent_name)

    # Drop any additional info: timestamps, release versions, etc.
    # -->
    square_braces_regex = re.compile(r"^(.+(?:\s+|\)))\[[^\[\]]+?\](.*)$")
    preceding_square_braces_regex = re.compile(r"^(\s*)\[[^\[\]]+?\](.+)$")
    round_braces_regex = re.compile(r"^(.+(?:\s+|\]))\([^()]+?\)(.*)$")
    angle_braces_regex = re.compile(r"^(.+)\s+<<.*?>>(.*)$")
    date_regex = re.compile(r"^(.+)\s+(?:\d{1,2}\.\d{1,2}\.\d{4}|\d{4}\.\d{2}\.\d{2})(.*)$")
    # Unable to merge it into date_regex due to some strange behaviour of the
    # re module.
    additional_date_regex = re.compile(r"^(.+)\s+(?:по|от)\s+(?:\d{1,2}\.\d{1,2}\.\d{4}|\d{4}\.\d{2}\.\d{2})(.*)$")
    release_counter_regex = re.compile(r"^(.+)\s+\d+\s*(?:в|из)\s*\d+(.*)$")

    old_torrent_name = None
    while torrent_name != old_torrent_name:
        old_torrent_name = torrent_name

        for regex in (
            additional_date_regex,
            date_regex,
            preceding_square_braces_regex,
            square_braces_regex,
            round_braces_regex,
            angle_braces_regex,
            release_counter_regex,
        ):
            torrent_name = regex.sub(r"\1\2", torrent_name.strip(" .,"))

    torrent_name = re.sub(r"\s+/.*", "", torrent_name)
    # <--

    # We need all names in lowercase for easier analysis
    torrent_name = torrent_name.lower()

    # Try to get most possible short fingerprint -->
    torrent_name = re.sub(
        r"^«([^»]{6,})»", r"\1", torrent_name)

    torrent_name = re.sub(
        r'^"([^»]{6,})"', r"\1", torrent_name)

    torrent_name = re.sub(
        r"^([0-9a-zабвгдеёжзийклмнопрстуфхцчшщьъыэюя., \-:]{6,}?(?:[:.?!]| - | — |\|)).*", r"\1", torrent_name)
    # Try to get most possible short fingerprint <--

    # Drop all punctuation and other non-alphabet characters
    characters = "abcdefghijklmnopqrstuvwxyzабвгдеёжзийклмнопрстуфхцчшщьъыэюя"
    torrent_name = torrent_name.replace(".", " ")
    torrent_name = "".join(
        c for c in torrent_name if c in " " + characters)

    # Drop any additional info: timestamps, release versions, etc.
    # -->
    torrent_name = torrent_name.replace("г.", "")
    while True:
        new_torrent_name = re.sub(r"(?:\s|\()(:?выпуск|выпуски|выпусков|обновлено|передачи за|серия из|сезон|серия|серии|премьера|эфир с|эфир от|эфиры от|satrip)(?:\s|\)|$)", "", torrent_name)
        if new_torrent_name == torrent_name:
            break
        torrent_name = new_torrent_name

    for month in (
        "январь",   "января",
        "февраль",  "февраля",
        "март",     "марта",
        "апрель",   "апреля",
        "май",      "мая",
        "июнь",     "июня",
        "июль",     "июля",
        "август",   "августа",
        "сентябрь", "сентября",
        "октябрь",  "октября",
        "ноябрь",   "ноября",
        "декабрь",  "декабря",
    ):
        torrent_name = torrent_name.replace(month, "")
    # <--

    # Drop several spaces
    torrent_name = re.sub(r"\s+", " ", torrent_name).strip()

    return torrent_name.strip()