Example #1
0
def replace_html_codes(txt):
    """Return ``txt`` with HTML character references decoded.

    The input is first passed through ``to_utf8`` (defined elsewhere in the
    project), numeric references that lack their terminating ``;`` are
    repaired, and the result is HTML-unescaped.  A final pass decodes any
    ``&quot;`` / ``&amp;`` that are still present after unescaping, i.e.
    input that was escaped twice.

    :param txt: text that may contain HTML entities
    :return: the decoded text
    """
    # HTMLParser.unescape() was removed in Python 3.9; prefer html.unescape()
    # and only fall back to the legacy method where ``html`` does not exist.
    try:
        from html import unescape
    except ImportError:  # Python 2
        unescape = HTMLParser().unescape

    txt = to_utf8(txt)
    # Insert the missing ';' after numeric references such as "&#39 ".
    txt = re.sub(r"(&#[0-9]+)([^;^0-9]+)", "\\1;\\2", txt)
    txt = unescape(txt)
    # Entities left over from double-escaped input ("&amp;quot;" etc.).
    txt = txt.replace("&quot;", "\"")
    txt = txt.replace("&amp;", "&")
    return txt
Example #2
0
def normalize(text, emoticon=False, repeat=None):
    """Unescape HTML entities in ``text`` and normalise it.

    Carriage returns are always turned into newlines.

    :param text: input string, possibly containing HTML entities
    :param emoticon: only when this is exactly ``False`` are emoticons
        removed (``remove_emoticon``), the text passed through
        ``jaconv.h2z`` and the trailing small/large "o" after "よ" collapsed.
        Any other value skips all three steps.
    :param repeat: when truthy, forwarded to ``shorten_repeat`` together
        with the text
    :return: the normalised text
    """
    # HTMLParser.unescape() was removed in Python 3.9; html.unescape() is the
    # supported replacement and returns the same result.
    from html import unescape

    text = unescape(text)
    text = text.replace('\r', '\n')
    if emoticon is False:
        text = remove_emoticon(text)
        text = jaconv.h2z(text)
        # Each replacement is applied twice so that a doubled suffix
        # (e.g. "よぉぉ") is also reduced to "よ".
        text = text.replace('よぉ', 'よ').replace('よぉ', 'よ')
        text = text.replace('よお', 'よ').replace('よお', 'よ')
    if repeat:
        text = shorten_repeat(text, repeat)
    return text
Example #3
0
def normalize(text, emoticon=False, repeat=None):
    """Unescape HTML entities in ``text`` and normalise it.

    Carriage returns are always turned into newlines.

    :param text: input string, possibly containing HTML entities
    :param emoticon: only when this is exactly ``False`` are emoticons
        removed (``remove_emoticon``), the text passed through
        ``jaconv.h2z`` and the trailing small/large "o" after "よ" collapsed.
        Any other value skips all three steps.
    :param repeat: when truthy, forwarded to ``shorten_repeat`` together
        with the text
    :return: the normalised text
    """
    # HTMLParser.unescape() was removed in Python 3.9; html.unescape() is the
    # supported replacement and returns the same result.
    from html import unescape

    text = unescape(text)
    text = text.replace('\r', '\n')
    if emoticon is False:
        text = remove_emoticon(text)
        text = jaconv.h2z(text)
        # Each replacement is applied twice so that a doubled suffix
        # (e.g. "よぉぉ") is also reduced to "よ".
        text = text.replace('よぉ', 'よ').replace('よぉ', 'よ')
        text = text.replace('よお', 'よ').replace('よお', 'よ')
    if repeat:
        text = shorten_repeat(text, repeat)
    return text
Example #4
0
def normalize(text, emoticon=False, repeat=None):
    """Unescape HTML entities in ``text`` and normalise it.

    Carriage returns are always turned into newlines.

    :param text: input string, possibly containing HTML entities
    :param emoticon: only when this is exactly ``False`` are useless symbols
        removed (``remove_useless_symbol``), the ``γ⌒ヽ`` sequence dropped,
        the text passed through ``jctconv.h2z`` and the trailing small/large
        "o" after "よ" collapsed.  Any other value skips these steps.
    :param repeat: when truthy, forwarded to ``shorten_repeat`` together
        with the text
    :return: the normalised text
    """
    # HTMLParser.unescape() was removed in Python 3.9; prefer html.unescape()
    # and only fall back to the legacy method where ``html`` does not exist.
    try:
        from html import unescape
    except ImportError:  # Python 2
        unescape = HTMLParser().unescape

    text = unescape(text)
    text = text.replace('\r', '\n')
    if emoticon is False:
        text = remove_useless_symbol(text)
        text = text.replace(u'γ⌒ヽ', '')
        text = jctconv.h2z(text)
        # Each replacement is applied twice so that a doubled suffix
        # (e.g. "よぉぉ") is also reduced to "よ".
        text = text.replace(u'よぉ', u'よ').replace(u'よぉ', u'よ')
        text = text.replace(u'よお', u'よ').replace(u'よお', u'よ')
    if repeat:
        text = shorten_repeat(text, repeat)
    return text
Example #5
0
def to_csv_row(category, scraped_regex):
    """Build a CSV row dict from a scraped regex element.

    :param category: value stored under the ``'category'`` key
    :param scraped_regex: sequence whose first item exposes the raw,
        HTML-escaped regex through a ``.text`` attribute
    :return: ``{'category': ..., 'regex': ...}`` with the regex unescaped,
        stripped of spaces and of one pair of surrounding double quotes;
        ``None`` when the regex contains a newline or when any exception is
        raised while processing it (the exception is printed)
    """
    # HTMLParser.unescape() was removed in Python 3.9; html.unescape() is the
    # supported replacement.
    from html import unescape

    row = {'category': category}

    try:
        # UTF-8 round trip: leaves valid text unchanged and raises for text
        # that cannot be encoded, which the handler below reports.
        regex_bytes = bytes(scraped_regex[0].text, encoding='utf-8')
        regex_text = str(regex_bytes, encoding='utf-8')
        unescaped_regex = unescape(regex_text)

        # Data quality check: skip regexes that contain new lines.
        if "\n" in unescaped_regex:
            return None

        clean_regex = unescaped_regex.replace(" ", "")
        # More cleaning: remove optional double quotes surrounding regex.
        if clean_regex.startswith('"') and clean_regex.endswith('"'):
            clean_regex = clean_regex[1:-1]
        row['regex'] = clean_regex
    except Exception as e:
        # Escaping won't throw exceptions for the included html files.
        template = 'Exception while escaping regex: type: {0}, args:\n{1!r}'
        msg = template.format(type(e).__name__, e.args)
        print(msg)
        return None

    return row
Example #6
0
def get_filename_from_title(title, ext='.m4a'):
    """
    Creates a filename from title

    :param title: track title, possibly containing HTML entities; a falsy
        title yields the default name ``'music'``
    :param ext: extension appended to the result (including the dot)
    :return: the unescaped title with every entry of ``FILENAME_EXCLUDE``
        replaced by a space, followed by ``ext``
    """
    # HTMLParser.unescape() was removed in Python 3.9; html.unescape() is the
    # supported replacement.
    from html import unescape

    if not title:
        return 'music' + ext
    title = unescape(title)
    for excluded in FILENAME_EXCLUDE:
        title = title.replace(excluded, ' ')  # provide readability with space
    return title + ext  # TODO - smart hunt
Example #7
0
def normalize_string(string, charset=None, replacing=False):
    """
    Decode any string, convert it to unicode and lower-case it.

    Non-unicode input goes through a chain of decoding attempts (see the
    inline comments); the result is then cleaned by several project helpers,
    stripped of CDATA markers, HTML-unescaped and lower-cased.

    :param charset: encoding passed to ``json.loads``
    :type charset: str
    :param string: string to convert
    :type string: str or unicode
    :param replacing: whether apostrophes (') are removed from the result
    :type replacing: bool
    :return: converted, lower-cased unicode; ``u''`` when a ``LookupError``
        is raised while decoding
    :rtype: unicode
    """
    # Python 2 only: relies on the ``unicode`` builtin.
    if not isinstance(string, unicode):
        try:
            # An "=XX" hex pair is taken as a sign of quoted-printable data.
            if re.search(u'=[0-9a-fA-F]{2}', string):
                string = py2_decode(string, 'Quoted-printable')

            # First attempt: interpret the text as a JSON document.
            string = json.loads(u'%s' % string, encoding=charset)

        except ValueError:
            # Not valid JSON (or not decodable): fall back to evaluating the
            # text as a Python literal.
            try:
                # SECURITY NOTE(review): eval() executes arbitrary
                # expressions; this is unsafe if ``string`` can come from an
                # untrusted source.
                string = unicode(eval(string), 'raw_unicode_escape')

            except (SyntaxError, NameError):
                # Not a Python expression either: decode as latin-1.
                string = py2_decode(string, 'latin-1')
                pass

            except TypeError:
                string = unicode(string, errors='ignore')
                pass

        except LookupError:
            # Presumably an unknown codec/charset name - TODO confirm.
            return u''

        except TypeError:
            string = unicode(string, errors='ignore')
            pass

    # Helpers defined elsewhere in the project; behaviour inferred from
    # their names only - verify against their definitions.
    string = remove_control_chars(string)
    string = fix_bad_unicode(string)
    string = unquote(string)
    # NOTE(review): only "]]" is removed, so the ">" of a "]]>" CDATA
    # terminator stays in the text.
    string = string.replace(u'<![CDATA[', u'').replace(u']]', u'')
    string = HTMLParser().unescape(string)
    if replacing:
        string = string.replace(u"'", '')

    string = string.lower()

    return string
Example #8
0
    def _ircfy_tweet(self, tweet):
        """Takes a twitter status and outputs irc message.

        For a retweet the text is rebuilt as ``RT @<author>: <original text>``
        from ``tweet.retweeted_status``.  HTML entities are unescaped, the
        author's screen name is prefixed (wrapped in ``IRC_BOLD``), line
        breaks are flattened and the URLs are expanded through
        ``_handle_url_expansion``.

        :param tweet: status object exposing ``text``, ``urls``,
            ``retweeted_status`` and ``user.screen_name``
        :return: the formatted IRC message
        """
        # HTMLParser.unescape() was removed in Python 3.9.  With the former
        # bare ``except`` the resulting AttributeError was swallowed and the
        # message silently stayed escaped, so resolve a working function
        # up front.
        try:
            from html import unescape
        except ImportError:  # Python 2
            unescape = HTMLParser().unescape

        message = tweet.text
        urls = tweet.urls
        if tweet.retweeted_status:
            #HACK: because iPhone sucks and does not correctly handle RT
            message = "RT @{0}: {1}".format(
                tweet.retweeted_status.user.screen_name,
                tweet.retweeted_status.text)
            # NOTE(review): ``urls`` was re-assigned to ``tweet.urls`` here,
            # which changes nothing; possibly the retweeted status' URLs
            # were intended - confirm.
        try:
            message = unescape(message)
        except Exception:
            # Best effort: keep the raw message, but do not trap
            # KeyboardInterrupt/SystemExit as a bare ``except`` would.
            self.log.exception("Unable to escape message %r", message)

        message = "{surround}{screen_name}{surround}: {message}".format(
            surround=IRC_BOLD,
            screen_name=tweet.user.screen_name,
            message=message)
        message = message.replace('\r', '').replace('\n', '  ')
        urls = self._urls_to_dict(urls)
        # 440 is presumably the maximum message length - verify against
        # _handle_url_expansion.
        message = self._handle_url_expansion(message, urls, 440)
        return message
Example #9
0
def clean_html(content):
    """Return ``content`` with HTML entities decoded and NBSPs as spaces.

    :param content: text that may contain HTML character references
    :return: the unescaped text in which every non-breaking space
        (``\\xa0``, e.g. from ``&nbsp;``) is replaced by a regular space
    """
    # HTMLParser.unescape() was removed in Python 3.9; html.unescape() is the
    # supported replacement.
    from html import unescape

    content = unescape(content)
    content = content.replace('\xa0', ' ')
    return content
Example #10
0
def get_fingerprint(torrent_name):
    """
    Tries to obtain a fingerprint from the torrent name that will uniquely
    identify its group (TV show).

    The name is HTML-unescaped, stripped of tags, bracketed blocks, dates and
    release counters, lower-cased, cut after the first delimiter that follows
    at least six allowed characters, reduced to Latin/Cyrillic letters and
    spaces, and finally cleaned of release keywords and month names.

    :param torrent_name: raw torrent title, possibly with HTML entities/tags
    :return: lower-case fingerprint with single spaces between words
    """
    # HTMLParser.unescape() was removed in Python 3.9; html.unescape() is the
    # supported replacement.
    from html import unescape

    # Minimize typing differences
    torrent_name = torrent_name.replace("ё", "е")

    # Unescape HTML entities
    torrent_name = unescape(torrent_name)

    # Drop all tags
    torrent_name = re.sub(r"</?[a-z]+>", "", torrent_name)

    # Drop any additional info: timestamps, release versions, etc.
    # -->
    square_braces_regex = re.compile(r"^(.+(?:\s+|\)))\[[^\[\]]+?\](.*)$")
    preceding_square_braces_regex = re.compile(r"^(\s*)\[[^\[\]]+?\](.+)$")
    round_braces_regex = re.compile(r"^(.+(?:\s+|\]))\([^()]+?\)(.*)$")
    angle_braces_regex = re.compile(r"^(.+)\s+<<.*?>>(.*)$")
    date_regex = re.compile(
        r"^(.+)\s+(?:\d{1,2}\.\d{1,2}\.\d{4}|\d{4}\.\d{2}\.\d{2})(.*)$")
    # Unable to merge it into date_regex due to some strange behaviour of re
    # module.
    additional_date_regex = re.compile(
        r"^(.+)\s+(?:по|от)\s+(?:\d{1,2}\.\d{1,2}\.\d{4}|\d{4}\.\d{2}\.\d{2})(.*)$"
    )
    release_counter_regex = re.compile(r"^(.+)\s+\d+\s*(?:в|из)\s*\d+(.*)$")

    cleanup_regexes = (
        additional_date_regex,
        date_regex,
        preceding_square_braces_regex,
        square_braces_regex,
        round_braces_regex,
        angle_braces_regex,
        release_counter_regex,
    )

    # Each regex removes one occurrence per pass, so repeat the whole
    # sequence until the name stops changing.
    old_torrent_name = None
    while torrent_name != old_torrent_name:
        old_torrent_name = torrent_name

        for regex in cleanup_regexes:
            torrent_name = regex.sub(r"\1\2", torrent_name.strip(" .,"))

    # Drop everything after a " /" separator
    torrent_name = re.sub(r"\s+/.*", "", torrent_name)
    # <--

    # We need all names in lowercase for easier analysis
    torrent_name = torrent_name.lower()

    # Try to get most possible short fingerprint -->
    torrent_name = re.sub(r"^(national\s+geographic\s*:|наука\s+2\.0)\s+", "",
                          torrent_name)

    torrent_name = re.sub(r"^«([^»]{6,})»", r"\1", torrent_name)

    torrent_name = re.sub(r'^"([^»]{6,})"', r"\1", torrent_name)

    torrent_name = re.sub(
        r"^([0-9a-zабвгдеёжзийклмнопрстуфхцчшщьъыэюя., \-:]{6,}?(?:[:.?!]| - | — |\|)).*",
        r"\1", torrent_name)
    # Try to get most possible short fingerprint <--

    # Drop all punctuation and other non-alphabet characters
    characters = "abcdefghijklmnopqrstuvwxyzабвгдеёжзийклмнопрстуфхцчшщьъыэюя"
    torrent_name = torrent_name.replace(".", " ")
    torrent_name = "".join(c for c in torrent_name if c in " " + characters)

    # Drop any additional info: timestamps, release versions, etc.
    # -->
    # NOTE(review): a ``replace("г.", "")`` used to sit here, but no "." can
    # survive the filtering above, so it never matched and was dropped.
    # The keyword group is now non-capturing ("(?:"); it used to read "(:?",
    # an optional colon that cannot occur once punctuation is removed.
    keyword_regex = re.compile(
        r"(?:\s|\()(?:выпуск|выпуски|выпусков|обновлено|передачи за|серия из|сезон|серия|серии|премьера|эфир с|эфир от|эфиры от|satrip)(?:\s|\)|$)")
    while True:
        new_torrent_name = keyword_regex.sub("", torrent_name)
        if new_torrent_name == torrent_name:
            break
        torrent_name = new_torrent_name

    # Month names are removed only as whole words.
    for month in (
            "январь",
            "января",
            "февраль",
            "февраля",
            "март",
            "марта",
            "апрель",
            "апреля",
            "май",
            "мая",
            "июнь",
            "июня",
            "июль",
            "июля",
            "август",
            "августа",
            "сентябрь",
            "сентября",
            "октябрь",
            "октября",
            "ноябрь",
            "ноября",
            "декабрь",
            "декабря",
    ):
        torrent_name = re.sub(r"\b" + month + r"\b", "", torrent_name)
    # <--

    # Drop several spaces
    return re.sub(r"\s+", " ", torrent_name).strip()
Example #11
0
def get_fingerprint(torrent_name):
    """
    Tries to obtain a fingerprint from the torrent name that will uniquely
    identify its group (TV show).

    The name is HTML-unescaped, stripped of tags, bracketed blocks, dates and
    release counters, lower-cased, cut after the first delimiter that follows
    at least six allowed characters, reduced to Latin/Cyrillic letters and
    spaces, and finally cleaned of release keywords and month names.

    :param torrent_name: raw torrent title, possibly with HTML entities/tags
    :return: lower-case fingerprint with single spaces between words
    """
    # HTMLParser.unescape() was removed in Python 3.9; html.unescape() is the
    # supported replacement.
    from html import unescape

    # Minimize typing differences
    torrent_name = torrent_name.replace("ё", "е")

    # Unescape HTML entities
    torrent_name = unescape(torrent_name)

    # Drop all tags
    torrent_name = re.sub(r"</?[a-z]+>", "", torrent_name)

    # Drop any additional info: timestamps, release versions, etc.
    # -->
    square_braces_regex = re.compile(r"^(.+(?:\s+|\)))\[[^\[\]]+?\](.*)$")
    preceding_square_braces_regex = re.compile(r"^(\s*)\[[^\[\]]+?\](.+)$")
    round_braces_regex = re.compile(r"^(.+(?:\s+|\]))\([^()]+?\)(.*)$")
    angle_braces_regex = re.compile(r"^(.+)\s+<<.*?>>(.*)$")
    date_regex = re.compile(r"^(.+)\s+(?:\d{1,2}\.\d{1,2}\.\d{4}|\d{4}\.\d{2}\.\d{2})(.*)$")
    # Unable to merge it into date_regex due to some strange behaviour of re
    # module.
    additional_date_regex = re.compile(r"^(.+)\s+(?:по|от)\s+(?:\d{1,2}\.\d{1,2}\.\d{4}|\d{4}\.\d{2}\.\d{2})(.*)$")
    release_counter_regex = re.compile(r"^(.+)\s+\d+\s*(?:в|из)\s*\d+(.*)$")

    cleanup_regexes = (
        additional_date_regex,
        date_regex,
        preceding_square_braces_regex,
        square_braces_regex,
        round_braces_regex,
        angle_braces_regex,
        release_counter_regex,
    )

    # Each regex removes one occurrence per pass, so repeat the whole
    # sequence until the name stops changing.
    old_torrent_name = None
    while torrent_name != old_torrent_name:
        old_torrent_name = torrent_name

        for regex in cleanup_regexes:
            torrent_name = regex.sub(r"\1\2", torrent_name.strip(" .,"))

    # Drop everything after a " /" separator
    torrent_name = re.sub(r"\s+/.*", "", torrent_name)
    # <--

    # We need all names in lowercase for easier analysis
    torrent_name = torrent_name.lower()

    # Try to get most possible short fingerprint -->
    torrent_name = re.sub(
        r"^«([^»]{6,})»", r"\1", torrent_name)

    torrent_name = re.sub(
        r'^"([^»]{6,})"', r"\1", torrent_name)

    torrent_name = re.sub(
        r"^([0-9a-zабвгдеёжзийклмнопрстуфхцчшщьъыэюя., \-:]{6,}?(?:[:.?!]| - | — |\|)).*", r"\1", torrent_name)
    # Try to get most possible short fingerprint <--

    # Drop all punctuation and other non-alphabet characters
    characters = "abcdefghijklmnopqrstuvwxyzабвгдеёжзийклмнопрстуфхцчшщьъыэюя"
    torrent_name = torrent_name.replace(".", " ")
    torrent_name = "".join(
        c for c in torrent_name if c in " " + characters)

    # Drop any additional info: timestamps, release versions, etc.
    # -->
    # NOTE(review): a ``replace("г.", "")`` used to sit here, but no "." can
    # survive the filtering above, so it never matched and was dropped.
    # The keyword group is now non-capturing ("(?:"); it used to read "(:?",
    # an optional colon that cannot occur once punctuation is removed.
    keyword_regex = re.compile(
        r"(?:\s|\()(?:выпуск|выпуски|выпусков|обновлено|передачи за|серия из|сезон|серия|серии|премьера|эфир с|эфир от|эфиры от|satrip)(?:\s|\)|$)")
    while True:
        new_torrent_name = keyword_regex.sub("", torrent_name)
        if new_torrent_name == torrent_name:
            break
        torrent_name = new_torrent_name

    # Remove month names as whole words only: a plain substring replace also
    # mangled unrelated words ("мартин" -> "ин", "майор" -> "ор").
    for month in (
        "январь",   "января",
        "февраль",  "февраля",
        "март",     "марта",
        "апрель",   "апреля",
        "май",      "мая",
        "июнь",     "июня",
        "июль",     "июля",
        "август",   "августа",
        "сентябрь", "сентября",
        "октябрь",  "октября",
        "ноябрь",   "ноября",
        "декабрь",  "декабря",
    ):
        torrent_name = re.sub(r"\b" + month + r"\b", "", torrent_name)
    # <--

    # Drop several spaces
    return re.sub(r"\s+", " ", torrent_name).strip()