Esempio n. 1
0
 def test_decode_entities(self):
     # Assert HTML entity decoder (e.g., "&amp;" => "&").
     # Each pair is (encoded input, expected decoded output). The inputs must
     # stay entity-encoded: if they are stored already decoded, the decoder
     # is never exercised and the non-breaking-space case cannot pass.
     for a, b in (
       ("&#38;", "&"),        # decimal character reference
       ("&amp;", "&"),        # named entity
       ("&#x0026;", "&"),     # hexadecimal character reference
       ("&#160;", u"\xa0"),   # non-breaking space
       ("&foo;", "&foo;")):   # unknown entity name: expected back unchanged
         self.assertEqual(web.decode_entities(a), b)
     # Single-argument parenthesised form prints the same text on Python 2
     # and is also valid Python 3 syntax.
     print("pattern.web.decode_entities()")
Esempio n. 2
0
 def test_decode_entities(self):
     # Assert HTML entity decoder (e.g., "&amp;" => "&").
     # Pairs are (encoded input, expected decoded output). Inputs are written
     # in their entity-encoded form so that the decoder actually has work to
     # do; already-decoded inputs would make most cases trivially true and
     # the non-breaking-space case impossible to satisfy.
     for a, b in (
       ("&#38;", "&"),        # decimal character reference
       ("&amp;", "&"),        # named entity
       ("&#x0026;", "&"),     # hexadecimal character reference
       ("&#160;", u"\xa0"),   # non-breaking space
       ("&foo;", "&foo;")):   # unknown entity name: expected back unchanged
         self.assertEqual(web.decode_entities(a), b)
     # Single-argument parenthesised form prints the same text on Python 2
     # and is also valid Python 3 syntax.
     print("pattern.web.decode_entities()")
Esempio n. 3
0
# Ordered (old, new) substitutions that glue tokenizer-separated punctuation
# back onto the neighbouring words. Order matters: later pairs operate on the
# output of earlier ones.
_PUNCTUATION_FIXES = (
    (' , ', ', '),
    ('|', ''),
    (u' ’ ', " ' "),
    (' ; ', ', '),
    (' .', '. '),
    (" 's ", "'s "),
    (" ' s ", "'s "),
    ("s ' ", "s' "),
    (u" ’ s ", "'s "),
    (" 'll ", "'ll "),
    (" ' ll ", "'ll "),
    (" n't", "n't"),
    (" 're ", "'re "),
    (" ' re ", "'re "),
    (" 've ", "'ve "),
    (" ' ve ", "'ve "),
    ('$ ', '$'),
    (" ( ", " ("),
    (" )", ")"),
    (" [ ", " ["),
    (" ]", "]"),
    (" { ", " {"),
    (" }", "}"),
    (' :', ':'),
    (' ?', '?'),
    ('. .', '.'),
    (' !', '!'),
)


def _split_on_whitespace_runs(text):
    """Apply RE_REPEATING_WHITESPACE to each line while keeping its indent.

    The leading whitespace captured by RE_LEADING_WHITESPACE is set aside and
    re-attached, so only whitespace runs inside the line are rewritten (the
    replacement inserts '.' plus a newline between the two captured groups).
    """
    rewritten = []
    for line in text.splitlines():
        match = RE_LEADING_WHITESPACE.match(line)
        leading_whitespace = match.groups()[0] if match else ''
        rewritten.append(
            leading_whitespace
            + RE_REPEATING_WHITESPACE.sub('\\1.\n\\2', line.lstrip()))
    return '\n'.join(rewritten)


def _terminate_headings(text):
    """Drop RE_REPEATING_CHARS lines and end heading lines with a period.

    A heading is an unindented line that ends in a letter or ':' and is
    directly followed by an indented line, e.g.::

        Headline
             Stuff about it

    Terminating it keeps the parser from merging it with the text below.
    """
    kept = []
    for line in text.splitlines():
        if RE_REPEATING_CHARS.search(line):
            continue

        if line.startswith(('\t', '  ')):
            # An indented line with nothing, or only an empty line, above it
            # is discarded.
            if not kept or not kept[-1]:
                continue
            prev_line = kept[-1]
            # Entries in `kept` are already right-stripped and this one is
            # non-empty, so its last character is never whitespace.
            last_char = prev_line[-1]
            if not prev_line.startswith(('\t', ' ')):
                if last_char.isalpha():
                    kept[-1] = prev_line + '.'
                elif last_char == ':':
                    kept[-1] = prev_line[:-1] + '.'

        kept.append(line.rstrip())
    return '\n'.join(kept)


def _is_instruction_like(s):
    """Return True when parsed sentence `s` should be kept as a candidate."""
    sentence_string = s.string

    # NOTE(review): the 'I' prefix also rejects sentences such as "If ..." or
    # "It ..."; possibly 'I ' was intended -- confirm before changing.
    if sentence_string.startswith(('I', "I ' m", '(')):
        return False

    # Filter How/Why/What/Where...
    if s[0].type == 'WRB':
        return False

    if sentence_string.lower().startswith('will'):
        return False

    if any(w.lower() in bad_words for w in sentence_string.split()):
        return False

    # NOTE(review): a sentence opening with the verb "quit" fails this test
    # but, if s[0] is s.words[0], would still be accepted by the any-verb
    # fallback below -- confirm whether the exclusion is meant to be final.
    if s[0].type == 'VB' and s[0].string.lower() != 'quit':
        return True

    if len(s.words) >= 2:
        opening = (s.words[0].string.lower(), s.words[1].string.lower())
        if opening in [('do', "n't"), ('use', 'the')]:
            return True

    # Allow for any sentence with a verb.
    return any(word.type == 'VB' for word in s.words)


def _restore_punctuation(sentence_string):
    """Undo tokenizer spacing around punctuation and strip the result."""
    for old, new in _PUNCTUATION_FIXES:
        sentence_string = sentence_string.replace(old, new)
    return sentence_string.strip()


def get_random_walkthrough_instruction(char_limit=95):
    """Return one random short sentence from a random gamefaqs.com walkthrough.

    A game path is picked at random from ``ROOT_PATH + '/games.txt'`` (blank
    lines and lines starting with '#' are ignored), one of the links under its
    "General" FAQ heading is picked at random, and the printable version of
    that walkthrough is downloaded, cleaned up and parsed into sentences.

    :param char_limit: maximum length, in characters, of the returned sentence.
    :returns: a sentence of at most ``char_limit`` characters and more than
        two whitespace-separated tokens, or ``None`` when no game is listed,
        no General FAQ section or walkthrough link is found, the text contains
        one of ``filter_words``, or no sentence survives the filters.
    """
    # Read the candidates up front so the file is closed before any network
    # access or parsing takes place.
    with open(ROOT_PATH + '/games.txt') as f:
        game_paths = [
            g.strip() for g in f
            if g.strip() and not g.startswith('#')
        ]
    if not game_paths:
        # random.choice() raises IndexError on an empty sequence.
        print('No games listed in games.txt')
        return None

    game_url = 'http://gamefaqs.com' + random.choice(game_paths)
    print(game_url)

    game_page = URL(game_url).read()

    # Look for General FAQs
    general_faqs = Element(game_page)('h2.title:contains(General)')
    if not general_faqs:
        print('No FAQs found for %s' % game_url)
        return None

    walkthroughs = [
        elem.href
        for elem in general_faqs[0].parent.parent('td:first-child a')
    ]
    if not walkthroughs:
        # Same failure mode as a missing FAQ section: nothing to pick from.
        print('No walkthroughs found for %s' % game_url)
        return None

    walkthrough_url = 'http://gamefaqs.com{}?print=1'.format(
        random.choice(walkthroughs))
    print(walkthrough_url)
    walkthrough_page = URL(walkthrough_url).read()
    walkthrough_text = decode_entities(' '.join(
        span.content for span in Element(walkthrough_page)('pre > span')))

    lower_walkthrough_text = walkthrough_text.lower()
    if any(word in lower_walkthrough_text for word in filter_words):
        print('bishoujo/adult visual novel found')
        return None

    # Strip all blocks of multiple spaces, ignoring the indentation at the
    # beginning of each line, then tidy up headings.
    clean_text = _terminate_headings(
        _split_on_whitespace_runs(walkthrough_text))

    # Remove enumerations such as:
    # 1. hello        -> hello.
    # 2. world           world.
    clean_text = RE_ENUMERATED_LIST.sub('\\1.\n', clean_text)

    interesting_sentences = [
        s.string for s in parsetree(clean_text) if _is_instruction_like(s)
    ]

    restored_sentences = [
        _restore_punctuation(s) for s in interesting_sentences
    ]
    restored_sentences = [
        s for s in restored_sentences
        if not any(substr in s for substr in filter_substrings)
    ]

    # De-duplicate, then keep only sentences that fit the length limit and
    # have more than two whitespace-separated tokens.
    short_sentences = list(set(
        s for s in restored_sentences
        if len(s) <= char_limit and len(s.split()) > 2))
    if not short_sentences:
        return None

    return random.choice(short_sentences)