def style_convert(self, infile_name, outfile_name):
    """
    For each word in the input text, look up synonyms in the author's
    thesaurus and probabilistically select a replacement word.
    Write the output to outfile.
    """
    with open(infile_name, 'r') as infile, open(outfile_name, 'w') as outfile:
        for line in infile:
            # Tokenize the line on spaces and punctuation
            # (apostrophes and hyphens stay inside their words)
            text = tokenize_string(line)
            for index, orig_word in enumerate(text):
                was_title = orig_word.istitle()        # "Title"
                was_capitalized = orig_word.isupper()  # "UPPER"
                was_lower = orig_word.islower()        # "lower"
                word = orig_word.strip().lower()

                # Skip words with non-ASCII characters (Python 2 str.decode)
                try:
                    word = word.decode('ascii')
                except (UnicodeDecodeError, UnicodeEncodeError):
                    continue

                # Probabilistically choose a synonym from thesaurus[word]
                weighted_key = self._weighted_choice(word)

                # Match the capitalization of the original word
                if was_title:
                    weighted_key = weighted_key.title()
                elif was_capitalized:
                    weighted_key = weighted_key.upper()
                elif not was_lower:
                    weighted_key = orig_word

                # Add a space between words, but no space before punctuation
                if word not in string.punctuation and index != 0:
                    outfile.write(" ")
                outfile.write(weighted_key)
            outfile.write('\n')
    return outfile_name
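# A minimal sketch of the weighted synonym selection that style_convert relies
# on. The project's real _weighted_choice isn't shown in this excerpt, and the
# thesaurus layout assumed here ({word: {synonym: count}}) is a guess, not the
# actual data format.
import random

def _weighted_choice_sketch(thesaurus, word):
    """Pick a synonym for `word` with probability proportional to its count."""
    synonyms = thesaurus.get(word)
    if not synonyms:
        return word  # unknown words pass through unchanged
    total = sum(synonyms.values())
    threshold = random.uniform(0, total)
    running = 0
    for synonym, count in synonyms.items():
        running += count
        if running >= threshold:
            return synonym
    return word

# Example: _weighted_choice_sketch({'happy': {'glad': 3, 'jolly': 1}}, 'happy')
# returns 'glad' roughly 75% of the time and 'jolly' roughly 25% of the time.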
def compile_lyrics(artist_name, output_filename):
    response = requests.get(
        artist_url + artist_name,
        headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) '
                               'AppleWebKit/537.36 (KHTML, like Gecko) '
                               'Chrome/35.0.1916.153 Safari/537.36'})
    soup = BeautifulSoup(response.text, "lxml")
    with open(output_filename, 'a') as f:
        for song_link in soup.select('ul.song_list > li > a'):
            link = urljoin(BASE_URL, song_link['href'])
            response = requests.get(link)
            soup = BeautifulSoup(response.text, "lxml")
            lyrics = soup.find('div', class_='lyrics').text.strip().split('\n')
            for line in lyrics:
                # Skip lines with non-ASCII characters (Python 2 str.decode)
                try:
                    line = line.decode('ascii')
                except (UnicodeDecodeError, UnicodeEncodeError):
                    continue
                # Skip aside/annotation lines matched by ASIDE_REGEX
                if not re.match(ASIDE_REGEX, line):
                    f.write(" ".join(tokenize_string(line)) + '\n')
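# Rough sketches of the tokenizer and aside filter used above. The real
# tokenize_string and ASIDE_REGEX are defined elsewhere in the project; the
# patterns below are assumptions based on how the helpers are called, not the
# project's actual definitions.
import re

# Words (keeping apostrophes and hyphens) or single punctuation characters
TOKEN_REGEX = re.compile(r"[\w'-]+|[^\w\s]")
# Lines that are only a bracketed annotation, e.g. "[Chorus]" or "(x2)"
ASIDE_REGEX_SKETCH = re.compile(r"^\s*[\[(].*[\])]\s*$")

def tokenize_string_sketch(text):
    """Split text into word and punctuation tokens."""
    return TOKEN_REGEX.findall(text)

# Example: tokenize_string_sketch("Don't stop, believin'!")
# -> ["Don't", 'stop', ',', "believin'", '!']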
def style_convert_lesk(self, infile_name, outfile_name):
    """
    For each word in the input text, disambiguate its sense with Lesk,
    look up synonyms in the author's thesaurus, and probabilistically
    select a replacement word. Write the output to outfile.
    """
    with open(infile_name, 'r') as infile, open(outfile_name, 'w') as outfile:
        for line in infile:
            # POS-tag the line, Lesk-disambiguate each word,
            # then look the result up in the thesauri
            try:
                line = line.decode('ascii', 'ignore')
            except (UnicodeDecodeError, UnicodeEncodeError):
                continue
            line = tokenize_string(line)
            tagged_tuples = nltk.pos_tag(line)
            untagged_string = " ".join(
                [tagged_tuple[0] for tagged_tuple in tagged_tuples])
            for index, tagged_tuple in enumerate(tagged_tuples):
                orig_word, temp_pos = tagged_tuple
                word = orig_word.strip().lower()
                was_title = orig_word.istitle()        # "Title"
                was_capitalized = orig_word.isupper()  # "UPPER"
                was_lower = orig_word.islower()        # "lower"

                # Don't replace determiners
                if temp_pos == u'DT':
                    weighted_key = word
                else:
                    # Skip synset lookup for inflected verb forms and plural nouns
                    if temp_pos in ['VBD', 'VBG', 'VBN', 'NNS', 'NNPS']:
                        synset = None
                    else:
                        # Convert the Penn Treebank POS tag to a WordNet POS tag
                        wordnet_pos = reduce_pos_tagset(temp_pos)
                        if wordnet_pos:
                            synset = nltk_lesk(untagged_string, word, wordnet_pos)
                        else:
                            synset = nltk_lesk(untagged_string, word)
                    # Probabilistically choose a synonym from thesaurus[synset];
                    # falls back to the non-WordNet thesaurus if no synset exists
                    weighted_key = self._weighted_choice_lesk(str(synset), word)

                # Match the capitalization of the original word
                if was_title:
                    weighted_key = weighted_key.title()
                elif was_capitalized:
                    weighted_key = weighted_key.upper()
                elif not was_lower:
                    weighted_key = orig_word

                # Add a space between words, but no space before punctuation
                if word not in string.punctuation and index != 0:
                    outfile.write(" ")
                outfile.write(weighted_key)
            outfile.write('\n')
    return outfile_name
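# A plausible sketch of the Penn Treebank -> WordNet tag reduction that the
# Lesk path depends on. The project's actual reduce_pos_tagset may differ, and
# nltk_lesk is presumably a wrapper around nltk.wsd.lesk; both are assumptions
# here, not confirmed by this excerpt.
from nltk.corpus import wordnet

def reduce_pos_tagset_sketch(treebank_tag):
    """Map a Penn Treebank tag to a WordNet POS constant, or None if unmapped."""
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    return None

# Example: reduce_pos_tagset_sketch('JJ') -> 'a', which nltk.wsd.lesk accepts
# as its `pos` argument to restrict candidate synsets to adjectives.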