def style_convert(self, infile_name, outfile_name):
    """
    For each word in the input text, look up synonyms in the author's
    thesaurus and probabilistically select a replacement word.
    Write the output to outfile.
    """
    with open(infile_name, 'r') as infile, open(outfile_name, 'w') as outfile:
        for line in infile:
            # Tokenize the line on spaces and punctuation
            # (apostrophes and hyphens stay inside their words)
            text = tokenize_string(line)
            for index, orig_word in enumerate(text):
                was_title = orig_word.istitle()        # "Title"
                was_capitalized = orig_word.isupper()  # "UPPER"
                was_lower = orig_word.islower()        # "lower"
                word = orig_word.strip().lower()

                # Skip words with non-ASCII characters (Python 2 str.decode)
                try:
                    word = word.decode('ascii')
                except (UnicodeDecodeError, UnicodeEncodeError):
                    continue

                # Probabilistically choose a synonym from thesaurus[word]
                weighted_key = self._weighted_choice(word)

                # Match the capitalization of the original word
                if was_title:
                    weighted_key = weighted_key.title()
                elif was_capitalized:
                    weighted_key = weighted_key.upper()
                elif not was_lower:
                    weighted_key = orig_word

                # Add a space between words, but no space before punctuation
                if word not in string.punctuation and index != 0:
                    outfile.write(" ")
                outfile.write(weighted_key)
            outfile.write('\n')
    return outfile_name
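# A minimal sketch of the weighted synonym selection that style_convert relies
# on. The project's real _weighted_choice isn't shown in this excerpt, and the
# thesaurus layout assumed here ({word: {synonym: count}}) is a guess, not the
# actual data format.
import random

def _weighted_choice_sketch(thesaurus, word):
    """Pick a synonym for `word` with probability proportional to its count."""
    synonyms = thesaurus.get(word)
    if not synonyms:
        return word  # unknown words pass through unchanged
    total = sum(synonyms.values())
    threshold = random.uniform(0, total)
    running = 0
    for synonym, count in synonyms.items():
        running += count
        if running >= threshold:
            return synonym
    return word

# Example: _weighted_choice_sketch({'happy': {'glad': 3, 'jolly': 1}}, 'happy')
# returns 'glad' roughly 75% of the time and 'jolly' roughly 25% of the time.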
def compile_lyrics(artist_name, output_filename):
    response = requests.get(
        artist_url + artist_name,
        headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) '
                               'AppleWebKit/537.36 (KHTML, like Gecko) '
                               'Chrome/35.0.1916.153 Safari/537.36'})
    soup = BeautifulSoup(response.text, "lxml")
    with open(output_filename, 'a') as f:
        for song_link in soup.select('ul.song_list > li > a'):
            link = urljoin(BASE_URL, song_link['href'])
            response = requests.get(link)
            soup = BeautifulSoup(response.text, "lxml")
            lyrics = soup.find('div', class_='lyrics').text.strip().split('\n')
            for line in lyrics:
                # Skip lines with non-ASCII characters (Python 2 str.decode)
                try:
                    line = line.decode('ascii')
                except (UnicodeDecodeError, UnicodeEncodeError):
                    continue
                # Skip aside/annotation lines matched by ASIDE_REGEX
                if not re.match(ASIDE_REGEX, line):
                    f.write(" ".join(tokenize_string(line)) + '\n')
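# Rough sketches of the tokenizer and aside filter used above. The real
# tokenize_string and ASIDE_REGEX are defined elsewhere in the project; the
# patterns below are assumptions based on how the helpers are called, not the
# project's actual definitions.
import re

# Words (keeping apostrophes and hyphens) or single punctuation characters
TOKEN_REGEX = re.compile(r"[\w'-]+|[^\w\s]")
# Lines that are only a bracketed annotation, e.g. "[Chorus]" or "(x2)"
ASIDE_REGEX_SKETCH = re.compile(r"^\s*[\[(].*[\])]\s*$")

def tokenize_string_sketch(text):
    """Split text into word and punctuation tokens."""
    return TOKEN_REGEX.findall(text)

# Example: tokenize_string_sketch("Don't stop, believin'!")
# -> ["Don't", 'stop', ',', "believin'", '!']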
def style_convert_lesk(self, infile_name, outfile_name):
    """
    For each word in the input text, disambiguate its sense with Lesk,
    look up synonyms in the author's thesaurus, and probabilistically
    select a replacement word. Write the output to outfile.
    """
    with open(infile_name, 'r') as infile, open(outfile_name, 'w') as outfile:
        for line in infile:
            # POS-tag the line, Lesk-disambiguate each word,
            # then look the result up in the thesauri
            try:
                line = line.decode('ascii', 'ignore')
            except (UnicodeDecodeError, UnicodeEncodeError):
                continue
            line = tokenize_string(line)
            tagged_tuples = nltk.pos_tag(line)
            untagged_string = " ".join(
                [tagged_tuple[0] for tagged_tuple in tagged_tuples])
            for index, tagged_tuple in enumerate(tagged_tuples):
                orig_word, temp_pos = tagged_tuple
                word = orig_word.strip().lower()
                was_title = orig_word.istitle()        # "Title"
                was_capitalized = orig_word.isupper()  # "UPPER"
                was_lower = orig_word.islower()        # "lower"

                # Don't replace determiners
                if temp_pos == u'DT':
                    weighted_key = word
                else:
                    # Skip synset lookup for inflected verb forms and plural nouns
                    if temp_pos in ['VBD', 'VBG', 'VBN', 'NNS', 'NNPS']:
                        synset = None
                    else:
                        # Convert the Penn Treebank POS tag to a WordNet POS tag
                        wordnet_pos = reduce_pos_tagset(temp_pos)
                        if wordnet_pos:
                            synset = nltk_lesk(untagged_string, word, wordnet_pos)
                        else:
                            synset = nltk_lesk(untagged_string, word)
                    # Probabilistically choose a synonym from thesaurus[synset];
                    # falls back to the non-WordNet thesaurus if no synset exists
                    weighted_key = self._weighted_choice_lesk(str(synset), word)

                # Match the capitalization of the original word
                if was_title:
                    weighted_key = weighted_key.title()
                elif was_capitalized:
                    weighted_key = weighted_key.upper()
                elif not was_lower:
                    weighted_key = orig_word

                # Add a space between words, but no space before punctuation
                if word not in string.punctuation and index != 0:
                    outfile.write(" ")
                outfile.write(weighted_key)
            outfile.write('\n')
    return outfile_name
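# A plausible sketch of the Penn Treebank -> WordNet tag reduction that the
# Lesk path depends on. The project's actual reduce_pos_tagset may differ, and
# nltk_lesk is presumably a wrapper around nltk.wsd.lesk; both are assumptions
# here, not confirmed by this excerpt.
from nltk.corpus import wordnet

def reduce_pos_tagset_sketch(treebank_tag):
    """Map a Penn Treebank tag to a WordNet POS constant, or None if unmapped."""
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    return None

# Example: reduce_pos_tagset_sketch('JJ') -> 'a', which nltk.wsd.lesk accepts
# as its `pos` argument to restrict candidate synsets to adjectives.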