def get_ngrams(self, n):
    # Generate n-grams from the dialogues for searching purposes.
    full_text = ''
    for dia in self.dialogues:
        full_text += dia[1] + ' '
    return get_ngrams(tokenize(normalize(full_text)), n)
def find_scene_from_transcript(self, transcript):
    # Flatten the transcript (a dict mapping each segment to a list of
    # alternative transcriptions) into a single text blob.
    transcript_text = ''
    for t in transcript.keys():
        for alternative in transcript[t]:
            transcript_text += ' ' + alternative
    tokens = tokenize(normalize(transcript_text))

    # Score every scene by counting how many of the transcript's 1- to 5-grams
    # also appear among the scene's dialogue n-grams.
    scores = []
    for scene in self.scenes:
        curr_score = 0
        for num_ngrams in range(1, 6):
            scene_ngrams = scene.get_ngrams(num_ngrams)
            dialogue = get_ngrams(tokens, num_ngrams)
            for ngram in dialogue:
                if ngram in scene_ngrams:
                    curr_score += 1
        scores.append(curr_score)

    # Return the index of the best-matching scene, or -1 if nothing matched.
    match = np.argmax(scores)
    if scores[match] == 0:
        return -1
    else:
        return int(match)
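# --- Hypothetical usage example (not part of the original source) ---
# A sketch of how find_scene_from_transcript might be called. It assumes the
# enclosing object exposes `scenes` (as the loop above does) and that the
# transcript is a dict mapping segment ids to lists of alternative
# transcriptions, which is the shape the flattening loop expects. The object
# name and transcript contents below are illustrative only.
#
# transcript = {
#     0: ["we will always have paris", "we will always have pairs"],
#     1: ["here's looking at you kid"],
# }
# scene_index = screenplay.find_scene_from_transcript(transcript)
# if scene_index == -1:
#     print("No scene shares any n-grams with the transcript.")
# else:
#     print("Best match:", screenplay.scenes[scene_index].header)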
def __init__(self, text='', lines=None):
    # Avoid a mutable default argument.
    if lines is None:
        lines = []
    # If text is not empty, use text to build the scene.
    if text != '':
        text = text.split('\n')
        # Header is the first line.
        self.header = text[0]
        # Body is the rest of the text.
        self.body = '\n'.join(text[1:])
        self.tokens = tokenize(normalize(self.body))
    # Otherwise, if lines is not empty, use it instead. Preferable and more thorough.
    elif len(lines) != 0:
        # Identify the header of the scene.
        if lines[0][0] == 'Scene Heading':
            self.header = lines[0][1]
        else:
            self.header = 'No header'
        # Identify the main pieces of a scene.
        self.body = ''
        self.characters = []
        self.actions = []
        self.dialogues = []
        last_character = ''
        for i in range(1, len(lines)):
            self.body += lines[i][1] + '\n'
            # If the element is a character, add it if it's not already in the list of characters.
            if lines[i][0] == 'Character':
                character = lines[i][1]
                # Get rid of (V.O.), (CONT'D), etc.
                if character.find(' (') != -1:
                    character = character[0:character.find(' (')]
                last_character = character
                if character not in self.characters:
                    self.characters.append(character)
            # If the element is an action, add it to the list of actions.
            elif lines[i][0] == 'Action':
                self.actions.append(lines[i][1])
            # If the element is dialogue, add it to the list of dialogues as a tuple of the form
            # (name of the character, dialogue).
            elif lines[i][0] == 'Dialogue':
                self.dialogues.append((last_character, lines[i][1]))
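# --- Illustrative `lines` input (the labels come from the code above; the
# contents and the class name `Scene` are assumptions for this example) ---
# The constructor expects `lines` to be a list of (label, text) pairs using the
# labels 'Scene Heading', 'Character', 'Action', and 'Dialogue'.
#
# lines = [
#     ('Scene Heading', 'INT. LIVING ROOM - NIGHT'),
#     ('Action', 'JANE paces by the window.'),
#     ('Character', 'JANE (V.O.)'),
#     ('Dialogue', 'I should never have come back.'),
# ]
# scene = Scene(lines=lines)
# # scene.characters -> ['JANE']
# # scene.dialogues  -> [('JANE', 'I should never have come back.')]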
def main():
    input_string, output_filepath = validate_arguments()

    print2("1) Checking syntax... ")
    if not string_processing.validate_syntax(input_string):
        sys.exit('\n ERROR: Invalid input string syntax "%s". Input must be of the form '
                 '(CV)+ where C is a consonant taken from {m,l,s,p,k} and V a vowel '
                 'taken from {a,A}.' % input_string)
    print("done, syntax ok.")

    diphones_path = "diphones"
    print2("2) Loading diphone wav files from folder %s... " % diphones_path)
    diphones_wavs_db = diphone_db.read_diphones(diphones_path)
    print("done, %d diphones loaded." % len(diphones_wavs_db))

    print2("3) Parsing diphones from: \"%s\"... " % input_string)
    diphones = string_processing.tokenize(input_string)
    print("done, diphones to synthesize: %s." % str(diphones))

    print2("4) Synthesizing... ")
    output_wav = ttslib.synthesize(diphones_wavs_db, diphones)
    print("done.")

    print2("5) Saving file %s... " % output_filepath)
    wavlib.write_wav(output_wav, output_filepath)
    print("done.")
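# --- Assumed helper (sketch; the real definition lives elsewhere in the project) ---
# `print2` is not defined in this snippet; the "N) step... " / "done." pattern
# above suggests it prints without a trailing newline so each step label and
# its result share a line. A minimal version consistent with that usage:
#
# def print2(message):
#     # Write without a newline and flush so progress is visible immediately.
#     sys.stdout.write(message)
#     sys.stdout.flush()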
import pandas as pd
import torch

import lstm                      # Local module defining the LSTMSentiment model.
import string_processing as sp   # Local text-processing helpers.
# Note: `du` refers to the project's data-utilities module (get_vocab_size);
# its import is not shown in this snippet.

# Values necessary to load the network.
vocab = pd.read_csv("vocabulary.txt", names=['ind', 'word'], encoding='iso-8859-1')
vocab = pd.Series(vocab['ind'].values, index=vocab['word']).to_dict()
vocab_size = du.get_vocab_size("vocabulary.txt")

# Load the network.
network = lstm.LSTMSentiment(vocab_size)
network.load_state_dict(torch.load('model'))
network.eval()

# Get user input.
user_sentence = input("Enter a review: ")

# Process user input and convert it to tokens.
user_sentence = sp.normalize(user_sentence)
user_sentence = sp.tokenize(user_sentence)
user_sentence = sp.get_numbers(user_sentence, vocab)
user_sentence = sp.padding(user_sentence, 30)

# Predict and output the result.
output, h = network(torch.LongTensor([user_sentence]), network.init_hidden(1))
pred = torch.round(output.squeeze())
if pred.item() == 1:
    print("Fresh")
else:
    print("Rotten")
import pandas as pd

# Local libraries
import string_processing

# Constants
MAX_LEN = 30
MIN_LEN = 2

# Get reviews from the csv file.
df = pd.read_csv("rotten_tomatoes_reviews.csv", encoding='iso-8859-1')  # s/o nicolas-gervais from r/datasets

# Generate tokens from the text reviews.
df['Review_Clean'] = df['Review'].apply(
    lambda x: string_processing.normalize(x))
df['Tokens'] = df['Review_Clean'].apply(
    lambda x: string_processing.tokenize(x))
df.drop(['Review', 'Review_Clean'], axis=1, inplace=True)

# Get rid of reviews with a word count below the minimum length.
df = df[df['Tokens'].apply(lambda x: len(x) > MIN_LEN)]
df.reset_index(drop=True, inplace=True)

# Generate a vocabulary.
vocab = string_processing.build_vocab(df['Tokens'].tolist(), 'vocabulary.txt')

# Replace tokens with their respective numbers in the vocabulary.
df['Tokens'] = df['Tokens'].apply(
    lambda x: string_processing.get_numbers(x, vocab))

# Add zero-padding.
df['Tokens'] = df['Tokens'].apply(
    lambda x: string_processing.padding(x, MAX_LEN))
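# --- Assumed behavior of string_processing.padding (sketch, not the project's actual code) ---
# Both this script and the prediction script call padding(tokens, 30), and the
# comment above says "Add zero-padding", so the helper presumably truncates
# long reviews and pads short ones with zeros to a fixed length. Left-padding
# is shown here because it is common for LSTM sentiment models, but the actual
# side is not visible in this snippet.
#
# def padding(numbers, length):
#     # Keep at most `length` token ids, then left-pad with zeros up to `length`.
#     numbers = numbers[:length]
#     return [0] * (length - len(numbers)) + numbers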