def _build_network(self, path, char_dict, mode, min_lines=5):
    """Build a character interaction graph for one screenplay.

    Parameters
    ----------
    path : str
        Screenplay file path; 'agarwal' in the path selects the Agarwal
        scene parser, anything else the Gorinski parser.
    char_dict : dict
        Character metadata, keyed as expected by ag.get_variant_as_key.
    mode : str
        'overlap' connects every pair of qualifying characters that appear
        in the same scene; any other value connects only characters who
        speak consecutively.
    min_lines : int
        Minimum number of dialogue lines a character must have to be
        included as a node/edge endpoint.

    Returns
    -------
    networkx.Graph with character names as nodes.
    """
    if 'agarwal' in path:
        source = 'agarwal'
        scenes = get_boundaries_agarwal(path)
    else:
        source = 'gorinski'
        scenes = get_boundaries_gorinski(path)
    var2info = ag.get_variant_as_key(char_dict)
    char2lines = get_char_to_lines(path, char_dict)
    G = nx.Graph()
    for scene in scenes:
        cdl = ag.get_char_diag_list(scene, var2info, source)
        if mode == 'overlap':
            # Connect all qualifying characters co-present in this scene.
            # Sort by name so edge insertion order is deterministic.
            char_tuples = sorted({cd[0] for cd in cdl}, key=lambda x: x[0])
            for idx, (cname1, _, _) in enumerate(char_tuples):
                if len(char2lines[cname1]) >= min_lines:
                    for cname2, _, _ in char_tuples[idx + 1:]:
                        if len(char2lines[cname2]) >= min_lines:
                            G.add_edge(cname1, cname2)
        else:
            # Only connect characters who speak consecutively.
            for k in range(len(cdl) - 1):
                cname1, _, _ = cdl[k][0]
                if len(char2lines[cname1]) >= min_lines:
                    cname2, _, _ = cdl[k + 1][0]
                    if len(char2lines[cname2]) >= min_lines:
                        G.add_edge(cname1, cname2)
    return G
def _predict(self, path, char_dict):
    """Count female-female conversations in a screenplay.

    Scans every scene for female-female conversations ("soft" consecutive
    mode) and tallies how many of them never mention a male character.

    Returns
    -------
    In binary mode: 1 as soon as a no-man female-female conversation is
    found, else 0. Otherwise the pair (no_man_ff, ff_count).
    """
    if 'agarwal' in path:
        source, scenes = 'agarwal', get_boundaries_agarwal(path)
    else:
        source, scenes = 'gorinski', get_boundaries_gorinski(path)
    male_chars = self.get_male_chars(char_dict)  # soft mode
    var2info = ag.get_variant_as_key(char_dict)
    no_man_total = 0
    conversation_total = 0
    for scene in scenes:
        diag_list = ag.get_char_diag_list(scene, var2info, source)
        conversations = ag.get_ff_conversations(diag_list)
        # len > 0 means the scene passes the consecutive-soft criterion
        conversation_total += len(conversations)
        for conv in conversations:
            if not self.no_man_conversation(conv, male_chars):
                continue
            no_man_total += 1
            if self.binary:
                return 1
    if self.binary:
        return 0
    return no_man_total, conversation_total
def predict_overlap(self, path, char_dict, mode):
    """Return 1 if any scene in the screenplay has a character overlap
    (per self.overlap_in_scene under the given mode), else 0."""
    if 'agarwal' in path:
        source, scenes = 'agarwal', get_boundaries_agarwal(path)
    else:
        source, scenes = 'gorinski', get_boundaries_gorinski(path)
    var2info = ag.get_variant_as_key(char_dict)
    hit = any(
        self.overlap_in_scene(
            ag.get_char_diag_list(scene, var2info, source), mode)
        for scene in scenes)
    return 1 if hit else 0
def extract_features(self, path, char_dict):
    """Count scenes per rating level for one screenplay.

    Each scene is rated by self.rate_scene; ratings 1..3 increment the
    corresponding counter, rating < 1 is ignored.

    Returns
    -------
    np.ndarray of shape (3,), integer counts per rating.
    """
    if self.verbose:
        print(path)
    if 'agarwal' in path:
        source = 'agarwal'
        scenes = get_boundaries_agarwal(path)
    else:
        source = 'gorinski'
        scenes = get_boundaries_gorinski(path)
    var2info = ag.get_variant_as_key(char_dict)
    # FIX: np.int was removed in NumPy 1.24; builtin int is equivalent here.
    feats = np.zeros(3, dtype=int)  # counts per rating
    for scene in scenes:
        cdl = ag.get_char_diag_list(scene, var2info, source)
        rating = self.rate_scene(cdl)
        if rating >= 1:
            feats[rating - 1] += 1
    return feats
def get_dialogue_per_scene(files, source):
    """Collect per-scene character/dialogue lists for a set of movies.

    Parameters
    ----------
    files : iterable of pairs where item[0] is a directory of scene files
        and item[1] is the matching by-gender metadata file.
    source : str
        Parser variant passed through to get_char_diag_list.

    Returns
    -------
    dict mapping metadata path -> {scene file path -> char/diag list}.
    """
    nested_scenes = {}
    for scene_dir, gender_file in files:
        # FIX: removed dead no-op expression statement `f[0]`.
        char_dict = parse_by_gender_file(gender_file)
        # presumably the last entry holds the final character dict — TODO confirm
        char_dict = char_dict[-1][-1]
        var = get_variant_as_key(char_dict)
        nested_scenes[gender_file] = {}
        for scene_file in os.listdir(scene_dir):
            # keep "/" concatenation: the result is used as a dict key,
            # so its exact formatting must stay stable
            name = scene_dir + "/" + scene_file
            with open(name) as fp:
                contents = fp.readlines()
            nested_scenes[gender_file][name] = get_char_diag_list(
                contents, var, source)
    return nested_scenes
def transform(self, X):
    """Turn (movie_id, path, char_dict) samples into a feature matrix.

    Builds each feature family named in self.feats ('UNI', 'SNA', 'FRA',
    'RB') via a dedicated helper and horizontally concatenates the
    resulting matrices.

    Parameters
    ----------
    X : sequence of (movie_id, path, char_dict) triples.

    Returns
    -------
    np.ndarray of shape (len(X), total_feature_dim).
    """
    if self.verbose:
        print('Transforming {} samples into {}'.format(
            str(len(X)), ', '.join(self.feats)))
    feat_mats = []
    if 'UNI' in self.feats:
        feat_mats.append(self._unigram_feats(X))
    if 'SNA' in self.feats:
        feat_mats.append(self._sna_feats(X))
    if 'FRA' in self.feats:
        feat_mats.append(self._frame_feats(X))
    if 'RB' in self.feats:
        feat_mats.append(self._rb_feats(X))
    X = np.concatenate(feat_mats, axis=1)
    if self.verbose:
        print('X-shape:', X.shape)
    return X

def _unigram_feats(self, X):
    """Bag-of-words unigram counts over female (or female-female) dialogue.

    Fits self.countvec on the first call (training); reuses it afterwards
    (test). Returns a dense (len(X), uni_count) array.
    """
    if self.verbose:
        print('Building UNIGRAMS model...')
    # corpus to train unigrams model - either all fem dialogue or all
    # fem-fem dialogue
    diag_per_movie = []
    for i, (movie_id, path, char_dict) in enumerate(X):
        this_diag = ''
        if self.verbose and i % 50 == 0:
            print(i)
        if 'agarwal' in path:
            source = 'agarwal'
            scenes = get_boundaries_agarwal(path)
        else:
            source = 'gorinski'
            scenes = get_boundaries_gorinski(path)
        var2info = ag.get_variant_as_key(char_dict)
        for scene in scenes:
            cdl = ag.get_char_diag_list(scene, var2info, source)
            if self.uni_only_ff:
                # only lines spoken inside female-female conversations
                for ff in ag.get_ff_conversations(cdl):
                    for char, line in ff:
                        this_diag += line
            else:
                # every line from a character scored as likely female
                for (char, gen, score), diag in cdl:
                    if score != 'None' and float(score) > .5:
                        line = ' '.join(diag)
                        if len(line) > 0:
                            this_diag += ' ' + line
        diag_per_movie.append(this_diag)
    # transform into bag-of-words unigram model
    if self.countvec is None:  # train
        self.countvec = CountVectorizer(max_features=self.uni_count)
        unigrams = self.countvec.fit_transform(diag_per_movie)
    else:  # test
        unigrams = self.countvec.transform(diag_per_movie)
    if self.verbose:
        print('Unigrams:', unigrams.shape)
    return unigrams.toarray()

def _sna_feats(self, X):
    """Social-network-analysis centrality features per movie."""
    if self.verbose:
        print('Building SNA features...')
    sn_feats = []
    for i, (movie_id, path, char_dict) in enumerate(X):
        if self.verbose and i % 50 == 0:
            print(i)
        sn_feats.append(self.sna.transform_into_feats(
            movie_id, self.sna_mode, self.sna_min_lines,
            self.sna_centralities))
    sn_feats = np.array(sn_feats)
    if self.verbose:
        print('SNA features:', sn_feats.shape)
    return sn_feats

def _frame_feats(self, X):
    """Agency/power frame scores per movie, sliced per self.fr_mode.

    Raises
    ------
    ValueError if self.fr_mode is not a recognized mode.
    """
    if self.verbose:
        print('Building FRAME features...')
    fr_feats = []
    for i, (movie_id, path, char_dict) in enumerate(X):
        if self.verbose and i % 50 == 0:
            print(i)
        scores = self.id2frames[movie_id]
        if self.fr_mode == 'both':
            feats = np.concatenate(
                (scores['ff'], scores['fm'], scores['mm']), axis=0)
        elif self.fr_mode == 'agency':
            feats = np.concatenate(
                (scores['ff'][:3], scores['fm'][:3],
                 scores['fm'][6:9], scores['mm'][:3]), axis=0)
        elif self.fr_mode == 'power':  # power
            feats = np.concatenate(
                (scores['ff'][3:], scores['fm'][3:6],
                 scores['fm'][9:], scores['mm'][3:]), axis=0)
        elif self.fr_mode == 'ff':
            feats = scores['ff']
        elif self.fr_mode == 'fm':
            feats = scores['fm']
        elif self.fr_mode == 'ffmm':
            feats = np.concatenate((scores['ff'], scores['mm']), axis=0)
        elif self.fr_mode == 'mm':
            feats = scores['mm']
        else:
            raise ValueError('Invalid frame mode:', self.fr_mode)
        fr_feats.append(feats)
    fr_feats = np.array(fr_feats)
    # NOTE(review): scaler is refit on every call, so test-time features
    # are scaled independently of training — confirm this is intended.
    fr_feats = MinMaxScaler().fit_transform(fr_feats)
    if self.verbose:
        print('FRAME features:', fr_feats.shape)
    return fr_feats

def _rb_feats(self, X):
    """Rule-based classifier predictions as features."""
    if self.verbose:
        print('Building RULE-BASED features...')
    X_rb = [(x[1], x[2]) for x in X]  # (path, char_dict) pairs
    rb_feats = np.array(self.rb.predict(X_rb))
    if self.verbose:
        print('RB features:', rb_feats.shape)
    return rb_feats