def generate_indexer(usr_dataset, usr_bm_tg, feature_begin, feature_end):
    """Index every user/tag/bookmark that appears inside the feature window.

    Both inputs are raw tab-separated text lines whose first row is a header
    (hence the ``[1:]`` slices).  Timestamps are in milliseconds and divided
    by 1000 to get seconds.  As a side effect, dataset statistics are written
    to ``delicious/data/metadata.txt``.

    Returns the populated ``Indexer``.
    """
    logging.info('generating indexer ...')
    indexer = Indexer(['user', 'tag', 'bookmark'])

    # Track the overall time span of the contact data, windowed or not.
    earliest, latest = 1e30, -1

    # User-user contact records: fields are (user, user, timestamp_ms, ...).
    for record in usr_dataset[1:]:
        fields = record.split('\t')
        contact_ts = float(fields[2]) / 1000
        earliest = min(earliest, contact_ts)
        latest = max(latest, contact_ts)
        if feature_begin < contact_ts <= feature_end:
            indexer.index('user', fields[0])
            indexer.index('user', fields[1])

    # User-bookmark-tag records: fields are (user, bookmark, tag, timestamp_ms).
    for record in usr_bm_tg[1:]:
        fields = record.split('\t')
        tag_ts = float(fields[3]) / 1000
        if feature_begin < tag_ts <= feature_end:
            indexer.index('user', fields[0])
            indexer.index('bookmark', fields[1])
            indexer.index('tag', fields[2])

    # Dump node/edge counts and the observed time span for bookkeeping.
    # NOTE(review): the edge counts use the raw line counts, which include the
    # header row — presumably close enough for metadata; confirm if exactness matters.
    with open('delicious/data/metadata.txt', 'w') as output:
        output.write('Nodes:\n')
        output.write('-----------------------------\n')
        output.write('#Users: %d\n' % indexer.indices['user'])
        output.write('#Tags: %d\n' % indexer.indices['tag'])
        output.write('#Bookmarks: %d\n' % indexer.indices['bookmark'])
        output.write('\nEdges:\n')
        output.write('-----------------------------\n')
        output.write('#Contact: %d\n' % len(usr_dataset))
        output.write('#Save : %d\n' % len(usr_bm_tg))
        output.write('#Attach: %d\n' % len(usr_bm_tg))
        output.write('\nTime Span:\n')
        output.write('-----------------------------\n')
        output.write('From: %s\n' % datetime.fromtimestamp(earliest))
        output.write('To: %s\n' % datetime.fromtimestamp(latest))
    return indexer
#from preprocess.tacotron.audio import inv_spectrogram, save_wav from scipy.io.wavfile import write from preprocess.tacotron.mcep import mc2wav if __name__ == '__main__': feature = 'sp' hps = Hps() hps.load('./hps/v19.json') hps_tuple = hps.get_tuple() solver = Solver(hps_tuple, None) solver.load_model('/storage/model/voice_conversion/v19/model.pkl-59999') if feature == 'mc': # indexer to extract data indexer = Indexer() src_mc = indexer.index(speaker_id='225', utt_id='366', dset='test', feature='norm_mc') tar_mc = indexer.index(speaker_id='226', utt_id='366', dset='test', feature='norm_mc') expand_src_mc = np.expand_dims(src_mc, axis=0) expand_tar_mc = np.expand_dims(tar_mc, axis=0) src_mc_tensor = torch.from_numpy(expand_src_mc).type(torch.FloatTensor) tar_mc_tensor = torch.from_numpy(expand_tar_mc).type(torch.FloatTensor) c1 = Variable(torch.from_numpy(np.array([0]))).cuda() c2 = Variable(torch.from_numpy(np.array([1]))).cuda() results = [src_mc] result = solver.test_step(src_mc_tensor, c1) result = result.squeeze(axis=0).transpose((1, 0)) results.append(result)
def generate_papers(datafile, feature_begin, feature_end, observation_begin, observation_end, conf_list):
    """Parse a DBLP-style dump into papers for the feature and observation windows.

    The file is a sequence of blank-line-separated records whose lines are
    tagged by their second character: '*' title, '@' authors, 't' year,
    'c' venue, 'i' index id, '%' a reference id.  Papers published at an
    accepted venue (``conf_list``) are indexed into the feature window
    (feature_begin < year <= feature_end) or kept raw for the observation
    window (observation_begin < year <= observation_end).

    Returns (papers_feature_window, papers_observation_window, indexer.indices).
    """
    logging.info('generating papers ...')
    # try:
    #     result = pickle.load(open('dblp/data/papers_%s.pkl' % path, 'rb'))
    #     return result
    # except IOError:
    #     pass
    indexer = Indexer(['author', 'paper', 'term', 'venue'])
    # Accumulator for the record currently being parsed.
    index, authors, title, year, venue = None, None, None, None, None
    references = []
    # Edge counters for the metadata report.
    write = 0
    cite = 0
    include = 0
    published = 0
    min_year = 3000
    max_year = 0
    papers_feature_window = []
    papers_observation_window = []
    with open(datafile) as file:
        dataset = file.read().splitlines()
    for line in dataset:
        if not line:
            # Blank line: flush the completed record, if it has the
            # mandatory fields.
            if year and venue:
                year = int(year)
                if year > 0 and authors and venue in conf_list:
                    min_year = min(min_year, year)
                    max_year = max(max_year, year)
                    authors = authors.split(',')
                    terms = parse_term(title)
                    write += len(authors)
                    cite += len(references)
                    include += len(terms)
                    published += 1
                    p = Paper(year)
                    if feature_begin < year <= feature_end:
                        # Feature window: resolve every entity to an index
                        # and keep the list sorted by year (Paper ordering).
                        p.id = indexer.index('paper', index)
                        p.terms = [
                            indexer.index('term', term) for term in terms
                        ]
                        p.references = [
                            indexer.index('paper', paper_id) for paper_id in references
                        ]
                        p.authors = [
                            indexer.index('author', author_name) for author_name in authors
                        ]
                        p.venue = indexer.index('venue', venue)
                        bisect.insort(papers_feature_window, p)
                    elif observation_begin < year <= observation_end:
                        # Observation window: keep raw names/ids; they are
                        # remapped against the feature indexer below.
                        p.references = references
                        p.authors = authors
                        papers_observation_window.append(p)
            # Reset the accumulator for the next record.
            index, authors, title, year, venue = None, None, None, None, None
            references = []
        else:
            # Tag character sits at position 1 (after the leading '#').
            begin = line[1]
            if begin == '*':
                title = line[2:]
            elif begin == '@':
                authors = line[2:]
            elif begin == 't':
                year = line[2:]
            elif begin == 'c':
                venue = line[2:]
            elif begin == 'i':
                # '#index ' prefix is 6 characters long before the id.
                index = line[6:]
            elif begin == '%':
                references.append(line[2:])
    # Remap observation-window papers onto feature-window indices, dropping
    # authors/references that never appeared in the feature window.
    for p in papers_observation_window:
        authors = []
        references = []
        for author in p.authors:
            author_id = indexer.get_index('author', author)
            if author_id is not None:
                authors.append(author_id)
        for ref in p.references:
            paper_id = indexer.get_index('paper', ref)
            if paper_id is not None:
                references.append(paper_id)
        p.authors = authors
        p.references = references
    # NOTE(review): `path` is not defined in this function — presumably a
    # module-level global naming the dataset variant; confirm it is set
    # before this call.
    with open('dblp/data/metadata_%s.txt' % path, 'w') as output:
        output.write('Nodes:\n')
        output.write('-----------------------------\n')
        output.write('#Authors: %d\n' % indexer.indices['author'])
        output.write('#Papers: %d\n' % indexer.indices['paper'])
        output.write('#Venues: %d\n' % indexer.indices['venue'])
        output.write('#Terms: %d\n\n' % indexer.indices['term'])
        output.write('\nEdges:\n')
        output.write('-----------------------------\n')
        output.write('#Write: %d\n' % write)
        output.write('#Cite: %d\n' % cite)
        output.write('#Publish: %d\n' % published)
        output.write('#Contain: %d\n' % include)
        output.write('\nTime Span:\n')
        output.write('-----------------------------\n')
        output.write('From: %s\n' % min_year)
        output.write('To: %s\n' % max_year)
    result = papers_feature_window, papers_observation_window, indexer.indices
    # pickle.dump(result, open('dblp/data/papers_%s.pkl' % path, 'wb'))
    return result
def generate_indexer(user_rates_movies_ds, user_tags_movies_ds, movie_actor_ds,
                     movie_director_ds, movie_genre_ds, movie_countries_ds,
                     feature_begin, feature_end):
    """Index movielens entities that appear inside the feature window.

    All inputs are raw tab-separated text lines whose first row is a header
    (hence the ``[1:]`` slices).  Timestamps are in milliseconds and divided
    by 1000 to get seconds.  Users/movies enter the index via ratings above
    the module-level ``rating_threshold`` or via tagging events; actors,
    directors, genres and countries are indexed only for movies already in
    the index (actors additionally filtered by ``actor_threshold`` billing
    rank).  As a side effect, dataset statistics are written to
    ``movielens/data/metadata.txt``.

    Returns the populated ``Indexer``.
    """
    logging.info('generating indexer ...')
    # Track the overall time span of the rating data, windowed or not.
    min_time = 1e30
    max_time = -1
    indexer = Indexer(
        ['user', 'tag', 'movie', 'actor', 'director', 'genre', 'country'])

    # Rating records: (user, movie, rating, timestamp_ms).
    for line in user_rates_movies_ds[1:]:
        line_items = line.split('\t')
        rating_timestamp = float(line_items[3]) / 1000
        min_time = min(min_time, rating_timestamp)
        max_time = max(max_time, rating_timestamp)
        rating = float(line_items[2])
        # Only sufficiently positive ratings inside the window create nodes.
        if feature_begin < rating_timestamp <= feature_end and rating > rating_threshold:
            indexer.index('user', line_items[0])
            indexer.index('movie', line_items[1])

    # Tagging records: (user, movie, tag, timestamp_ms).
    for line in user_tags_movies_ds[1:]:
        line_items = line.split('\t')
        tag_timestamp = float(line_items[3]) / 1000
        if feature_begin < tag_timestamp <= feature_end:
            indexer.index('user', line_items[0])
            indexer.index('movie', line_items[1])
            indexer.index('tag', line_items[2])

    # Actor records: (movie, actor, _, ranking); keep only prominent billing
    # for movies already indexed above.
    for line in movie_actor_ds[1:]:
        line_items = line.split('\t')
        ranking = int(line_items[3])
        if ranking < actor_threshold and line_items[0] in indexer.mapping['movie']:
            indexer.index('actor', line_items[1])

    # Director / genre / country records: (movie, entity, ...); indexed only
    # for movies already present.
    for line in movie_director_ds[1:]:
        line_items = line.split('\t')
        if line_items[0] in indexer.mapping['movie']:
            indexer.index('director', line_items[1])

    for line in movie_genre_ds[1:]:
        line_items = line.split('\t')
        if line_items[0] in indexer.mapping['movie']:
            indexer.index('genre', line_items[1])

    for line in movie_countries_ds[1:]:
        line_items = line.split('\t')
        if line_items[0] in indexer.mapping['movie']:
            indexer.index('country', line_items[1])

    # Dump node/edge counts and the observed time span for bookkeeping.
    # NOTE(review): edge counts use raw line counts (header row included);
    # fromtimestamp will fail if the rating dataset is empty — confirm inputs
    # are always non-trivial.
    with open('movielens/data/metadata.txt', 'w') as output:
        output.write('Nodes:\n')
        output.write('-----------------------------\n')
        output.write('#Users: %d\n' % indexer.indices['user'])
        output.write('#Tags: %d\n' % indexer.indices['tag'])
        output.write('#Movies: %d\n' % indexer.indices['movie'])
        output.write('#Actors: %d\n' % indexer.indices['actor'])
        output.write('#Director: %d\n' % indexer.indices['director'])
        output.write('#Genre: %d\n' % indexer.indices['genre'])
        # Fixed typo in the label: was '#Countriy'.
        output.write('#Country: %d\n' % indexer.indices['country'])
        output.write('\nEdges:\n')
        output.write('-----------------------------\n')
        output.write('#Rate: %d\n' % len(user_rates_movies_ds))
        output.write('#Attach: %d\n' % len(user_tags_movies_ds))
        output.write('#Played_by: %d\n' % len(movie_actor_ds))
        output.write('#Directed_by : %d\n' % len(movie_director_ds))
        output.write('#Has: %d\n' % len(movie_genre_ds))
        output.write('#Produced_in: %d\n' % len(movie_countries_ds))
        output.write('\nTime Span:\n')
        output.write('-----------------------------\n')
        output.write('From: %s\n' % datetime.fromtimestamp(min_time))
        output.write('To: %s\n' % datetime.fromtimestamp(max_time))
    return indexer