def read_word_embeddings(embeddings_file: str) -> WordEmbeddings:
    """
    Loads the given embeddings (ASCII-formatted) into a WordEmbeddings object. Augments this with an UNK embedding
    that is the 0 vector. Reads in all embeddings with no filtering -- you should only use this for relativized
    word embedding files.
    :param embeddings_file: path to the file containing embeddings
    :return: WordEmbeddings object reflecting the words and their embeddings
    """
    f = open(embeddings_file)
    word_indexer = Indexer()
    vectors = []
    # Make position 0 a PAD token, which can be useful if you need to pad sequences out to a fixed length
    word_indexer.add_and_get_index("PAD")
    # Make position 1 the UNK token
    word_indexer.add_and_get_index("UNK")
    for line in f:
        if line.strip() != "":
            space_idx = line.find(' ')
            word = line[:space_idx]
            numbers = line[space_idx + 1:]
            float_numbers = [
                float(number_str) for number_str in numbers.split()
            ]
            vector = np.array(float_numbers)
            word_indexer.add_and_get_index(word)
            # Append the PAD and UNK vectors to start. Have to do this weirdly because we need to read the first line
            # of the file to see what the embedding dim is
            if len(vectors) == 0:
                vectors.append(np.zeros(vector.shape[0]))
                vectors.append(np.zeros(vector.shape[0]))
            vectors.append(vector)
    f.close()
    # Turn vectors into a 2-D numpy array
    return WordEmbeddings(word_indexer, np.array(vectors))
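All of these snippets revolve around an Indexer that maps objects to dense integer ids. The exact class differs from project to project, but a minimal sketch consistent with the calls used above (add_and_get_index, get_object, len) looks roughly like the following; treat it as an illustration of the assumed interface (the internal dict names are made up here), not as any project's actual implementation.

class Indexer:
    """Minimal sketch: a bijection between objects and contiguous integer ids."""

    def __init__(self):
        self.objs_to_ints = {}
        self.ints_to_objs = {}

    def __len__(self):
        return len(self.objs_to_ints)

    def add_and_get_index(self, obj):
        # Return the existing id if the object is known, otherwise assign the next free id.
        if obj not in self.objs_to_ints:
            new_idx = len(self.objs_to_ints)
            self.objs_to_ints[obj] = new_idx
            self.ints_to_objs[new_idx] = obj
        return self.objs_to_ints[obj]

    def get_object(self, index):
        # Reverse lookup; returns None for an unknown id.
        return self.ints_to_objs.get(index)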
Code Example #2
    def __init__(self, args, reduced_size=None, info={}):
        super(CNN, self).__init__()
        # disc_type=DISC_TYPE_MATRIX
        self.disc_type = disc_type = args.disc_type
        self.layer1 = nn.Sequential(
            nn.Conv2d(1, 4, kernel_size=2, padding=0),
            nn.ReLU())
        # output of layer1: (1, 4, 3, 3)
        self.layer2 = nn.Sequential(
            nn.Conv2d(4, 8, kernel_size=2),
            nn.ReLU())
        # output of layer2: (1, 8, 2, 2)
        # but for 5-line inputs (limerick), it is (1, 8, 3, 3)
        if args.data_type == "sonnet_endings":
            self.scorer = nn.Linear(2 * 2 * 8, 1)
        elif args.data_type == "limerick":
            self.scorer = nn.Linear(3 * 3 * 8, 1)
        self.predictor = nn.Sigmoid()
        self.args = args
        self.use_cuda = args.use_cuda

        ##
        self.g_indexer = Indexer(args)
        self.g_indexer.load('tmp/tmp_' + args.g2p_model_name + '/solver_g_indexer')
        self.g2pmodel = Model(H=info['H'], args=args, i_size=self.g_indexer.w_cnt, o_size=self.g_indexer.w_cnt,
                              start_idx=self.g_indexer.w2idx[utils.START])
        if not args.learn_g2p_encoder_from_scratch:
            print("=====" * 7, "LOADING g2p ENCODER PRETRAINED")
            model_dir = 'tmp/tmp_' + args.g2p_model_name + '/'
            state_dict_best = torch.load(model_dir + 'model_best')
            self.g2pmodel.load_state_dict(state_dict_best)
        if not args.trainable_g2p:
            assert not args.learn_g2p_encoder_from_scratch
            for param in self.g2pmodel.parameters():
                param.requires_grad = False
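A quick, self-contained way to check the shape comments above (the 4x4 and 5x5 input sizes are assumptions matching the two data types, not values taken from the original code):

import torch
import torch.nn as nn

layer1 = nn.Sequential(nn.Conv2d(1, 4, kernel_size=2, padding=0), nn.ReLU())
layer2 = nn.Sequential(nn.Conv2d(4, 8, kernel_size=2), nn.ReLU())

x4 = torch.zeros(1, 1, 4, 4)     # assumed 4x4 input ("sonnet_endings")
x5 = torch.zeros(1, 1, 5, 5)     # assumed 5x5 input ("limerick", 5 lines)
print(layer2(layer1(x4)).shape)  # torch.Size([1, 8, 2, 2]) -> nn.Linear(2 * 2 * 8, 1)
print(layer2(layer1(x5)).shape)  # torch.Size([1, 8, 3, 3]) -> nn.Linear(3 * 3 * 8, 1)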
Code Example #3
    def from_word_vectors(cls, word_vectors, unique_labels):
        """Instantiate the vectorizer"""
        review_vocab = word_vectors
        rating_vocab = Indexer()

        # Add ratings
        for l in unique_labels:
            rating_vocab.add_and_get_index(l)

        return cls(review_vocab, rating_vocab)
Code Example #4
    def __init__(self, max_word_length):
        vocab = [chr(ord('a') + i) for i in range(0, 26)] + [' ']
        self.char_vocab_index = Indexer()
        self.char_vocab_index.add_and_get_index(PAD_TOKEN)  # PAD is 0
        self.char_vocab_index.add_and_get_index(UNK_TOKEN)  # UNK is 1
        for char in vocab:
            self.char_vocab_index.add_and_get_index(char)

        self.max_word_length = max_word_length
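To show how this indexer and max_word_length are typically combined downstream, here is a hypothetical index_word helper (it is not part of the original class): it maps a word to a fixed-length list of character ids, reusing the PAD (0) and UNK (1) positions established in __init__.

def index_word(char_vocab_index, word, max_word_length):
    # Hypothetical helper: characters outside a-z/space fall back to UNK (id 1);
    # the result is truncated or padded with PAD (id 0) to exactly max_word_length ids.
    known = set('abcdefghijklmnopqrstuvwxyz ')
    char_ids = [
        char_vocab_index.add_and_get_index(c) if c in known else 1
        for c in word.lower()[:max_word_length]
    ]
    return char_ids + [0] * (max_word_length - len(char_ids))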
Code Example #5
    def __init__(self, args):
        super().__init__()

        self.args = args
        self.pad_token_id = args.pad_token_id

        # Initialize embedding layer (1)
        self.embedding = nn.Embedding(args.vocab_size, args.embedding_dim)

        # Initialize char embedding layer
        self.char_embedding = nn.Embedding(args.char_vocab_size,
                                           args.char_embedding_dim)

        # Initialize Context2Query (2)
        self.aligned_att = AlignedAttention(args.embedding_dim,
                                            args.char_embedding_dim)

        rnn_cell = nn.LSTM if args.rnn_cell_type == 'lstm' else nn.GRU

        # Initialize passage encoder (3)
        self.passage_rnn = rnn_cell(
            args.embedding_dim * 2,
            args.hidden_dim,
            bidirectional=args.bidirectional,
            batch_first=True,
        )

        # Initialize question encoder (4)
        self.question_rnn = rnn_cell(
            args.embedding_dim,
            args.hidden_dim,
            bidirectional=args.bidirectional,
            batch_first=True,
        )

        self.dropout = nn.Dropout(self.args.dropout)

        # Adjust hidden dimension if bidirectional RNNs are used
        _hidden_dim = args.hidden_dim * 2 if args.bidirectional else args.hidden_dim

        # Initialize attention layer for question attentive sum (5)
        self.question_att = SpanAttention(_hidden_dim)

        # Initialize bilinear layer for start positions (6)
        self.start_output = BilinearOutput(_hidden_dim, _hidden_dim)

        # Initialize bilinear layer for end positions (7)
        self.end_output = BilinearOutput(_hidden_dim, _hidden_dim)

        # Initialize char indexer
        vocab = [chr(ord('a') + i) for i in range(0, 26)] + [' ']
        self.char_vocab_index = Indexer()
        for char in vocab:
            self.char_vocab_index.add_and_get_index(char)
Code Example #6
    def load(self, specialTokenList=None):
        indexer = Indexer(specialTokenList)
        print("... loading training data.")
        trainPairs, trainLens = self._load_pairs(indexer,
                                                 self.dataDict['train_source'],
                                                 self.dataDict['train_target'])
        print("... loading test data.")
        testPairs, testLens = self._load_pairs(indexer,
                                               self.dataDict['test_source'],
                                               self.dataDict['test_target'])
        print("Done!\n")
        return indexer, trainPairs, trainLens, testPairs, testLens
Code Example #7
File: delicious.py  Project: krishna0709/npglm
def generate_indexer(usr_dataset, usr_bm_tg, feature_begin, feature_end):
    logging.info('generating indexer ...')
    indexer = Indexer(['user', 'tag', 'bookmark'])
    min_time = 1e30
    max_time = -1

    for line in usr_dataset[1:]:
        line_items = line.split('\t')
        contact_timestamp = float(line_items[2]) / 1000
        min_time = min(min_time, contact_timestamp)
        max_time = max(max_time, contact_timestamp)
        if feature_begin < contact_timestamp <= feature_end:
            indexer.index('user', line_items[0])
            indexer.index('user', line_items[1])

    for line in usr_bm_tg[1:]:
        line_items = line.split('\t')
        tag_timestamp = float(line_items[3]) / 1000
        if feature_begin < tag_timestamp <= feature_end:
            indexer.index('user', line_items[0])
            indexer.index('bookmark', line_items[1])
            indexer.index('tag', line_items[2])

    with open('delicious/data/metadata.txt', 'w') as output:
        output.write('Nodes:\n')
        output.write('-----------------------------\n')
        output.write('#Users: %d\n' % indexer.indices['user'])
        output.write('#Tags: %d\n' % indexer.indices['tag'])
        output.write('#Bookmarks: %d\n' % indexer.indices['bookmark'])
        output.write('\nEdges:\n')
        output.write('-----------------------------\n')
        output.write('#Contact: %d\n' % len(usr_dataset))
        output.write('#Save: %d\n' % len(usr_bm_tg))
        output.write('#Attach: %d\n' % len(usr_bm_tg))
        output.write('\nTime Span:\n')
        output.write('-----------------------------\n')
        output.write('From: %s\n' % datetime.fromtimestamp(min_time))
        output.write('To: %s\n' % datetime.fromtimestamp(max_time))

    return indexer
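The npglm examples (#7, #11, and #13) use a different, multi-type Indexer: it is constructed with a list of node types and exposes index(type, key), get_index(type, key), a per-type counter indices, and a per-type mapping dict. A minimal sketch inferred from those calls (the real class in krishna0709/npglm may differ) could be:

class Indexer:
    """Sketch of the multi-type indexer assumed by the npglm snippets."""

    def __init__(self, types):
        # One id space per node type, e.g. Indexer(['user', 'tag', 'bookmark']).
        self.indices = {t: 0 for t in types}   # next free id (== count) per type
        self.mapping = {t: {} for t in types}  # key -> id, per type

    def index(self, node_type, key):
        # Assign a fresh id the first time a key is seen, otherwise return the existing one.
        if key not in self.mapping[node_type]:
            self.mapping[node_type][key] = self.indices[node_type]
            self.indices[node_type] += 1
        return self.mapping[node_type][key]

    def get_index(self, node_type, key):
        # Lookup without inserting; None for unseen keys.
        return self.mapping[node_type].get(key)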
Code Example #8
import numpy as np
import torch
from torch.autograd import Variable

from solver import Solver
from preprocess.tacotron.utils import spectrogram2wav
#from preprocess.tacotron.audio import inv_spectrogram, save_wav
from scipy.io.wavfile import write
from preprocess.tacotron.mcep import mc2wav
# Hps and Indexer are project-local classes; their imports are not included in this excerpt.

if __name__ == '__main__':
    feature = 'sp'
    hps = Hps()
    hps.load('./hps/v19.json')
    hps_tuple = hps.get_tuple()
    solver = Solver(hps_tuple, None)
    solver.load_model('/storage/model/voice_conversion/v19/model.pkl-59999')
    if feature == 'mc':
        # indexer to extract data
        indexer = Indexer()
        src_mc = indexer.index(speaker_id='225',
                               utt_id='366',
                               dset='test',
                               feature='norm_mc')
        tar_mc = indexer.index(speaker_id='226',
                               utt_id='366',
                               dset='test',
                               feature='norm_mc')
        expand_src_mc = np.expand_dims(src_mc, axis=0)
        expand_tar_mc = np.expand_dims(tar_mc, axis=0)
        src_mc_tensor = torch.from_numpy(expand_src_mc).type(torch.FloatTensor)
        tar_mc_tensor = torch.from_numpy(expand_tar_mc).type(torch.FloatTensor)
        c1 = Variable(torch.from_numpy(np.array([0]))).cuda()
        c2 = Variable(torch.from_numpy(np.array([1]))).cuda()
        results = [src_mc]
Code Example #9
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, Dropout
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import re
import json
import pandas as pd
from utils import get_train_data_from_csv, get_dev_data_from_csv, get_test_data_from_csv, Indexer, get_indexer
from nltk.tokenize import TweetTokenizer
from sklearn.metrics import classification_report

include_test = True

tknr = TweetTokenizer()
indexer = get_indexer('indexer_15_dups.csv')
word_indexer = Indexer()
word_indexer.add_and_get_index("UNK")

train_data = get_train_data_from_csv('data/train_15_ds.csv')[0:1000]
dev_data = get_dev_data_from_csv('data/dev_15_ds.csv')[:200]
test_data = get_test_data_from_csv('data/test_15_ds.csv')[0:200]

X_train = []
Y_train = []
X_dev = []
Y_dev = []
Y_dev_true = []
X_test = []
Y_test = []
Y_test_true = []
Code Example #10
    def __init__(self):
        self.indexer = Indexer()
Code Example #11
File: dblp.py  Project: krishna0709/npglm
def generate_papers(datafile, feature_begin, feature_end, observation_begin,
                    observation_end, conf_list):
    logging.info('generating papers ...')

    # try:
    #     result = pickle.load(open('dblp/data/papers_%s.pkl' % path, 'rb'))
    #     return result
    # except IOError:
    #     pass

    indexer = Indexer(['author', 'paper', 'term', 'venue'])

    index, authors, title, year, venue = None, None, None, None, None
    references = []

    write = 0
    cite = 0
    include = 0
    published = 0

    min_year = 3000
    max_year = 0

    papers_feature_window = []
    papers_observation_window = []

    with open(datafile) as file:
        dataset = file.read().splitlines()

    for line in dataset:
        if not line:
            if year and venue:
                year = int(year)
                if year > 0 and authors and venue in conf_list:
                    min_year = min(min_year, year)
                    max_year = max(max_year, year)
                    authors = authors.split(',')
                    terms = parse_term(title)
                    write += len(authors)
                    cite += len(references)
                    include += len(terms)
                    published += 1

                    p = Paper(year)
                    if feature_begin < year <= feature_end:
                        p.id = indexer.index('paper', index)
                        p.terms = [
                            indexer.index('term', term) for term in terms
                        ]
                        p.references = [
                            indexer.index('paper', paper_id)
                            for paper_id in references
                        ]
                        p.authors = [
                            indexer.index('author', author_name)
                            for author_name in authors
                        ]
                        p.venue = indexer.index('venue', venue)
                        bisect.insort(papers_feature_window, p)
                    elif observation_begin < year <= observation_end:
                        p.references = references
                        p.authors = authors
                        papers_observation_window.append(p)

            index, authors, title, year, venue = None, None, None, None, None
            references = []
        else:
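            # Each record line in the DBLP dump starts with '#' plus a marker:
            # '#*' title, '#@' authors, '#t' year, '#c' venue, '#index' paper id, '#%' reference id.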
            begin = line[1]
            if begin == '*':
                title = line[2:]
            elif begin == '@':
                authors = line[2:]
            elif begin == 't':
                year = line[2:]
            elif begin == 'c':
                venue = line[2:]
            elif begin == 'i':
                index = line[6:]
            elif begin == '%':
                references.append(line[2:])

    for p in papers_observation_window:
        authors = []
        references = []
        for author in p.authors:
            author_id = indexer.get_index('author', author)
            if author_id is not None:
                authors.append(author_id)
        for ref in p.references:
            paper_id = indexer.get_index('paper', ref)
            if paper_id is not None:
                references.append(paper_id)
        p.authors = authors
        p.references = references

    with open('dblp/data/metadata_%s.txt' % path, 'w') as output:
        output.write('Nodes:\n')
        output.write('-----------------------------\n')
        output.write('#Authors: %d\n' % indexer.indices['author'])
        output.write('#Papers: %d\n' % indexer.indices['paper'])
        output.write('#Venues: %d\n' % indexer.indices['venue'])
        output.write('#Terms: %d\n\n' % indexer.indices['term'])
        output.write('\nEdges:\n')
        output.write('-----------------------------\n')
        output.write('#Write: %d\n' % write)
        output.write('#Cite: %d\n' % cite)
        output.write('#Publish: %d\n' % published)
        output.write('#Contain: %d\n' % include)
        output.write('\nTime Span:\n')
        output.write('-----------------------------\n')
        output.write('From: %s\n' % min_year)
        output.write('To: %s\n' % max_year)

    result = papers_feature_window, papers_observation_window, indexer.indices
    # pickle.dump(result, open('dblp/data/papers_%s.pkl' % path, 'wb'))
    return result
Code Example #12
	contains a header line and 45463 data lines,
	each line includes a mId and its overview (some sentences).
	'''
    movies.to_csv("processed_data/overviews.csv",
                  columns=['mId', 'overview'],
                  index=False)
    movies.to_csv("processed_data/mId2Title.csv",
                  columns=['mId', 'tmdbId', 'title'],
                  index=False)
    ''' create genres
	mId2Genre: 45463 lines, each line includes (mId, num of genres, gIds)
	Genre2Id:  20 lines, each line includes (gId, genre name)
	gId ranges from 45843 to 45862
	'''
    f = open("processed_data/mId2Genre.txt", "w")
    genreIdx = Indexer()
    for idx, row in movies.iterrows():
        mId, raw_genres = row['mId'], row['genres']
        raw_genres = raw_genres.replace("\'", "\"")
        genres_l = json.loads(raw_genres)
        f.write("%d %d" % (mId, len(genres_l)))
        for g in genres_l:
            f.write(" %d" % (genreIdx.add_and_get_index(g['name']) + id_base))
        f.write("\n")
    f.close()

    f = open("processed_data/Genre2Id.txt", "w")
    num_genres = len(genreIdx)
    for i in range(num_genres):
        f.write("%d %s\n" % (i + id_base, genreIdx.get_object(i)))
    f.close()
Code Example #13
File: movielens.py  Project: krishna0709/npglm
def generate_indexer(user_rates_movies_ds, user_tags_movies_ds, movie_actor_ds,
                     movie_director_ds, movie_genre_ds, movie_countries_ds,
                     feature_begin, feature_end):
    logging.info('generating indexer ...')
    min_time = 1e30
    max_time = -1
    indexer = Indexer(
        ['user', 'tag', 'movie', 'actor', 'director', 'genre', 'country'])

    for line in user_rates_movies_ds[1:]:
        line_items = line.split('\t')
        rating_timestamp = float(line_items[3]) / 1000
        min_time = min(min_time, rating_timestamp)
        max_time = max(max_time, rating_timestamp)
        rating = float(line_items[2])
        if feature_begin < rating_timestamp <= feature_end and rating > rating_threshold:
            indexer.index('user', line_items[0])
            indexer.index('movie', line_items[1])

    for line in user_tags_movies_ds[1:]:
        line_items = line.split('\t')
        tag_timestamp = float(line_items[3]) / 1000
        if feature_begin < tag_timestamp <= feature_end:
            indexer.index('user', line_items[0])
            indexer.index('movie', line_items[1])
            indexer.index('tag', line_items[2])

    for line in movie_actor_ds[1:]:
        line_items = line.split('\t')
        ranking = int(line_items[3])
        if ranking < actor_threshold and line_items[0] in indexer.mapping[
                'movie']:
            # indexer.index('movie', line_items[0])
            indexer.index('actor', line_items[1])

    for line in movie_director_ds[1:]:
        line_items = line.split('\t')
        if line_items[0] in indexer.mapping['movie']:
            # indexer.index('movie', line_items[0])
            indexer.index('director', line_items[1])

    for line in movie_genre_ds[1:]:
        line_items = line.split('\t')
        if line_items[0] in indexer.mapping['movie']:
            # indexer.index('movie', line_items[0])
            indexer.index('genre', line_items[1])

    for line in movie_countries_ds[1:]:
        line_items = line.split('\t')
        if line_items[0] in indexer.mapping['movie']:
            # indexer.index('movie', line_items[0])
            indexer.index('country', line_items[1])

    with open('movielens/data/metadata.txt', 'w') as output:
        output.write('Nodes:\n')
        output.write('-----------------------------\n')
        output.write('#Users: %d\n' % indexer.indices['user'])
        output.write('#Tags: %d\n' % indexer.indices['tag'])
        output.write('#Movies: %d\n' % indexer.indices['movie'])
        output.write('#Actors: %d\n' % indexer.indices['actor'])
        output.write('#Director: %d\n' % indexer.indices['director'])
        output.write('#Genre: %d\n' % indexer.indices['genre'])
        output.write('#Countries: %d\n' % indexer.indices['country'])
        output.write('\nEdges:\n')
        output.write('-----------------------------\n')
        output.write('#Rate: %d\n' % len(user_rates_movies_ds))
        output.write('#Attach: %d\n' % len(user_tags_movies_ds))
        output.write('#Played_by: %d\n' % len(movie_actor_ds))
        output.write('#Directed_by: %d\n' % len(movie_director_ds))
        output.write('#Has: %d\n' % len(movie_genre_ds))
        output.write('#Produced_in: %d\n' % len(movie_countries_ds))
        output.write('\nTime Span:\n')
        output.write('-----------------------------\n')
        output.write('From: %s\n' % datetime.fromtimestamp(min_time))
        output.write('To: %s\n' % datetime.fromtimestamp(max_time))

    return indexer