Example #1
class CNN(nn.Module):
    def __init__(self, args, reduced_size=None, info={}):
        super(CNN, self).__init__()
        # disc_type=DISC_TYPE_MATRIX
        self.disc_type = disc_type = args.disc_type
        self.layer1 = nn.Sequential(
            nn.Conv2d(1, 4, kernel_size=2, padding=0),
            nn.ReLU())
        # 1,4,3,3
        self.layer2 = nn.Sequential(
            nn.Conv2d(4, 8, kernel_size=2),
            nn.ReLU())
        # 1,8,2,2
        ## but for 5 lines, it is 1,8,3,3
        if args.data_type == "sonnet_endings":
            self.scorer = nn.Linear(2 * 2 * 8, 1)
        elif args.data_type == "limerick":
            self.scorer = nn.Linear(3 * 3 * 8, 1)
        self.predictor = nn.Sigmoid()
        self.args = args
        self.use_cuda = args.use_cuda

        ##
        self.g_indexer = Indexer(args)
        self.g_indexer.load('tmp/tmp_' + args.g2p_model_name + '/solver_g_indexer')
        self.g2pmodel = Model(H=info['H'], args=args, i_size=self.g_indexer.w_cnt, o_size=self.g_indexer.w_cnt,
                              start_idx=self.g_indexer.w2idx[utils.START])
        if not args.learn_g2p_encoder_from_scratch:
            print("=====" * 7, "LOADING g2p ENCODER PRETRAINED")
            model_dir = 'tmp/tmp_' + args.g2p_model_name + '/'
            state_dict_best = torch.load(model_dir + 'model_best')
            self.g2pmodel.load_state_dict(state_dict_best)
        if not args.trainable_g2p:
            assert not args.learn_g2p_encoder_from_scratch
            for param in self.g2pmodel.parameters():
                param.requires_grad = False
Example #2
class Crawler:
  def __init__(self, list_file):
    self.logger = Logger.get_logger(utils.get_fullname(self))
    self.list_file = list_file
    self.indexer = Indexer()
    self._client = None

  # Parse the RSS list file and populate self.feeds_list (returns -1 if the file cannot be read)
  def parse_list(self, list_file):
    try:
      self.logger.info('Opening RSS file: %s' % list_file)
      f = open(list_file, 'r')
    except IOError:
      self.logger.error('Cannot read file: %s' % list_file)
      return -1

    self.feeds_list = []
    
    line = f.readline()
    while line: 
      self.logger.debug('Reading: %s' % line)
      feeds = feedparser.parse(line)
      try:
        # default only get the latest entry
        raw_f = feeds['entries'][0]
        feed_item = EzrssFeed(raw_f.link)
        feed_item.parse_name(raw_f.summary or raw_f.value)
        feed_item.parse_season(raw_f.summary or raw_f.value)
        self.feeds_list.append(feed_item)
      except IndexError:
        pass
      line = f.readline()
    f.close()

  @property
  def client(self):
    return self._client

  @client.setter
  def client(self, c):
    self._client = c

  def run(self):
    self.parse_list(self.list_file) 
    self.logger.info('Start checking latest RSS feeds')
    for feed in self.feeds_list:
      if feed.name:
        if not self.indexer.episode_exists(feed.name, feed.url):
          save_path = os.path.join(getattr(settings, 'SAVE_DIR'),
                                   feed.name,
                                   feed.season)
          self.client.start_from_url(feed.url, save_path)
          self.indexer.save(feed.name, feed.url)
      else:
        continue

    self.logger.info('Exiting application')
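
As a usage note for the Crawler example above: a minimal wiring sketch, assuming a stub download client and a 'feeds.lst' path (both are illustrative assumptions, not part of the original code).

class DummyClient(object):
  # Stand-in for a real download client; anything exposing
  # start_from_url(url, save_path) works with Crawler.run().
  def start_from_url(self, url, save_path):
    pass  # e.g. hand the enclosure URL to a downloader

if __name__ == '__main__':
  crawler = Crawler('feeds.lst')   # one RSS feed URL per line (assumed format)
  crawler.client = DummyClient()
  crawler.run()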
    @classmethod
    def from_word_vectors(cls, word_vectors, unique_labels):
        """Instantiate the vectorizer"""
        review_vocab = word_vectors
        rating_vocab = Indexer()

        # Add ratings
        for l in unique_labels:
            rating_vocab.add_and_get_index(l)

        return cls(review_vocab, rating_vocab)
def read_word_embeddings(embeddings_file: str) -> WordEmbeddings:
    """
    Loads the given embeddings (ASCII-formatted) into a WordEmbeddings object. Augments this with an UNK embedding
    that is the 0 vector. Reads in all embeddings with no filtering -- you should only use this for relativized
    word embedding files.
    :param embeddings_file: path to the file containing embeddings
    :return: WordEmbeddings object reflecting the words and their embeddings
    """
    f = open(embeddings_file)
    word_indexer = Indexer()
    vectors = []
    # Make position 0 a PAD token, which can be useful if you
    word_indexer.add_and_get_index("PAD")
    # Make position 1 the UNK token
    word_indexer.add_and_get_index("UNK")
    for line in f:
        if line.strip() != "":
            space_idx = line.find(' ')
            word = line[:space_idx]
            numbers = line[space_idx + 1:]
            float_numbers = [
                float(number_str) for number_str in numbers.split()
            ]
            vector = np.array(float_numbers)
            word_indexer.add_and_get_index(word)
            # Append the PAD and UNK vectors to start. Have to do this weirdly because we need to read the first line
            # of the file to see what the embedding dim is
            if len(vectors) == 0:
                vectors.append(np.zeros(vector.shape[0]))
                vectors.append(np.zeros(vector.shape[0]))
            vectors.append(vector)
    f.close()
    # Turn vectors into a 2-D numpy array
    return WordEmbeddings(word_indexer, np.array(vectors))
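
A short usage sketch for read_word_embeddings, assuming the returned WordEmbeddings keeps the indexer and vector matrix it is constructed with (the attribute names below are assumptions) and that Indexer.index_of returns -1 for unseen words; the file path is illustrative.

embs = read_word_embeddings("data/glove.6B.50d-relativized.txt")  # hypothetical path
idx = embs.word_indexer.index_of("the")      # assumed: -1 when the word is unknown
if idx < 0:
    idx = embs.word_indexer.index_of("UNK")  # fall back to the UNK row added above
vec = embs.vectors[idx]                      # one row of the 2-D numpy matrix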
class CharTokenizer:
    """
    Class to create char tokens
    """
    def __init__(self, max_word_length):
        vocab = [chr(ord('a') + i) for i in range(0, 26)] + [' ']
        self.char_vocab_index = Indexer()
        self.char_vocab_index.add_and_get_index(PAD_TOKEN)  # PAD is 0
        self.char_vocab_index.add_and_get_index(
            UNK_TOKEN)  # Unknown token is 1
        for char in vocab:
            self.char_vocab_index.add_and_get_index(char)

        self.max_word_length = max_word_length

    def convert_words_to_charids(self, words):
        word_charids = []
        for w in words:
            charids = []
            for c in w:
                charids.append(self.char_vocab_index.index_of(c))
            charids = charids[:self.max_word_length]
            if len(charids) < self.max_word_length:
                charids.extend([0] * (self.max_word_length - len(charids)))
            word_charids.append(charids)

        return word_charids
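
A brief sketch of the CharTokenizer above in use; it assumes PAD_TOKEN and UNK_TOKEN are defined as in the snippet (indices 0 and 1).

tokenizer = CharTokenizer(max_word_length=6)
char_ids = tokenizer.convert_words_to_charids(["cat", "embeddings"])
# char_ids[0]: ids for 'c', 'a', 't' followed by three 0s (PAD) -> length 6
# char_ids[1]: ids for the first six characters of 'embeddings' (truncated)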
 def load(self, specialTokenList=None):
     indexer = Indexer(specialTokenList)
     print "... loading training data."
     trainPairs,trainLens = self._load_pairs(indexer,
                                             self.dataDict['train_source'],
                                             self.dataDict['train_target'])
     print "... loading test data."
     testPairs,testLens = self._load_pairs(indexer,
                                           self.dataDict['test_source'],
                                           self.dataDict['test_target'])
     print "Done!\n"
     return indexer,trainPairs,trainLens,testPairs,testLens
Example #9
def generate_indexer(usr_dataset, usr_bm_tg, feature_begin, feature_end):
    logging.info('generating indexer ...')
    indexer = Indexer(['user', 'tag', 'bookmark'])
    min_time = 1e30
    max_time = -1

    for line in usr_dataset[1:]:
        line_items = line.split('\t')
        contact_timestamp = float(line_items[2]) / 1000
        min_time = min(min_time, contact_timestamp)
        max_time = max(max_time, contact_timestamp)
        if feature_begin < contact_timestamp <= feature_end:
            indexer.index('user', line_items[0])
            indexer.index('user', line_items[1])

    for line in usr_bm_tg[1:]:
        line_items = line.split('\t')
        tag_timestamp = float(line_items[3]) / 1000
        if feature_begin < tag_timestamp <= feature_end:
            indexer.index('user', line_items[0])
            indexer.index('bookmark', line_items[1])
            indexer.index('tag', line_items[2])

    with open('delicious/data/metadata.txt', 'w') as output:
        output.write('Nodes:\n')
        output.write('-----------------------------\n')
        output.write('#Users: %d\n' % indexer.indices['user'])
        output.write('#Tags: %d\n' % indexer.indices['tag'])
        output.write('#Bookmarks: %d\n' % indexer.indices['bookmark'])
        output.write('\nEdges:\n')
        output.write('-----------------------------\n')
        output.write('#Contact: %d\n' % len(usr_dataset))
        output.write('#Save: %d\n' % len(usr_bm_tg))
        output.write('#Attach: %d\n' % len(usr_bm_tg))
        output.write('\nTime Span:\n')
        output.write('-----------------------------\n')
        output.write('From: %s\n' % datetime.fromtimestamp(min_time))
        output.write('To: %s\n' % datetime.fromtimestamp(max_time))

    return indexer
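
The graph-construction examples here only rely on a small per-category Indexer interface (index, get_index, mapping, indices); the sketch below mirrors that interface as an assumption, since the real class is imported rather than shown.

class CategoryIndexer(object):
    # Sketch only: reflects how these examples call the imported Indexer,
    # not its actual implementation.
    def __init__(self, categories):
        self.mapping = {c: {} for c in categories}   # category -> {key: id}
        self.indices = {c: 0 for c in categories}    # category -> count of ids issued

    def index(self, category, key):
        # Assign a new sequential id on first sight, then reuse it.
        if key not in self.mapping[category]:
            self.mapping[category][key] = self.indices[category]
            self.indices[category] += 1
        return self.mapping[category][key]

    def get_index(self, category, key):
        # None when the key was never indexed (matches the `is not None` checks).
        return self.mapping[category].get(key)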
Example #10
class FeatureExtractor():
    def __init__(self):
        self.indexer = Indexer()

    def get_indexer(self):
        return self.indexer

    def extract_features(self, ex):
        feature_vector = np.zeros(len(self.indexer))
        for word in ex.text:
            index = self.indexer.index_of(word)
            feature_vector[index] += 1
        return feature_vector
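
A small sketch of the FeatureExtractor above in action, assuming examples carry a tokenized .text field and that the vocabulary is added to the indexer before extraction (the Example namedtuple is a stand-in, not the project's class).

from collections import namedtuple
Example = namedtuple('Example', ['text'])   # stand-in for the real example type

extractor = FeatureExtractor()
for word in ["good", "fun", "movie"]:
    extractor.get_indexer().add_and_get_index(word)   # build the vocabulary first

feats = extractor.extract_features(Example(text=["good", "good", "movie"]))
# feats is a bag-of-words count vector over the indexed vocabulary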
	contains a header line and 45463 data lines,
	each line includes a mId and its overview (some sentences).
	'''
    movies.to_csv("processed_data/overviews.csv",
                  columns=['mId', 'overview'],
                  index=False)
    movies.to_csv("processed_data/mId2Title.csv",
                  columns=['mId', 'tmdbId', 'title'],
                  index=False)
    ''' create genres
	mId2Genre: 45463 lines, each line includes (mId, num of genres, gIds)
	Genre2Id:  20 lines, each line includes (gId, genre name)
	gId ranges from 45843 to 45862
	'''
    f = open("processed_data/mId2Genre.txt", "w")
    genreIdx = Indexer()
    for idx, row in movies.iterrows():
        mId, raw_genres = row['mId'], row['genres']
        raw_genres = raw_genres.replace("\'", "\"")
        genres_l = json.loads(raw_genres)
        f.write("%d %d" % (mId, len(genres_l)))
        for g in genres_l:
            f.write(" %d" % (genreIdx.add_and_get_index(g['name']) + id_base))
        f.write("\n")
    f.close()

    f = open("processed_data/Genre2Id.txt", "w")
    num_genres = len(genreIdx)
    for i in range(num_genres):
        f.write("%d %s\n" % (i + id_base, genreIdx.get_object(i)))
    f.close()
Example #12
            labels, emoji_labels = get_labels(tweet, indexer)
            label = get_most_recent_label(tweet, emoji_labels, indexer)
        except:
            continue

        cleaned_text = clean_tweet(tweet)
        datapoint = DataPoint(cleaned_text, label)
        dataset.append(datapoint)
        label_counter[indexer.get_object(label)] += 1
        count += 1
        if count % 500000 == 0:
            print("created", count, "datapoints")

    return dataset

indexer = Indexer()
label_counter = Counter()
dataset = create_dataset(tweets, indexer, label_counter)

print ("length of dataset: ", len(dataset))

from tokenizer import tokenizer as vinay
v = vinay.TweetTokenizer(regularize=True, preserve_len=False)


word_cnts = Counter()
def count_words(text):
    words = v.tokenize(text)
    for word in words:
        word_cnts[word] += 1
class CharBaselineReader(nn.Module):
    """
    Baseline QA Model
    [Architecture]
        0) Inputs: passages and questions
        1) Embedding Layer: converts words to vectors
        2) Context2Query: computes weighted sum of question embeddings for
               each position in passage.
        3) Passage Encoder: LSTM or GRU.
        4) Question Encoder: LSTM or GRU.
        5) Question Attentive Sum: computes weighted sum of question hidden.
        6) Start Position Pointer: computes scores (logits) over passage
               conditioned on the question vector.
        7) End Position Pointer: computes scores (logits) over passage
               conditioned on the question vector.

    Args:
        args: `argparse` object.

    Inputs:
        batch: a dictionary containing batched tensors.
            {
                'passages': LongTensor [batch_size, p_len],
                'questions': LongTensor [batch_size, q_len],
                'start_positions': Not used in `forward`,
                'end_positions': Not used in `forward`,
            }

    Returns:
        Logits for start positions and logits for end positions.
        Tuple: ([batch_size, p_len], [batch_size, p_len])
    """
    def __init__(self, args):
        super().__init__()

        self.args = args
        self.pad_token_id = args.pad_token_id

        # Initialize embedding layer (1)
        self.embedding = nn.Embedding(args.vocab_size, args.embedding_dim)

        # Initialize char embedding layer
        self.char_embedding = nn.Embedding(args.char_vocab_size,
                                           args.char_embedding_dim)

        # Initialize Context2Query (2)
        self.aligned_att = AlignedAttention(args.embedding_dim,
                                            args.char_embedding_dim)

        rnn_cell = nn.LSTM if args.rnn_cell_type == 'lstm' else nn.GRU

        # Initialize passage encoder (3)
        self.passage_rnn = rnn_cell(
            args.embedding_dim * 2,
            args.hidden_dim,
            bidirectional=args.bidirectional,
            batch_first=True,
        )

        # Initialize question encoder (4)
        self.question_rnn = rnn_cell(
            args.embedding_dim,
            args.hidden_dim,
            bidirectional=args.bidirectional,
            batch_first=True,
        )

        self.dropout = nn.Dropout(self.args.dropout)

        # Adjust hidden dimension if bidirectional RNNs are used
        _hidden_dim = (args.hidden_dim *
                       2 if args.bidirectional else args.hidden_dim)

        # Initialize attention layer for question attentive sum (5)
        self.question_att = SpanAttention(_hidden_dim)

        # Initialize bilinear layer for start positions (6)
        self.start_output = BilinearOutput(_hidden_dim, _hidden_dim)

        # Initialize bilinear layer for end positions (7)
        self.end_output = BilinearOutput(_hidden_dim, _hidden_dim)

        # Initialize char indexer
        vocab = [chr(ord('a') + i) for i in range(0, 26)] + [' ']
        self.char_vocab_index = Indexer()
        for char in vocab:
            self.char_vocab_index.add_and_get_index(char)

    def load_pretrained_embeddings(self, vocabulary, path):
        """
        Loads GloVe vectors and initializes the embedding matrix.

        Args:
            vocabulary: `Vocabulary` object.
            path: Embedding path, e.g. "glove/glove.6B.300d.txt".
        """

        if self.args.embedding == 'glove':
            embedding_map = load_cached_embeddings(path)

            # Create embedding matrix. By default, embeddings are randomly
            # initialized from Uniform(-0.1, 0.1).
            embeddings = torch.zeros(
                (len(vocabulary),
                 self.args.embedding_dim)).uniform_(-0.1, 0.1)

            # Initialize pre-trained embeddings.
            num_pretrained = 0
            for (i, word) in enumerate(vocabulary.words):
                if word in embedding_map:
                    embeddings[i] = torch.tensor(embedding_map[word])
                    num_pretrained += 1

            # Place embedding matrix on GPU.
            self.embedding.weight.data = cuda(self.args, embeddings)
        else:
            #####################
            # Loads Fasttext embeddings
            embedding_map = load_fasttext_embeddings(path)

            # Create embedding matrix. By default, embeddings are randomly
            # initialized from Uniform(-0.1, 0.1).
            embeddings = torch.zeros(
                (len(vocabulary),
                 self.args.embedding_dim)).uniform_(-0.1, 0.1)

            # Initialize pre-trained embeddings.
            num_pretrained = 0
            for (i, word) in enumerate(vocabulary.words):
                embeddings[i] = torch.tensor(
                    embedding_map.get_word_vector(word))
                num_pretrained += 1

            # Place embedding matrix on GPU.
            self.embedding.weight.data = cuda(self.args, embeddings)

        return num_pretrained

    def sorted_rnn(self, sequences, sequence_lengths, rnn):
        """
        Sorts and packs inputs, then feeds them into RNN.

        Args:
            sequences: Input sequences, [batch_size, len, dim].
            sequence_lengths: Lengths for each sequence, [batch_size].
            rnn: Registered LSTM or GRU.

        Returns:
            All hidden states, [batch_size, len, hid].
        """
        # Sort input sequences
        sorted_inputs, sorted_sequence_lengths, restoration_indices = _sort_batch_by_length(
            sequences, sequence_lengths)
        # Pack input sequences
        packed_sequence_input = pack_padded_sequence(
            sorted_inputs,
            sorted_sequence_lengths.data.long().tolist(),
            batch_first=True)
        # Run RNN
        packed_sequence_output, _ = rnn(packed_sequence_input, None)
        # Unpack hidden states
        unpacked_sequence_tensor, _ = pad_packed_sequence(
            packed_sequence_output, batch_first=True)
        # Restore the original order in the batch and return all hidden states
        return unpacked_sequence_tensor.index_select(0, restoration_indices)

    def forward(self, batch):
        # Obtain masks and lengths for passage and question.
        passage_mask = (batch['passages'] != self.pad_token_id
                        )  # [batch_size, p_len]
        question_mask = (batch['questions'] != self.pad_token_id
                         )  # [batch_size, q_len]
        passage_lengths = passage_mask.long().sum(-1)  # [batch_size]
        question_lengths = question_mask.long().sum(-1)  # [batch_size]

        # 1) Embedding Layer: Embed the passage and question.
        passage_embeddings = self.embedding(
            batch['passages'])  # [batch_size, p_len, p_dim]
        question_embeddings = self.embedding(
            batch['questions'])  # [batch_size, q_len, q_dim]

        passage_char_embeddings = self.char_embedding(
            batch['char_passages']
        )  # [batch_size, p_len, word_length, word_dim] [64, 168, 16, 64]
        question_char_embeddings = self.char_embedding(
            batch['char_questions']
        )  # [batch_size, q_len, word_length, word_dim]

        if self.args.char_embedding_type == 'average':
            # Average char embeddings baseline
            passage_char_embeddings_avg = passage_char_embeddings.mean(
                dim=2).squeeze(0)
            question_char_embeddings_avg = question_char_embeddings.mean(
                dim=2).squeeze(0)

            passage_final_embeddings = torch.cat(
                [passage_embeddings, passage_char_embeddings_avg], dim=2)
            question_final_embeddings = torch.cat(
                [question_embeddings, question_char_embeddings_avg], dim=2)
            #print('passage_char_embeddings ', passage_char_embeddings.shape)
            #print('question_char_embeddings ', question_char_embeddings.shape)

        else:
            # Conv 1D char embeddings
            passage_char_embeddings_conv1d_input = passage_char_embeddings.reshape(
                (-1, passage_char_embeddings.shape[3],
                 passage_char_embeddings.shape[2]))
            question_char_embeddings_conv1d_input = question_char_embeddings.reshape(
                (-1, question_char_embeddings.shape[3],
                 question_char_embeddings.shape[2]))

            conv1d = torch.nn.Conv1d(self.args.char_embedding_dim,
                                     self.args.char_embedding_dim, 3)
            relu = torch.nn.ReLU()

            if torch.cuda.is_available():
                conv1d.cuda()
                relu.cuda()

            passage_char_embeddings_tmp1 = relu(
                conv1d(passage_char_embeddings_conv1d_input))
            # Last dimension of conv1d output we want to collapse using global max pooling
            passage_char_embeddings_final = torch.nn.functional.max_pool1d(
                passage_char_embeddings_tmp1,
                passage_char_embeddings_tmp1.shape[2]).squeeze(2).reshape(
                    passage_char_embeddings.shape[0],
                    passage_char_embeddings.shape[1], -1)

            question_char_embeddings_tmp1 = relu(
                conv1d(question_char_embeddings_conv1d_input))
            # Last dimension of conv1d output we want to collapse using global max pooling
            question_char_embeddings_final = torch.nn.functional.max_pool1d(
                question_char_embeddings_tmp1,
                question_char_embeddings_tmp1.shape[2]).squeeze(2).reshape(
                    question_char_embeddings.shape[0],
                    question_char_embeddings.shape[1], -1)

            passage_final_embeddings = torch.cat(
                [passage_embeddings, passage_char_embeddings_final], dim=2)
            question_final_embeddings = torch.cat(
                [question_embeddings, question_char_embeddings_final], dim=2)

        # 2) Context2Query: Compute weighted sum of question embeddings for
        #        each passage word and concatenate with passage embeddings.
        aligned_scores = self.aligned_att(
            passage_final_embeddings, question_final_embeddings,
            ~question_mask)  # [batch_size, p_len, q_len]
        aligned_embeddings = aligned_scores.bmm(
            question_embeddings)  # [batch_size, p_len, q_dim]
        passage_embeddings = cuda(
            self.args,
            torch.cat((passage_embeddings, aligned_embeddings), 2),
        )  # [batch_size, p_len, p_dim + q_dim]

        # 3) Passage Encoder
        passage_hidden = self.sorted_rnn(
            passage_embeddings, passage_lengths,
            self.passage_rnn)  # [batch_size, p_len, p_hid]
        passage_hidden = self.dropout(
            passage_hidden)  # [batch_size, p_len, p_hid]

        # 4) Question Encoder: Encode question embeddings.
        question_hidden = self.sorted_rnn(
            question_embeddings, question_lengths,
            self.question_rnn)  # [batch_size, q_len, q_hid]

        # 5) Question Attentive Sum: Compute weighted sum of question hidden
        #        vectors.
        question_scores = self.question_att(question_hidden, ~question_mask)
        question_vector = question_scores.unsqueeze(1).bmm(
            question_hidden).squeeze(1)
        question_vector = self.dropout(question_vector)  # [batch_size, q_hid]

        # 6) Start Position Pointer: Compute logits for start positions
        start_logits = self.start_output(passage_hidden, question_vector,
                                         ~passage_mask)  # [batch_size, p_len]

        # 7) End Position Pointer: Compute logits for end positions
        end_logits = self.end_output(passage_hidden, question_vector,
                                     ~passage_mask)  # [batch_size, p_len]

        return start_logits, end_logits  # [batch_size, p_len], [batch_size, p_len]
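
For context on what the returned logits are used for: a hedged sketch of greedy span decoding (real evaluation code typically also enforces start <= end and a maximum answer length).

import torch

def greedy_span(start_logits, end_logits):
    # start_logits, end_logits: [batch_size, p_len] as returned by forward()
    start = torch.argmax(start_logits, dim=-1)   # [batch_size]
    end = torch.argmax(end_logits, dim=-1)       # [batch_size]
    return start, end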
from solver import Solver
from preprocess.tacotron.utils import spectrogram2wav
#from preprocess.tacotron.audio import inv_spectrogram, save_wav
from scipy.io.wavfile import write
from preprocess.tacotron.mcep import mc2wav

if __name__ == '__main__':
    feature = 'sp'
    hps = Hps()
    hps.load('./hps/v19.json')
    hps_tuple = hps.get_tuple()
    solver = Solver(hps_tuple, None)
    solver.load_model('/storage/model/voice_conversion/v19/model.pkl-59999')
    if feature == 'mc':
        # indexer to extract data
        indexer = Indexer()
        src_mc = indexer.index(speaker_id='225',
                               utt_id='366',
                               dset='test',
                               feature='norm_mc')
        tar_mc = indexer.index(speaker_id='226',
                               utt_id='366',
                               dset='test',
                               feature='norm_mc')
        expand_src_mc = np.expand_dims(src_mc, axis=0)
        expand_tar_mc = np.expand_dims(tar_mc, axis=0)
        src_mc_tensor = torch.from_numpy(expand_src_mc).type(torch.FloatTensor)
        tar_mc_tensor = torch.from_numpy(expand_tar_mc).type(torch.FloatTensor)
        c1 = Variable(torch.from_numpy(np.array([0]))).cuda()
        c2 = Variable(torch.from_numpy(np.array([1]))).cuda()
        results = [src_mc]
Example #15
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, Dropout
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import re
import json
import pandas as pd
from utils import get_train_data_from_csv, get_dev_data_from_csv, get_test_data_from_csv, Indexer, get_indexer
from nltk.tokenize import TweetTokenizer
from sklearn.metrics import classification_report

include_test = True

tknr = TweetTokenizer()
indexer = get_indexer('indexer_15_dups.csv')
word_indexer = Indexer()
word_indexer.add_and_get_index("UNK")

train_data = get_train_data_from_csv('data/train_15_ds.csv')[0:1000]
dev_data = get_dev_data_from_csv('data/dev_15_ds.csv')[:200]
test_data = get_test_data_from_csv('data/test_15_ds.csv')[0:200]

X_train = []
Y_train = []
X_dev = []
Y_dev = []
Y_dev_true = []
X_test = []
Y_test = []
Y_test_true = []
Example #16
 def __init__(self):
     self.indexer = Indexer()
Example #17
def generate_papers(datafile, feature_begin, feature_end, observation_begin,
                    observation_end, conf_list):
    logging.info('generating papers ...')

    # try:
    #     result = pickle.load(open('dblp/data/papers_%s.pkl' % path, 'rb'))
    #     return result
    # except IOError:
    #     pass

    indexer = Indexer(['author', 'paper', 'term', 'venue'])

    index, authors, title, year, venue = None, None, None, None, None
    references = []

    write = 0
    cite = 0
    include = 0
    published = 0

    min_year = 3000
    max_year = 0

    papers_feature_window = []
    papers_observation_window = []

    with open(datafile) as file:
        dataset = file.read().splitlines()

    for line in dataset:
        if not line:
            if year and venue:
                year = int(year)
                if year > 0 and authors and venue in conf_list:
                    min_year = min(min_year, year)
                    max_year = max(max_year, year)
                    authors = authors.split(',')
                    terms = parse_term(title)
                    write += len(authors)
                    cite += len(references)
                    include += len(terms)
                    published += 1

                    p = Paper(year)
                    if feature_begin < year <= feature_end:
                        p.id = indexer.index('paper', index)
                        p.terms = [
                            indexer.index('term', term) for term in terms
                        ]
                        p.references = [
                            indexer.index('paper', paper_id)
                            for paper_id in references
                        ]
                        p.authors = [
                            indexer.index('author', author_name)
                            for author_name in authors
                        ]
                        p.venue = indexer.index('venue', venue)
                        bisect.insort(papers_feature_window, p)
                    elif observation_begin < year <= observation_end:
                        p.references = references
                        p.authors = authors
                        papers_observation_window.append(p)

            index, authors, title, year, venue = None, None, None, None, None
            references = []
        else:
            begin = line[1]
            if begin == '*':
                title = line[2:]
            elif begin == '@':
                authors = line[2:]
            elif begin == 't':
                year = line[2:]
            elif begin == 'c':
                venue = line[2:]
            elif begin == 'i':
                index = line[6:]
            elif begin == '%':
                references.append(line[2:])

    for p in papers_observation_window:
        authors = []
        references = []
        for author in p.authors:
            author_id = indexer.get_index('author', author)
            if author_id is not None:
                authors.append(author_id)
        for ref in p.references:
            paper_id = indexer.get_index('paper', ref)
            if paper_id is not None:
                references.append(paper_id)
        p.authors = authors
        p.references = references

    with open('dblp/data/metadata_%s.txt' % path, 'w') as output:
        output.write('Nodes:\n')
        output.write('-----------------------------\n')
        output.write('#Authors: %d\n' % indexer.indices['author'])
        output.write('#Papers: %d\n' % indexer.indices['paper'])
        output.write('#Venues: %d\n' % indexer.indices['venue'])
        output.write('#Terms: %d\n\n' % indexer.indices['term'])
        output.write('\nEdges:\n')
        output.write('-----------------------------\n')
        output.write('#Write: %d\n' % write)
        output.write('#Cite: %d\n' % cite)
        output.write('#Publish: %d\n' % published)
        output.write('#Contain: %d\n' % include)
        output.write('\nTime Span:\n')
        output.write('-----------------------------\n')
        output.write('From: %s\n' % min_year)
        output.write('To: %s\n' % max_year)

    result = papers_feature_window, papers_observation_window, indexer.indices
    # pickle.dump(result, open('dblp/data/papers_%s.pkl' % path, 'wb'))
    return result
Example #18
def generate_indexer(user_rates_movies_ds, user_tags_movies_ds, movie_actor_ds,
                     movie_director_ds, movie_genre_ds, movie_countries_ds,
                     feature_begin, feature_end):
    logging.info('generating indexer ...')
    min_time = 1e30
    max_time = -1
    indexer = Indexer(
        ['user', 'tag', 'movie', 'actor', 'director', 'genre', 'country'])

    for line in user_rates_movies_ds[1:]:
        line_items = line.split('\t')
        rating_timestamp = float(line_items[3]) / 1000
        min_time = min(min_time, rating_timestamp)
        max_time = max(max_time, rating_timestamp)
        rating = float(line_items[2])
        if feature_begin < rating_timestamp <= feature_end and rating > rating_threshold:
            indexer.index('user', line_items[0])
            indexer.index('movie', line_items[1])

    for line in user_tags_movies_ds[1:]:
        line_items = line.split('\t')
        tag_timestamp = float(line_items[3]) / 1000
        if feature_begin < tag_timestamp <= feature_end:
            indexer.index('user', line_items[0])
            indexer.index('movie', line_items[1])
            indexer.index('tag', line_items[2])

    for line in movie_actor_ds[1:]:
        line_items = line.split('\t')
        ranking = int(line_items[3])
        if ranking < actor_threshold and line_items[0] in indexer.mapping[
                'movie']:
            # indexer.index('movie', line_items[0])
            indexer.index('actor', line_items[1])

    for line in movie_director_ds[1:]:
        line_items = line.split('\t')
        if line_items[0] in indexer.mapping['movie']:
            # indexer.index('movie', line_items[0])
            indexer.index('director', line_items[1])

    for line in movie_genre_ds[1:]:
        line_items = line.split('\t')
        if line_items[0] in indexer.mapping['movie']:
            # indexer.index('movie', line_items[0])
            indexer.index('genre', line_items[1])

    for line in movie_countries_ds[1:]:
        line_items = line.split('\t')
        if line_items[0] in indexer.mapping['movie']:
            # indexer.index('movie', line_items[0])
            indexer.index('country', line_items[1])

    with open('movielens/data/metadata.txt', 'w') as output:
        output.write('Nodes:\n')
        output.write('-----------------------------\n')
        output.write('#Users: %d\n' % indexer.indices['user'])
        output.write('#Tags: %d\n' % indexer.indices['tag'])
        output.write('#Movies: %d\n' % indexer.indices['movie'])
        output.write('#Actors: %d\n' % indexer.indices['actor'])
        output.write('#Director: %d\n' % indexer.indices['director'])
        output.write('#Genre: %d\n' % indexer.indices['genre'])
        output.write('#Countries: %d\n' % indexer.indices['country'])
        output.write('\nEdges:\n')
        output.write('-----------------------------\n')
        output.write('#Rate: %d\n' % len(user_rates_movies_ds))
        output.write('#Attach: %d\n' % len(user_tags_movies_ds))
        output.write('#Played_by: %d\n' % len(movie_actor_ds))
        output.write('#Directed_by : %d\n' % len(movie_director_ds))
        output.write('#Has: %d\n' % len(movie_genre_ds))
        output.write('#Produced_in: %d\n' % len(movie_countries_ds))
        output.write('\nTime Span:\n')
        output.write('-----------------------------\n')
        output.write('From: %s\n' % datetime.fromtimestamp(min_time))
        output.write('To: %s\n' % datetime.fromtimestamp(max_time))

    return indexer
Example #19
class CNN(nn.Module):
    def __init__(self, args, reduced_size=None, info={}):
        super(CNN, self).__init__()
        # disc_type=DISC_TYPE_MATRIX
        self.disc_type = disc_type = args.disc_type
        self.layer1 = nn.Sequential(
            nn.Conv2d(1, 4, kernel_size=2, padding=0),
            nn.ReLU())
        # 1,4,3,3
        self.layer2 = nn.Sequential(
            nn.Conv2d(4, 8, kernel_size=2),
            nn.ReLU())
        # 1,8,2,2
        ## but for 5 lines, it is 1,8,3,3
        if args.data_type == "sonnet_endings":
            self.scorer = nn.Linear(2 * 2 * 8, 1)
        elif args.data_type == "limerick":
            self.scorer = nn.Linear(3 * 3 * 8, 1)
        self.predictor = nn.Sigmoid()
        self.args = args
        self.use_cuda = args.use_cuda

        ##
        self.g_indexer = Indexer(args)
        self.g_indexer.load('tmp/tmp_' + args.g2p_model_name + '/solver_g_indexer')
        self.g2pmodel = Model(H=info['H'], args=args, i_size=self.g_indexer.w_cnt, o_size=self.g_indexer.w_cnt,
                              start_idx=self.g_indexer.w2idx[utils.START])
        if not args.learn_g2p_encoder_from_scratch:
            print("=====" * 7, "LOADING g2p ENCODER PRETRAINED")
            model_dir = 'tmp/tmp_' + args.g2p_model_name + '/'
            state_dict_best = torch.load(model_dir + 'model_best')
            self.g2pmodel.load_state_dict(state_dict_best)
        if not args.trainable_g2p:
            assert not args.learn_g2p_encoder_from_scratch
            for param in self.g2pmodel.parameters():
                param.requires_grad = False

    def display_params(self):
        print("=" * 44)
        print("[CNN]: model parametrs")
        for name, param in self.named_parameters():
            print("name=", name, " || grad:", param.requires_grad, "| size = ", param.size())
        print("=" * 44)

    def _compute_word_reps(self, words_str, deb=False):
        if deb:
            print("words_str = ", words_str)
        use_eow_marker = self.args.use_eow_in_enc
        assert not use_eow_marker, "Not yet tested"
        word_reps = [self.g_indexer.w_to_idx(s1) for s1 in words_str]
        if self.args.use_eow_in_enc:
            x_end = self.g_indexer.w2idx[utils.END]
            word_reps = [x_i + [x_end] for x_i in word_reps]
        word_reps = [self.g2pmodel.encode(w) for w in word_reps]
        return word_reps

    def _compute_pairwise_dot(self, measure_encodings_b):
        ret = []
        sz = len(measure_encodings_b)
        for measure_encodings_b_t in measure_encodings_b:
            for measure_encodings_b_t2 in measure_encodings_b:
                t1 = torch.sum(measure_encodings_b_t * measure_encodings_b_t2)
                t2 = torch.sqrt(torch.sum(measure_encodings_b_t * measure_encodings_b_t))
                t3 = torch.sqrt(torch.sum(measure_encodings_b_t2 * measure_encodings_b_t2))
                assert t2 > 0
                assert t3 > 0, "t3=" + str(t3)
                ret.append(t1 / (t2 * t3))
        ret = torch.stack(ret)
        ret = ret.view(sz, sz)
        return ret

    def _score_matrix(self, x, deb=False):
        x = x[0].unsqueeze(0).unsqueeze(0)  # -> 1,1,ms,ms
        if deb:
            print("---x.shape = ", x.size())
        out = self.layer1(x)
        if deb:
            print("---out = ", out.size(), out)
        out = self.layer2(out)
        if deb:
            print("---out = ", out.size(), out)
        out = out.view(out.size(0), -1)  # arrange by bsz
        score = self.scorer(out)
        if deb:
            print("---out sum = ", torch.sum(out))
            print("---score = ", score)
        prob = self.predictor(score)
        return {'prob': prob, 'out': out, 'score': score}

    def _compute_rhyming_matrix(self, words_str, deb=False):
        word_reps = self._compute_word_reps(words_str)
        rhyming_matrix = self._compute_pairwise_dot(word_reps)
        return rhyming_matrix, words_str

    def _compute_rnn_on_word_reps(self, word_reps):
        h = torch.zeros(1, self.linear_rep_H), torch.zeros(1, self.linear_rep_H)
        if self.use_cuda:
            h = h[0].cuda(), h[1].cuda()
        for w in word_reps:
            h = self.linear_rep_encoder(w, h)
        out, c = h
        return c

    def _run_discriminator(self, words_str, deb):
        rhyming_matrix, words_str = self._compute_rhyming_matrix(words_str, deb)
        vals = self._score_matrix([rhyming_matrix])
        vals.update({'rhyming_matrix': rhyming_matrix, 'linear_rep': None, 'words_str': words_str})
        return vals

    def update_discriminator(self, line_endings_gen, line_endings_train, deb=False, word_idx_to_str_dict=None):
        eps = 0.0000000001
        ret = {}
        dump_info = {}
        words_str_train = [word_idx_to_str_dict[word_idx.data.cpu().item()] for word_idx in line_endings_train]
        words_str_gen = [word_idx_to_str_dict[word_idx.data.cpu().item()] for word_idx in line_endings_gen]
        disc_real = self._run_discriminator(words_str_train, deb)
        if deb:
            print("rhyming_matrix_trai = ", disc_real['rhyming_matrix'], "|| prob = ", disc_real['prob'])
            if self.args.disc_type == DISC_TYPE_MATRIX:
                dump_info['rhyming_matrix_trai'] = disc_real['rhyming_matrix'].data.cpu().numpy()
            dump_info['real_prob'] = disc_real['prob'].data.cpu().item()
            dump_info['real_words_str'] = disc_real['words_str']
        disc_gen = self._run_discriminator(words_str_gen, deb)
        if deb:
            print("rhyming_matrix_gen = ", disc_gen['rhyming_matrix'], "|| prob = ", disc_gen['prob'])
            if self.args.disc_type == DISC_TYPE_MATRIX:
                dump_info['rhyming_matrix_gen'] = disc_gen['rhyming_matrix'].data.cpu().numpy()
            dump_info['gen_prob'] = disc_gen['prob'].data.cpu().item()
            dump_info['gen_words_str'] = disc_gen['words_str']
        prob_real = disc_real['prob']
        prob_gen = disc_gen['prob']
        loss = -torch.log(prob_real + eps) - torch.log(1.0 - prob_gen + eps)
        reward = prob_gen
        if self.args.use_score_as_reward:
            reward = disc_gen['score']
        ret.update({'loss': loss, 'reward': reward, 'dump_info': dump_info})
        return ret
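
The loss in update_discriminator is the usual binary cross-entropy discriminator objective; as a reference sketch (not the original code), the same quantity could be written with torch.nn.functional, up to the epsilon term added for numerical stability.

import torch
import torch.nn.functional as F

def discriminator_loss(prob_real, prob_gen):
    # Equivalent to -log(prob_real) - log(1 - prob_gen)
    real_term = F.binary_cross_entropy(prob_real, torch.ones_like(prob_real))
    fake_term = F.binary_cross_entropy(prob_gen, torch.zeros_like(prob_gen))
    return real_term + fake_term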
Example #20
 def __init__(self, list_file):
   self.logger = Logger.get_logger(utils.get_fullname(self))
   self.list_file = list_file
   self.indexer = Indexer()
   self._client = None
reload(sys)
sys.setdefaultencoding('utf-8')

app = Flask(__name__)
app.secret_key = b'_5#y2L"F4Qas5nb113@&B#(V!*#8z\n\xec]/'

db = Database()

recommender = Recommender()

if not db.checkConnectivity():
    print 'Unable to connect to database'
    sys.exit(-1)

indexer = Indexer()


@app.before_request
def authenticateUser():
    if request.endpoint != 'search' and request.endpoint != 'signIn' and 'userid' not in session:
        return redirect(url_for('signIn'))


@app.route('/signout')
def signOut():
    session.clear()
    return redirect(url_for('signIn'))


@app.route('/signin', methods=['GET', 'POST'])