Example #1
    def write_csv(self, path=None, mode='a'):
        """ Write the collected books information to a given CSV file
            Append if the file already exists

        Parameters
        ----------
        path : str (default is the category_name)
            The path including the file name (without the extension to the csv)
        mode : str (default is 'a')
            The file mode used to open the file (r,r+,w,w+,a,a+,x,x+)
        """

        if self.books == []:
            self.collect()

        if path is None:
            path = self.name.lower().replace(' ', '_')

        fields = self.books[0].get_headers()
        headers = {field: field for field in fields}

        if not os.path.exists(f'{path}.csv') or (mode != 'a' and mode != 'a+'):
            FileIO.write(path, fields, headers, mode)

        for book in self.books:
            FileIO.write(path, fields, book.to_dict(), 'a')
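
A minimal usage sketch for context, assuming the Category class shown in the other examples (the URL and file name below are illustrative, not part of the original code):

url = 'http://books.toscrape.com/catalogue/category/books/travel_2/index.html'
category = Category(url)
category.collect()            # scrape the category page and its books
category.write_csv('travel')  # creates or appends to travel.csv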
Example #2
    def __scrap_books(self):
        books = []

        FileIO.open_category(self.name)

        for link in self.links:

            progress_monitor.catbooks_update(
                    len(books),
                    self.num_books,
                    link[1])

            book = Book(link[0])
            book.collect()
            books.append(book)

            if self.dl_image:
                book.save_image()

        progress_monitor.catbooks_update(
                    len(books),
                    self.num_books,
                    link[1])

        FileIO.close_category()

        return books
Example #3
 def test_open_category(self):
     catname = 'testcat'
     FileIO.open_category(catname)
     assert getcwd() == urljoin(self.cwd+'/', catname)
     chdir('..')
     assert path.exists(catname) is True
     rmdir(catname)
Example #4
 def test_init_root(self):
     dirname = 'testinit'
     assert getcwd() == self.cwd
     FileIO.init_root(dirname, False)
     assert getcwd() == urljoin(self.cwd+'/', dirname)
     chdir('..')
     assert path.exists(dirname) is True
     rmdir(dirname)
Example #5
 def test_close_category(self):
     dirname = "testclose"
     assert getcwd() == self.cwd
     mkdir(dirname)
     chdir(dirname)
     FileIO.close_category()
     assert getcwd() == self.cwd
     rmdir(dirname)
Example #6
  def scan(self, ctx, prev_num):
    self.compute_stats()
    #
    # Check if we have encountered this file during this scan already
    #
    ctx.num_visited_files_reporter.increment(1)
    ctx.current_scanned_file_reporter.set(self.path())

    if self.scan_hlink(ctx):
      logging.info("File %s: HLINK" % self.path())
      return

    #
    # Check if the file is the same as in one of the upper levels
    #
    if self.scan_prev(ctx, prev_num):
      logging.debug("File %s: PREV" % self.path())
      ctx.num_prev_files_reporter.increment(1)
      return
    
    # --- File not yet in database, process it
    file_size = 0
    packer = PackerStream.PackerOStream(self.backup, Container.CODE_DATA)
    handle = open(self.path(), "rb")
    for data in FileIO.read_blocks(handle, self.backup.get_block_size()):
      packer.write(data)
      file_size += len(data)
      ctx.num_total_blocks_reporter.increment(1)
      ctx.size_total_blocks_reporter.increment(len(data))
      ctx.update_scan_status()
    handle.close()
      
    self.digest = packer.get_digest()
    self.level = packer.get_level()
    self.update_hlink(ctx)

    logging.info("Scanned file %s size:%d new_blocks:%d new_blocks_size:%d" %
        (self.path(), file_size, packer.get_num_new_blocks(),
          packer.get_size_new_blocks()))

    ctx.num_scanned_files_reporter.increment(1)
    if packer.get_num_new_blocks() != 0:
      ctx.num_new_blocks_reporter.increment(packer.get_num_new_blocks())
      ctx.size_new_blocks_reporter.increment(packer.get_size_new_blocks())
      ctx.num_changed_files_reporter.increment(1)
      ctx.changed_files_reporter.append(self.path())

    if file_size > 256 * 1024:
      logging.debug("File %s is big enough to register in cndb" %
          self.path())
      cndb = self.backup.get_completed_nodes_db()
      assert self.stats is not None
      path_digest = Digest.dataDigest(self.path().encode('utf8'))
      encoded = (self.digest +
          IntegerEncodings.binary_encode_int_varlen(self.level) +
          IntegerEncodings.binary_encode_int_varlen(self.get_type()) +
          serialize_stats(self.get_stats()))

      if path_digest not in cndb or cndb[path_digest] != encoded:
        cndb[path_digest] = encoded
Example #7
  def restore(self, ctx):
    """
    Recreate the data from the information stored in the
    backup
    """
    
    logging.info("Restoring " + self.path())
    #
    # Check if the file has already been processed
    # during this pass
    #
    if self.restore_hlink(ctx):
      return

    #
    # No, this file is new. Create it.
    #
    packer = PackerStream.PackerIStream(self.backup, self.digest,
      self.level)
    file = open(self.path(), "wb")
    for data in FileIO.read_blocks(packer, Digest.dataDigestSize()):
      #print "File", self.path(), "reading digest",
      #    base64.b64encode(digest)
      file.write(data)
    file.close()
    
    self.restore_stats()
Example #8
    def collect(self):
        """ Connect to the home-page and grab the information """

        self._soup = FileIO.connect_with_bs4(self.site_url)

        self.num_books = self.__scrap_num_books()
        self.links = self.__scrap_links()
        self.categories = self.__scrap_categories()
Example #9
    def collect(self):
        """ Connect to the category page and grab the information """

        self._soup = FileIO.connect_with_bs4(self.category_url)

        self.name = self.__scrap_name()
        self.num_books = self.__scrap_num_books()
        self.links = self.__scrap_links()
        self.books = self.__scrap_books()
Example #10
 def retrieve(self, stream):
   """
   Recreate the data from the information stored in the backup into the given
   stream
   """
   logging.info("Retrieving file " + self.path())
   packer = PackerStream.PackerIStream(self.backup, self.digest,
       self.level)
   for data in FileIO.read_blocks(packer, Digest.dataDigestSize()):
     stream.write(data)
Example #11
 def test(self, ctx):
   """
   Test that loading the data from the storages is successful
   """
   logging.info("Testing " + self.path())
   packer = PackerStream.PackerIStream(self.backup, self.digest,
     self.level)
   for data in FileIO.read_blocks(packer, Digest.dataDigestSize()):
     # Do nothing with the data, just make sure it got loaded
     pass
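
Examples #6, #7, #10, and #11 all iterate over FileIO.read_blocks(source, block_size). A minimal sketch of such a generator, assuming it simply yields fixed-size chunks from a file-like object (an illustration, not the project's actual implementation):

def read_blocks(handle, block_size):
    """Yield successive chunks of at most block_size bytes from a file-like object."""
    while True:
        block = handle.read(block_size)
        if not block:
            break
        yield block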
Example #12
def readfile():
    logger.info("Hitting URL %s", request.url)

    filePath = request.args.get("file_path")
    error, response = FileIO.readFile(filePath)

    if error:
        response = None

    logger.debug("Error: %s", error)
    logger.debug("Response: %s", response)
    return jsonify(response=response, error=error)
Example #13
    def collect(self):
        """ Connect to the product page and grab the information """

        self._soup = FileIO.connect_with_bs4(self.product_page_url)

        self.universal_product_code = self.__scrap_upc()
        self.title = self.__scrap_title()
        self.price_including_tax = self.__scrap_price_inc_tax()
        self.price_excluding_tax = self.__scrap_price_exc_tax()
        self.number_available = self.__scrap_number_available()
        self.product_description = self.__scrap_product_description()
        self.category = self.__scrap_category()
        self.review_rating = self.__scrap_review_rating()
        self.image_url = self.__scrap_image_url()
Example #14
    def __scrap_links(self):
        def get_links(soup): return soup.select('section a[title]')

        try:
            links = get_links(self._soup)

            page = 2
            while len(links) < self.num_books:
                base = urljoin(self.category_url, 'page-{}.html'.format(page))
                soup = FileIO.connect_with_bs4(base)
                links.extend(get_links(soup))
                page += 1

            return [(urljoin(self.category_url, x.attrs['href']),
                     x.attrs['title']) for x in links]
        except Exception:
            raise(Exception(f"Can't find the Book links ::\
                    \n{self.product_page_url}"))
Example #15
    def __scrap_categories(self, to_csv=False):

        FileIO.init_root('data', False)
        categories = []

        progress_monitor.allbooks_init(self.num_books, self.site_url)

        for link in self.links:

            progress_monitor.category_update(
                    len(categories),
                    len(self.links),
                    link[1])

            category = Category(link[0])
            categories.append(category)

            FileIO.open_category(category.name)
            category.write_csv()
            FileIO.close_category()

        return categories
Example #16
 def save_image(self):
     """ Copy the remote image in the current local directory """
     self.image_local = self.__get_image_name()
     FileIO.download_image(self.image_url, self.image_local)
Example #17
def test_connect_with_bs4_ERROR():

    with pytest.raises(Exception):
        FileIO.connect_with_bs4('http://www.xxxfakexxx.xxx')
Example #18
def dataset_read(source,
                 target,
                 batch_size,
                 is_resize=False,
                 leave_one_num=-1,
                 dataset='NW',
                 sensor_num=0):
    S_train = {}
    S_val = {}
    S_test = {}
    T_train = {}
    T_val = {}
    T_test = {}

    if 'NW' == dataset:
        x_s_train, y_s_train, x_s_val, y_s_val, x_s_test, y_s_test, \
        x_t_train, y_t_train, x_t_val, y_t_val, x_t_test, y_t_test = \
        FileIO.load_st_AB_mat(data_path = 'data/AB_dataset/AB_', X_dim = 4,
                              is_resize = is_resize,
                              leave_one_num = leave_one_num,
                              sensor_num = sensor_num)
    elif 'UCI' == dataset:
        x_s_train, y_s_train, x_s_val, y_s_val, x_s_test, y_s_test, \
        x_t_train, y_t_train, x_t_val, y_t_val, x_t_test, y_t_test = \
        FileIO.load_UCI_mat(data_path = 'data/1_dataset_UCI_DSADS/Features/',
                        feature_length = 6*45, X_dim = 4,
                        is_resize = is_resize, leave_one_num = leave_one_num,
                        sensor_num = sensor_num)

    S_train['imgs'] = x_s_train
    S_train['labels'] = y_s_train
    T_train['imgs'] = x_t_train
    T_train['labels'] = y_t_train

    # input target samples for both
    S_val['imgs'] = x_s_val
    S_val['labels'] = y_s_val
    T_val['imgs'] = x_t_val
    T_val['labels'] = y_t_val

    S_test['imgs'] = x_s_test
    S_test['labels'] = y_s_test

    T_test['imgs'] = x_t_test
    T_test['labels'] = y_t_test

    train_loader = UnalignedDataLoader()
    train_loader.initialize(S_train, T_train, batch_size, batch_size)
    # train_loader.initialize(T_train, S_train, batch_size, batch_size)
    data_train = train_loader.load_data()

    test_loader = UnalignedDataLoader()
    test_loader.initialize(S_val, T_val, batch_size, batch_size)
    # test_loader.initialize(T_val, S_val, batch_size, batch_size)
    data_val = test_loader.load_data()

    final_test_loader = UnalignedDataLoader()
    final_test_loader.initialize(S_test, T_test, batch_size, batch_size)
    # final_test_loader.initialize(T_test, S_test, batch_size, batch_size)
    data_test = final_test_loader.load_data()
    print('Target test shape: {}'.format(T_test['labels'].shape))
    return data_train, data_val, data_test
Example #19
# -*- coding: utf-8 -*-
"""
Created on Mon May 20 10:45:51 2019

@author: kuangen
"""

from utils import FileIO
from utils import utils
import mat4py as m4p
import numpy as np
from numpy import genfromtxt
from sklearn.model_selection import train_test_split
#%% Northwestern dataset
idx_x = np.arange(0,368)
FileIO.save_mat('0_dataset/AB_156_to_186_walking.mat', is_walking = True)

#%%
x_s_train, y_s_train, x_s_val, y_s_val, x_s_test, y_s_test, \
x_t_train, y_t_train, x_t_val, y_t_val, x_t_test, y_t_test = \
FileIO.load_st_AB_mat(data_path = 'data/AB_dataset/AB_', is_resize = True, 
                      leave_one_num = 1)
#%% UCI DSADS dataset
# read data:[label,subjects,segments, time, sensors]
x_mat, y_mat = FileIO.read_UCI_DSADS()
FileIO.save_UCI_DSADS(x_mat, y_mat, file_path = 'data/1_dataset_UCI_DSADS/Raw/')
#%% extract features and output data
x_mat = utils.extract_UCI_features(x_mat)
FileIO.save_UCI_DSADS(x_mat, y_mat, file_path = 'data/1_dataset_UCI_DSADS/Features/')
#%% load UCI data
x_s_train, y_s_train, x_s_val, y_s_val, x_s_test, y_s_test, \
Example #20
def test_connect_with_bs4_TYPE():
    url = 'http://books.toscrape.com'
    assert type(FileIO.connect_with_bs4(url)) == BeautifulSoup
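
The two connect_with_bs4 tests (Examples #17 and #20) pin down the helper's contract: it returns a BeautifulSoup object for a reachable URL and raises for an unreachable one. A minimal sketch of such a helper, assuming it wraps requests and bs4 (an illustration, not the project's actual implementation):

import requests
from bs4 import BeautifulSoup

def connect_with_bs4(url):
    """Fetch a page and return its parsed BeautifulSoup tree; raise on any failure."""
    response = requests.get(url, timeout=10)
    response.raise_for_status()  # propagate HTTP errors as exceptions
    return BeautifulSoup(response.text, 'html.parser')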
Example #21
 def load(self, filepath):
     self._Q = FileIO.read_pkl(filepath)
Example #22
def predict():
    from utils import FileIO, Utils
    from word_process import WordProcess

    # Path to the data txt file on disk.
    path_base = '../data/'
    path_file = path_base + 'bytecup.corpus.train.0.50k.txt'
    fio = FileIO()
    word = WordProcess(path_base, is_model_load=False, is_dict_load=True)

    contents, titles = fio.load_from_json(path_file)

    total_size = len(titles)
    num_samples = int(total_size * 0.8)
    num_test = total_size - num_samples
    print('num samples:', num_samples, 'num tests:', num_test)

    max_encoder_seq_length = int(max([len(txt) for txt in contents])) + 2
    max_decoder_seq_length = max([len(txt) for txt in titles]) + 2
    print('max_lengths:', max_encoder_seq_length, '  ', max_decoder_seq_length)

    train_data = {
        'contents': contents[0:num_samples],
        'titles': titles[0:num_samples]
    }
    test_data = {
        'contents': contents[num_samples:total_size],
        'titles': titles[num_samples:total_size]
    }
    datasets = {
        'train':
        TextData2(train_data,
                  word.dic,
                  train_len=max_encoder_seq_length,
                  label_len=max_decoder_seq_length),
        'val':
        TextData2(test_data,
                  word.dic,
                  train_len=max_encoder_seq_length,
                  label_len=max_decoder_seq_length)
    }
    data_loads = {
        x: DataLoader(datasets[x],
                      batch_size=batch_size,
                      shuffle=True,
                      num_workers=15)
        for x in ['train', 'val']
    }

    encoder = Encoder3(voca_size=84031, embedd_size=128, hidden_size=256)
    decoder = AttnDecoder3(hidden_size=256, vocab_size=84031)
    if use_cuda:
        encoder.cuda()
        decoder.cuda()
    best_model = torch.load(path_base + './50k.1.best_model_wts')
    best_model = Utils().gpu_model_to_cpu_model(best_model)

    encoder.load_state_dict(best_model[0])
    decoder.load_state_dict(best_model[1])
    out = evaluate(encoder, decoder, datasets)

    file1 = open(path_base + '50k.1.predict', 'a')
    for i, o in enumerate(out):
        file1.write(str([word.dic[int(t)] for t in o.data[0]]))
        file1.write(str(test_data['titles'][i]) + '\n')
    file1.close()
    print('predict done!')
Example #23
            encoder_input_data[
                i, 0:len(content)] = content_vec[0:max_encoder_seq_length]
            decoder_input_data[i, 0:len(title)] = title_vec
            decoder_target_data[i, 0:len(title) - 1] = title_vec[1:len(title)]

        yield ([encoder_input_data, decoder_input_data], decoder_target_data)


epochs = 50  # Number of epochs to train for.
latent_dim = 256  # Latent dimensionality of the encoding space.
GPUs = 2
num_encoder_tokens = 128
# Path to the data txt file on disk.
path_base = '../data1/'

# Vectorize the data.
fio = FileIO()
word = WordProcess(path_base, is_model_load=True)
wv = word.wv
contents = fio.list_read(path_base + 'bytecup.corpus.train.0.contents.txt',
                         is_flatten=True,
                         is_return=True)
titles = fio.list_read(path_base + 'bytecup.corpus.train.0.titles.txt',
                       is_flatten=False,
                       is_return=True)

total_size = len(titles)
num_samples = int(total_size * 0.8)
num_test = total_size - num_samples
train_data = [contents[0:num_samples], titles[0:num_samples]]
test_data = [contents[num_samples:total_size], titles[num_samples:total_size]]
Example #24
        cat1.write_csv('cat1')
        cat1.write_csv('cat1')
        progress_monitor.complete()

    elif(args.slide == 3):
        # play with Scraper class
        print("This runs the whole website scraping")
        print("You can check the generated files in demo/slide3")

        move_to_path('demo/slide3')

        site_url = 'http://books.toscrape.com'
        site = Scraper(site_url)
        progress_monitor.complete()

    elif(args.slide == 4):
        # play with FileIO class
        print("This scrape an image")
        print("You can check the generated files in demo/slide4")

        move_to_path('demo/slide4')

        image_url = 'http://books.toscrape.com/media/cache/a3/9e/a39e7c5c9fc61c2ae0f81116aa8cbb0e.jpg'
        FileIO.download_image(image_url, 'demo.jpg')

    else:
        # Scrape the website
        site_url = 'http://books.toscrape.com'
        site = Scraper(site_url)
        progress_monitor.complete()
Example #25
 def test_download_image(self):
     url = "http://books.toscrape.com/media/cache/c0/59/c05972805aa7201171b8fc71a5b00292.jpg"
     name = "testdownload.jpg"
     FileIO.download_image(url, name)
     assert path.exists(name)
     remove(name)
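
The test only checks download_image(url, name) through its side effect: after the call, a file with the given name exists locally. A plausible sketch under that assumption, using requests to stream the image (illustrative only, not the project's code):

import requests

def download_image(url, filename):
    """Download a remote image and write it to filename in the current directory."""
    response = requests.get(url, stream=True, timeout=10)
    response.raise_for_status()
    with open(filename, 'wb') as image_file:
        for chunk in response.iter_content(chunk_size=8192):
            image_file.write(chunk)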
Example #26
 def test_write(self):
     filepath = 'testwrite'
     FileIO.write(filepath, ['a'], {'a': 'hello'}, 'w')
     assert path.exists(f"{filepath}.csv") is True
     remove(f"{filepath}.csv")
Example #27
 def load(self, filepath):
     self._Q1 = FileIO.read_pkl(filepath)
     self._Q2 = np.copy(self._Q1)
Example #28
def traditional_har(dataset='UCI'):
    if 'UCI' == dataset:
        sub_num = 8
        class_num = 19
        feature_length = 6
        sensor_num = 45
    elif 'NW' == dataset:
        sub_num = 10
        class_num = 7
    acc_s_LDA = np.zeros(sub_num)
    acc_t_LDA = np.zeros(sub_num)
    acc_s_SVM = np.zeros(sub_num)
    acc_t_SVM = np.zeros(sub_num)
    acc_s_ANN = np.zeros(sub_num)
    acc_t_ANN = np.zeros(sub_num)
    for i in range(sub_num):
        # load UCI dataset
        if 'UCI' == dataset:
            x_s_train, y_s_train, x_s_val, y_s_val, x_s_test, y_s_test, \
            x_t_train, y_t_train, x_t_val, y_t_val, x_t_test, y_t_test = \
            FileIO.load_UCI_mat(data_path = 'data/1_dataset_UCI_DSADS/Features/',
                            feature_length = feature_length*45, X_dim = 2,
                            leave_one_num = i)
            x_s_train = x_s_train[:, 0:feature_length * sensor_num]
            x_s_val = x_s_val[:, 0:feature_length * sensor_num]
            x_s_test = x_s_test[:, 0:feature_length * sensor_num]
            x_t_train = x_t_train[:, 0:feature_length * sensor_num]
            x_t_val = x_t_val[:, 0:feature_length * sensor_num]
            x_t_test = x_t_test[:, 0:feature_length * sensor_num]

        # load NW dataset
        elif 'NW' == dataset:
            x_s_train, y_s_train, x_s_val, y_s_val, x_s_test, y_s_test, \
            x_t_train, y_t_train, x_t_val, y_t_val, x_t_test, y_t_test = \
            FileIO.load_st_AB_mat(data_path = 'data/AB_dataset/AB_', X_dim = 2,
            leave_one_num = i)

        # print(y_s_train.shape[0] + y_s_val.shape[0] + y_s_test.shape[0],
        #       y_t_train.shape[0] + y_t_val.shape[0] + y_t_test.shape[0])
        # LDA, no domain adaptation
        clf = LDA()
        clf.fit(x_s_train, y_s_train)
        y_s_test_pred = clf.predict(x_s_test)

        start = time.perf_counter()
        for j in range(8):  # use j so the outer subject index i is not clobbered
            out_prediction = clf.predict(x_s_test[[j]])
        end = time.perf_counter()
        print('LDA: forward time for each segment:%.30f' %
              ((end - start) / 8.))

        acc = accuracy_score(y_s_test, y_s_test_pred)
        print("LDA: source domain accuracy: %.2f%%" % acc)
        acc_s_LDA[i] = acc

        y_t_test_pred = clf.predict(x_t_test)
        acc = accuracy_score(y_t_test, y_t_test_pred)
        print("LDA: target domain accuracy: %.2f%%" % (acc))
        acc_t_LDA[i] = acc

        # SVM, no domain adaptation
        clf = svm.LinearSVC(max_iter=5000)
        clf.fit(x_s_train, y_s_train)
        y_s_test_pred = clf.predict(x_s_test)

        start = time.perf_counter()
        for j in range(8):  # j avoids clobbering the outer loop index i
            out_prediction = clf.predict(x_s_test[[j]])
        end = time.perf_counter()
        print('SVM: forward time for each segment:%.30f' %
              ((end - start) / 8.))

        acc = accuracy_score(y_s_test, y_s_test_pred)
        print("SVM: source domain accuracy: %.2f%%" % acc)
        acc_s_SVM[i] = acc

        y_t_test_pred = clf.predict(x_t_test)
        acc = accuracy_score(y_t_test, y_t_test_pred)
        print("SVM: target domain accuracy: %.2f%%" % (acc))
        acc_t_SVM[i] = acc

        #%% ANN, no domain adaptation

        # load UCI dataset
        if 'UCI' == dataset:
            x_s_train, y_s_train, x_s_val, y_s_val, x_s_test, y_s_test, \
            x_t_train, y_t_train, x_t_val, y_t_val, x_t_test, y_t_test = \
            FileIO.load_UCI_mat(data_path = 'data/1_dataset_UCI_DSADS/Features/',
                                is_one_hot = True, is_normalized = True,
                            feature_length = feature_length*45,
                            X_dim = 2, leave_one_num = i)

        # load NW dataset
        if 'NW' == dataset:
            x_s_train, y_s_train, x_s_val, y_s_val, x_s_test, y_s_test, \
            x_t_train, y_t_train, x_t_val, y_t_val, x_t_test, y_t_test = \
            FileIO.load_st_AB_mat(data_path = 'data/AB_dataset/AB_', X_dim = 2,
                                  is_one_hot = True, is_normalized = True,
                                  leave_one_num = i)

        clf = MLPClassifier(solver='sgd',
                            activation='tanh',
                            learning_rate='adaptive',
                            learning_rate_init=0.1,
                            hidden_layer_sizes=(10, class_num),
                            max_iter=2000)
        clf.fit(x_s_train, y_s_train)
        y_s_test_pred = clf.predict(x_s_test)
        acc = accuracy_score(y_s_test, y_s_test_pred)

        start = time.perf_counter()
        for j in range(8):  # j avoids clobbering the outer loop index i
            out_prediction = clf.predict(x_s_test[[j]])
        end = time.perf_counter()
        print('ANN: forward time for each segment:%.30f' %
              ((end - start) / 8.))

        print("ANN: source domain accuracy: %.2f%%" % acc)
        acc_s_ANN[i] = acc

        y_t_test_pred = clf.predict(x_t_test)
        acc = accuracy_score(y_t_test, y_t_test_pred)
        print("ANN: target domain accuracy: %.2f%%" % (acc))
        acc_t_ANN[i] = acc

    print('LDA: mean of test acc in the source domain:', np.mean(acc_s_LDA))
    print('LDA: mean of test acc in the target domain:', np.mean(acc_t_LDA))
    print('SVM: mean of test acc in the source domain:', np.mean(acc_s_SVM))
    print('SVM: mean of test acc in the target domain:', np.mean(acc_t_SVM))
    print('ANN: mean of test acc in the source domain:', np.mean(acc_s_ANN))
    print('ANN: mean of test acc in the target domain:', np.mean(acc_t_ANN))

    return np.transpose(np.c_[acc_s_LDA, acc_t_LDA, acc_s_SVM, acc_t_SVM,
                              acc_s_ANN, acc_t_ANN])
Example #29
 def load(self, filepath):
     self._Q = FileIO.read_pkl(filepath)
     self._pi.update_Q(self._Q)
     self._b.update_Q(self._Q)
Example #30
def main():
    from utils import FileIO
    from word_process import WordProcess
    # Path to the data txt file on disk.
    path_base = '../data/'
    path_file = path_base + 'bytecup.corpus.train.0.50k.txt'
    fio = FileIO()
    word = WordProcess(path_base, is_model_load=False, is_dict_load=True)
    dic = word.dic
    contents, titles = fio.load_from_json(path_file)

    total_size = len(titles)
    num_samples = int(total_size * 0.8)
    num_test = total_size - num_samples
    print('num samples:', num_samples, 'num tests:', num_test)

    max_encoder_seq_length = int(max([len(txt) for txt in contents])) + 2
    max_decoder_seq_length = max([len(txt) for txt in titles]) + 2
    print('max_lengths:', max_encoder_seq_length, '  ', max_decoder_seq_length)

    train_data = {
        'contents': contents[0:num_samples],
        'titles': titles[0:num_samples]
    }
    test_data = {
        'contents': contents[num_samples:total_size],
        'titles': titles[num_samples:total_size]
    }
    datasets = {
        'train':
        TextData2(train_data,
                  dic,
                  train_len=max_encoder_seq_length,
                  label_len=max_decoder_seq_length),
        'val':
        TextData2(test_data,
                  dic,
                  train_len=max_encoder_seq_length,
                  label_len=max_decoder_seq_length)
    }
    data_loads = {
        x: DataLoader(datasets[x],
                      batch_size=batch_size,
                      shuffle=True,
                      num_workers=15)
        for x in ['train', 'val']
    }

    encoder = Encoder3(voca_size=84031, embedd_size=128, hidden_size=256)
    decoder = AttnDecoder3(hidden_size=256, vocab_size=84031)

    optimizer = optim.SGD([{
        'params': encoder.parameters(),
        'lr': 0.01
    }, {
        'params': decoder.parameters(),
        'lr': 0.01
    }],
                          lr=0.01,
                          momentum=0.9)

    lambda1 = lambda epoch: epoch // 30
    lambda2 = lambda epoch: 0.95**epoch
    scheduler = optim.lr_scheduler.LambdaLR(optimizer,
                                            lr_lambda=[lambda2, lambda2])
    criterion = {'loss': nn.CosineSimilarity(dim=2), 'acc': nn.MSELoss()}
    loss_history = HistoryLoss()
    train_model(encoder, decoder, data_loads, criterion, scheduler,
                loss_history)
Example #31
 def save(self, filepath):
     FileIO.dump_pkl(self._Q1, filepath)
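
The Q-table examples (#21, #27, #29, and #31) only assume that read_pkl and dump_pkl round-trip an object through a pickle file. A minimal sketch under that assumption (hypothetical, not the project's actual helpers):

import pickle

def dump_pkl(obj, filepath):
    """Serialize obj to filepath with pickle."""
    with open(filepath, 'wb') as pkl_file:
        pickle.dump(obj, pkl_file)

def read_pkl(filepath):
    """Load and return a pickled object from filepath."""
    with open(filepath, 'rb') as pkl_file:
        return pickle.load(pkl_file)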