Example 1
 def _generate_samples(self):
     # Resume from the current file and offset, then cycle over all files
     # in self.tree endlessly, yielding one stripped line at a time.
     filename = self.tree[self.id_file][0]
     file_in = detect_archive_format_and_open(filename)
     seek_unicode(file_in, self.start_offset)
     while True:
         for line in file_in:
             yield line.strip()
         file_in.close()
         # Advance to the next file, wrapping around (and counting the
         # restart) once every file has been read.
         self.id_file += 1
         if self.id_file >= len(self.tree):
             self.id_file = 0
             self._cnt_restarts += 1
         file_in = detect_archive_format_and_open(self.tree[self.id_file][0])
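For context, a minimal sketch of the same endless-cycling pattern without the class state: `itertools.cycle` replaces the manual wrap-around, and plain `open` stands in for the archive-aware `detect_archive_format_and_open` helper. The file names are hypothetical.

 import itertools

 def cycle_lines(paths):
     # Endlessly cycle over the given files, yielding stripped lines;
     # mirrors the while True / wrap-around logic of Example 1.
     for path in itertools.cycle(paths):
         with open(path, encoding="utf-8") as file_in:
             for line in file_in:
                 yield line.strip()

 # The stream is infinite, so take a bounded slice when consuming it.
 samples = list(itertools.islice(cycle_lines(["a.txt", "b.txt"]), 5))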
Example 2
 def _generate_samples(self):
     # Single pass over the corpus: yield every non-empty stripped line.
     for filename in self.base_corpus:
         with detect_archive_format_and_open(filename) as file_in:
             for line in file_in:
                 line = line.strip()
                 if line:
                     yield line
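A hypothetical consumer of this single-pass generator; `SinglePassReader` is an assumed stand-in for the class the method belongs to, with plain `open` instead of the archive-aware helper.

 class SinglePassReader:
     # Assumed minimal wiring for the attribute the method expects.
     def __init__(self, paths):
         self.base_corpus = paths

     def _generate_samples(self):
         for filename in self.base_corpus:
             with open(filename, encoding="utf-8") as file_in:
                 for line in file_in:
                     line = line.strip()
                     if line:
                         yield line

 reader = SinglePassReader(["part1.txt", "part2.txt"])
 for sample in reader._generate_samples():
     print(sample)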
Example 3
 def _generate_samples(self):
     # Yield lines from the byte range between self.start and self.end,
     # each a (file index, byte offset) pair into self.tree.
     for i in range(self.start[0], self.end[0] + 1):
         filename = self.tree[i].filename
         with detect_archive_format_and_open(filename) as file_in:
             if i == self.start[0]:
                 # TODO: consider seeking to the beginning of the line
                 seek_unicode(file_in, self.start[1])
             # When start and end fall in the same file, byte counting
             # begins at the start offset we just seeked to, not at zero.
             cnt_bytes_read = self.start[1] if self.start[0] == self.end[0] else 0
             for line in file_in:
                 cnt_bytes_read += len(line)
                 line = line.strip()
                 # Stop only once the end offset of the last file is passed.
                 if i == self.end[0] and cnt_bytes_read > self.end[1]:
                     break
                 yield line
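The same range-restricted reading reduced to a single file, as a sketch; binary mode sidesteps `seek_unicode`, which the excerpt presumably uses to seek safely inside a decoded text stream.

 def lines_in_range(path, start_offset, end_offset):
     # Skip to start_offset, then yield lines until the running byte
     # count passes end_offset (same bookkeeping as Example 3).
     with open(path, "rb") as file_in:
         file_in.seek(start_offset)
         cnt_bytes_read = start_offset
         for line in file_in:
             cnt_bytes_read += len(line)
             if cnt_bytes_read > end_offset:
                 break
             yield line.decode("utf-8", errors="replace").strip()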
Example 4
 def load_from_text(self, path):
     # Load embeddings from word2vec-style text: an optional
     # "<cnt_words> <vec_size>" header line, then one word followed by
     # its vector per line. Requires os, warnings and numpy (as np).
     i = 0
     self.vocabulary = Vocabulary()
     rows = []
     header = False
     vec_size = -1
     with detect_archive_format_and_open(path) as file_in:
         for line_number, line in enumerate(file_in):
             tokens = line.split()
             if i == 0 and len(tokens) == 2:
                 # Header: word count (not verified, since malformed
                 # lines are skipped below) and vector dimensionality.
                 header = True
                 cnt_words = int(tokens[0])
                 vec_size = int(tokens[1])
                 continue
             word = tokens[0]
             str_vec = tokens[1:]
             if vec_size == -1:
                 # No header: infer dimensionality from the first row.
                 vec_size = len(str_vec)
             if vec_size != len(str_vec):
                 # Skip malformed rows before touching the vocabulary,
                 # so word ids stay aligned with matrix rows.
                 warning_message = "input error in line {}, expected tokens: {}, read tokens: {}, line: {}".format(
                     line_number, vec_size, len(str_vec), line)
                 warnings.warn(warning_message)
                 continue
             self.vocabulary.dic_words_ids[word] = i
             self.vocabulary.lst_words.append(word)
             rows.append(np.asarray(str_vec, dtype=np.float32))
             i += 1
     self.matrix = np.vstack(rows)
     if header:
         assert vec_size == self.matrix.shape[1]
     self.vocabulary.lst_frequencies = np.zeros(len(self.vocabulary.lst_words), dtype=np.int32)
     self.name = os.path.basename(os.path.dirname(os.path.normpath(path)))
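A hypothetical round trip showing the word2vec text format this loader expects; `Embeddings` is an assumed name for the class that owns `load_from_text`.

 with open("embeddings.txt", "w", encoding="utf-8") as f:
     # Optional "<cnt_words> <vec_size>" header, then one vector per line.
     f.write("2 3\n")
     f.write("cat 0.1 0.2 0.3\n")
     f.write("dog 0.4 0.5 0.6\n")

 emb = Embeddings()  # hypothetical owner class of load_from_text
 emb.load_from_text("embeddings.txt")
 print(emb.matrix.shape)          # -> (2, 3)
 print(emb.vocabulary.lst_words)  # -> ['cat', 'dog']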