Example n. 1
0
 def read(self):
     """Yield one parsed record per line of the source file.

     Each line is stripped and split on self.separator.  Without a
     title row the raw field list is yielded; with one, lines whose
     field count differs from the title are logged and skipped, and
     matching lines are yielded as {title: value} dicts.
     self.current_count tracks every line seen, including skipped ones.
     """
     for raw in self.file_handler:
         self.current_count += 1
         fields = raw.strip().split(self.separator)
         if not self.has_title:
             yield fields
             continue
         if len(fields) != self.title_len:
             Log.warn("Line count wrong: Title: [%s] Data: [%s]" % (','.join(self.title), ','.join(fields)))
             continue
         yield dict(zip(self.title, fields))
Example n. 2
0
 def import_data(self, db, files, file_dir, out_dir):
     '''
     Import data from the given source files into mongo database `db`.

     For each file: parse records via source.read(), resolve each
     record's target table, convert field types, and buffer rows per
     table, flushing to the DB every self.SAVE_LIMIT records.  Leftover
     buffered rows are flushed after each file, then the file is moved
     from file_dir to out_dir.
     '''
     Log.info("IMPORT-FILE prepare save %r to %s" %(files, db))
     # begin save datas to db
     for f in files:
         with get_source_file(f) as source:
             Log.info("IMPORT-FILE begin process file [%s]" % f)
             if source is None:
                 Log.warn("IMPORT-FILE no such file [%s]" % f)
                 continue
             for data in source.read():
                 # get table field for save
                 table, data = self.get_save_table(data)
                 if table is None:
                     Log.warn(
                         "File [%s] has no [%s] or [%s] field, data: %r" \
                         %(f, self.KEY_TABLE, self.KEY_TMP_TABLE, data)
                     )
                     continue
                 data = self.convert_data(data)
                 if data is None:
                     continue
                 # Look up the id first so a record missing KEY_ID fails
                 # loudly here instead of half-way through a batch save.
                 data_id = data[self.KEY_ID]
                 # buffer the record per table (dict.has_key is
                 # Python-2-only; setdefault works on 2 and 3)
                 self.data_to_save.setdefault(table, []).append(data)
                 # global counter
                 self.counter[table] = self.counter.get(table, 0) + 1
                 # up to self.SAVE_LIMIT, save data
                 if self.counter[table] % self.SAVE_LIMIT == 0:
                     save_result = self.save(db, table, self.data_to_save[table])
                     if save_result is not None:
                         self.show_save_result(db, table, save_result)
                     else:
                         # fixed: original called lowercase `log.warn`,
                         # an undefined name (NameError) on this branch
                         Log.warn("save result is None! file: [%s] db: [%s], table: [%s], data: %r" %(f, db, table, self.data_to_save[table]))
                     self.data_to_save[table] = []
         # save left datas to db
         if len(self.data_to_save) > 0:
             for (table, data) in self.data_to_save.items():
                 if len(data) == 0: continue
                 save_result = self.save(db, table, data)
                 if save_result is not None:
                     self.show_save_result(db, table, save_result)
                 self.data_to_save[table] = []
         self.mv_files(db, f, file_dir, out_dir)
Example n. 3
0
 def convert_data(self, data):
     '''
     Normalize a parsed record in place before saving.

     Compresses the self.KEY_CONTENT field (snappy -> BSON Binary) and
     coerces every field named in self.INT_FILED_LIST from string to
     int.  Returns the mutated dict, or None when data is None or an
     int conversion fails.
     '''
     # Guard first: the original dereferenced `data` before its None
     # check, and its warning used an undefined name `f` (NameError).
     if data is None:
         Log.warn("Data is none...")
         return None
     # get content and compress
     if self.KEY_CONTENT in data:
         unc_text = data[self.KEY_CONTENT].encode('utf-8')
         compressed = snappy.compress(unc_text)
         compressed = bson.binary.Binary(compressed)
         data[self.KEY_CONTENT] = compressed
     # convert int field from string to int
     for int_field in self.INT_FILED_LIST:
         try:
             if int_field in data:
                 # via float() so "3.0"-style numeric strings pass too
                 data[int_field] = int(float(data[int_field]))
         except (ValueError, TypeError):
             # narrowed from a bare `except:`, which also swallowed
             # KeyboardInterrupt / SystemExit
             Log.warn("Data field convert int failed, data: %r" % data)
             data = None
     return data