def read(self):
    """Iterate over ``self.file_handler`` and yield one parsed record per line.

    Each line is stripped of surrounding whitespace and split on
    ``self.separator``; ``self.current_count`` is incremented for every line
    read, including lines that end up being skipped.

    When ``self.has_title`` is true, a line whose field count differs from
    ``self.title_len`` is logged and skipped; otherwise the fields are paired
    with ``self.title`` and yielded as a dict. When ``self.has_title`` is
    false the raw list of fields is yielded.
    """
    for raw_line in self.file_handler:
        self.current_count += 1
        fields = raw_line.strip().split(self.separator)

        # Header-less sources: hand back the split fields untouched.
        if not self.has_title:
            yield fields
            continue

        # A row that does not line up with the header cannot be mapped.
        if self.title_len != len(fields):
            Log.warn("Line count wrong: Title: [%s] Data: [%s]" % (','.join(self.title), ','.join(fields)))
            continue

        yield dict(zip(self.title, fields))
def import_data(self, db, files, file_dir, out_dir):
    """Import the records of every file in ``files`` into ``db`` (mongo).

    For each file, records produced by ``source.read()`` are routed to a
    table via ``self.get_save_table``, normalised with ``self.convert_data``
    and buffered per table in ``self.data_to_save``. Whenever a table's
    running total in ``self.counter`` reaches a multiple of
    ``self.SAVE_LIMIT`` the buffered batch is written with ``self.save``.
    Remaining buffered records are flushed once the file has been read, and
    the file is then handed to ``self.mv_files``.

    Args:
        db: Target database, passed through to ``save``/``show_save_result``.
        files: Iterable of source files to import.
        file_dir: Passed through to ``self.mv_files``.
        out_dir: Passed through to ``self.mv_files``.

    Raises:
        KeyError: If a converted record has no ``self.KEY_ID`` field.

    NOTE(review): the original indentation of this method was lost; the
    leftover flush and ``mv_files`` are treated as per-file steps because
    ``mv_files`` takes the current file ``f`` -- confirm against history.
    """
    Log.info("IMPORT-FILE prepare save %r to %s" % (files, db))

    for f in files:
        with get_source_file(f) as source:
            Log.info("IMPORT-FILE begin process file [%s]" % f)
            if source is None:
                Log.warn("IMPORT-FILE no such file [%s]" % f)
                continue

            for data in source.read():
                # Work out which table this record belongs to.
                table, data = self.get_save_table(data)
                if table is None:
                    Log.warn(
                        "File [%s] has no [%s] or [%s] field, data: %r"
                        % (f, self.KEY_TABLE, self.KEY_TMP_TABLE, data)
                    )
                    continue

                data = self.convert_data(data)
                if data is None:
                    continue

                # The id itself is not used here, but the lookup is kept so a
                # record without KEY_ID still raises KeyError as before.
                data[self.KEY_ID]

                self.data_to_save.setdefault(table, []).append(data)
                # Running per-table total; it is never reset here, so it
                # keeps counting across files.
                self.counter[table] = self.counter.get(table, 0) + 1

                # Every SAVE_LIMIT records, write the buffered batch.
                if self.counter[table] % self.SAVE_LIMIT == 0:
                    batch = self.data_to_save[table]
                    save_result = self.save(db, table, batch)
                    if save_result is not None:
                        self.show_save_result(db, table, save_result)
                    else:
                        # Was `log.warn` (lower case), unlike every other
                        # logging call in this method.
                        Log.warn("save result is None! file: [%s] db: [%s], table: [%s], data: %r" % (f, db, table, batch))
                    # The batch is dropped either way; a failed batch is only
                    # recorded in the log.
                    self.data_to_save[table] = []

        # Flush whatever is still buffered for any table.
        # list(...) so the dict can be updated while we walk it.
        for table, pending in list(self.data_to_save.items()):
            if not pending:
                continue
            save_result = self.save(db, table, pending)
            if save_result is not None:
                self.show_save_result(db, table, save_result)
            else:
                # Report failed leftovers the same way as failed batches
                # instead of dropping them silently.
                Log.warn("save result is None! file: [%s] db: [%s], table: [%s], data: %r" % (f, db, table, pending))
            self.data_to_save[table] = []

        self.mv_files(db, f, file_dir, out_dir)
def convert_data(self, data):
    """Prepare one record for saving; return it, or None if it is unusable.

    The record is modified in place:

    * If it has a ``self.KEY_CONTENT`` field, that value is UTF-8 encoded,
      snappy-compressed and wrapped in ``bson.binary.Binary``.
    * Every field named in ``self.INT_FILED_LIST`` that is present is
      converted with ``int(float(value))``.

    Args:
        data: Mapping of field name to value, or None.

    Returns:
        The same ``data`` object after conversion, or None when ``data`` is
        None or any int field cannot be converted. On a failed conversion
        the content field may already have been compressed in place.
    """
    # Guard first: the membership test below would fail on None.
    if data is None:
        Log.warn("Data is none...")
        return None

    # Compress the content payload, if the record carries one.
    if self.KEY_CONTENT in data:
        unc_text = data[self.KEY_CONTENT].encode('utf-8')
        data[self.KEY_CONTENT] = bson.binary.Binary(snappy.compress(unc_text))

    # Convert numeric fields from their string form to int.
    for int_field in self.INT_FILED_LIST:
        if int_field not in data:
            continue
        try:
            # Go through float so values such as "3.0" are accepted.
            data[int_field] = int(float(data[int_field]))
        except (TypeError, ValueError, OverflowError):
            Log.warn("Data field convert int failed, data: %r" % data)
            # Stop at the first bad field; the record is rejected as a whole.
            return None

    return data