Beispiel #1
0
 def scan_files(dir, file_prefix = None, file_suffix = None, loop_limit = 5):
     '''
     Recursively collect the paths of regular files below dir.

     dir          directory to scan; paths are built as dir + '/' + name
     file_prefix  when not None, only file names starting with it are kept
     file_suffix  when not None, a str is checked with Scaner.has_suffix and
                  any other value with Scaner.has_suffixes
     loop_limit   number of directory levels below dir that may be entered;
                  sub-directories beyond the limit are skipped

     Returns a list of paths (empty when dir is not a directory). Entries
     appear in os.listdir order, with the files of a sub-directory placed
     where that sub-directory was listed.
     '''
     if not os.path.isdir(dir):
         Log.debug("SCAN-DIR dir [%s] is not exists or not a dir" % dir)
         return []
     all_files = []
     for name in os.listdir(dir):
         path = dir + '/' + name
         if os.path.isdir(path):
             # Depth limit reached: skip only this sub-directory. Returning
             # an empty list here would throw away everything collected so far.
             if loop_limit <= 0:
                 continue
             all_files.extend(
                 Scaner.scan_files(path, file_prefix, file_suffix, loop_limit - 1)
             )
         elif os.path.isfile(path):
             if file_prefix is not None and not name.startswith(file_prefix):
                 continue
             if file_suffix is not None:
                 if isinstance(file_suffix, str):
                     matched = Scaner.has_suffix(name, file_suffix)
                 else:
                     matched = Scaner.has_suffixes(name, file_suffix)
                 if not matched:
                     continue
             all_files.append(path)
     return all_files
Beispiel #2
0
 def read(self):
     '''
     Generator over the lines of self.file_handler.

     Each line is stripped and split on self.separator, and
     self.current_count is incremented for every line read. Without a title
     the list of fields is yielded. With a title, lines whose field count
     differs from self.title_len are logged and skipped, and the others are
     yielded as a dict keyed by self.title.
     '''
     for raw_line in self.file_handler:
         self.current_count += 1
         fields = raw_line.strip().split(self.separator)
         if not self.has_title:
             yield fields
             continue
         if len(fields) != self.title_len:
             Log.warn("Line count wrong: Title: [%s] Data: [%s]" % (','.join(self.title), ','.join(fields)))
             continue
         yield dict(zip(self.title, fields))
Beispiel #3
0
 def mv_files(self, db, filepath, from_dir, out_dir):
     '''
     Move filepath to the matching location under out_dir.

     db        only used in the error log message
     filepath  path of the file to move
     from_dir  its first occurrence in filepath is removed to obtain the
               relative path
     out_dir   destination root; the relative path is appended to it as-is

     Returns False when filepath is empty, otherwise the result of
     os.renames (None). Errors raised by os.makedirs / os.renames propagate.
     '''
     if len(filepath) == 0:
         Log.error("MV-FILE empty filename, db: [%s], file: [%s]" %(db, filepath))
         return False
     # Remove only the first occurrence: a directory name that repeats deeper
     # in the path must stay part of the relative path.
     child_path = filepath.replace(from_dir, '', 1)
     to_path = out_dir + child_path
     to_dir = os.path.dirname(to_path)
     # dirname is '' for a bare file name and os.makedirs('') would raise.
     if to_dir and not os.path.exists(to_dir):
         os.makedirs(to_dir, 0o755)
     Log.info("MV-FILE from [%s] to [%s]" %(filepath, to_path))
     return os.renames(filepath, to_path)
Beispiel #4
0
 def save(self, db, table, data):
     '''
     Bulk-save data into table of the mongo configured for db.

     Returns None when no mongo is available for db (an error is logged) or
     when data is empty; otherwise returns the result of bulk_save.
     '''
     mongo = MongoFactory().get_mongo(db)
     if mongo is None:
         Log.error("get none as mongo, please check config of db :"+db+", table:"+table)
         return None
     # The start message is logged even when there turns out to be nothing to save.
     Log.info("MONGO-INFO start saving data to [%s:%s] [%s.%s]" %(mongo.host, mongo.port, db, table))
     if len(data) == 0:
         return None
     live_mongo = mongo.check_alive()
     return live_mongo.set_table_name(table).bulk_save(data)
Beispiel #5
0
 def get_mongo(self, conf_path):
     '''
     Return a DaMongo for the configuration found at conf_path.

     Instances are cached in self.__mongos per host and port. On a cache hit
     the result of set_db_name(db) on the cached instance is returned; on a
     miss a new DaMongo is created, cached and returned.
     Returns None (after logging an error) when there is no such configuration.
     '''
     conf = DaMongo.get_conf(conf_path)
     if conf is None:
         Log.error('MongoFactory no such conf_path [' + conf_path +']')
         return None
     host = conf['host']
     port = conf['port']
     db = conf['db']
     # A separator keeps e.g. ("10.0.0.1", "27017") and ("10.0.0.12", "7017")
     # from sharing one cache entry; %s formatting also accepts an int port.
     key = '%s:%s' % (host, port)
     if key in self.__mongos:
         return self.__mongos[key].set_db_name(db)
     mongo = DaMongo(host, port, db)
     self.__mongos[key] = mongo
     return mongo
Beispiel #6
0
 def convert_data(self, data):
     '''
     Prepare one record (a dict) for saving; the dict is modified in place.

     - The value under self.KEY_CONTENT, when present, is utf-8 encoded,
       snappy-compressed and wrapped in bson.binary.Binary.
     - Every field named in self.INT_FILED_LIST that is present is converted
       with int(float(value)).

     Returns the record, or None when data is None or an int conversion
     fails (a warning is logged in both cases).
     '''
     # Must be checked before any key lookup on data.
     if data is None:
         Log.warn("Data is none...")
         return None
     # get content and compress
     if self.KEY_CONTENT in data:
         unc_text = data[self.KEY_CONTENT].encode('utf-8')
         data[self.KEY_CONTENT] = bson.binary.Binary(snappy.compress(unc_text))
     # convert int field from string to int
     for int_field in self.INT_FILED_LIST:
         if int_field not in data:
             continue
         try:
             data[int_field] = int(float(data[int_field]))
         except (TypeError, ValueError, OverflowError):
             # Stop at the first bad field so the record is reported once.
             Log.warn("Data field convert int failed, data: %r" % data)
             return None
     return data
Beispiel #7
0
 def import_data(self, db, files, file_dir, out_dir):
     '''
     import data to mongo

     For every path in files: read its records, pick the target table with
     self.get_save_table, convert the record with self.convert_data and
     buffer it in self.data_to_save. A table's buffer is flushed through
     self.save each time that table's count in self.counter reaches a
     multiple of self.SAVE_LIMIT. After a file has been read, all remaining
     buffers are flushed and the file is moved with self.mv_files.
     A file whose source is None is skipped and not moved.
     '''
     Log.info("IMPORT-FILE prepare save %r to %s" %(files, db))
     # begin save datas to db
     for f in files:
         with get_source_file(f) as source:
             Log.info("IMPORT-FILE begin process file [%s]" % f)
             if source is None:
                 Log.warn("IMPORT-FILE no such file [%s]" % f)
                 continue
             for data in source.read():
                 # get table field for save
                 table, data = self.get_save_table(data)
                 if table is None:
                     Log.warn(
                         "File [%s] has no [%s] or [%s] field, data: %r" \
                         %(f, self.KEY_TABLE, self.KEY_TMP_TABLE, data)
                     )
                     continue
                 data = self.convert_data(data)
                 if data is None:
                     continue
                 # NOTE(review): data_id is not used below; the lookup still
                 # raises KeyError for a record without self.KEY_ID - confirm
                 # whether that is intended before removing it.
                 data_id = data[self.KEY_ID]
                 # buffer the record and bump the global per-table counter
                 self.data_to_save.setdefault(table, []).append(data)
                 self.counter[table] = self.counter.get(table, 0) + 1
                 # up to self.SAVE_LIMIT, save data
                 if self.counter[table] % self.SAVE_LIMIT == 0:
                     batch = self.data_to_save[table]
                     save_result = self.save(db, table, batch)
                     if save_result is not None:
                         self.show_save_result(db, table, save_result)
                     else:
                         Log.warn("save result is None! file: [%s] db: [%s], table: [%s], data: %r" %(f, db, table, batch))
                     self.data_to_save[table] = []
         # save left datas to db
         for (table, pending) in self.data_to_save.items():
             if len(pending) == 0:
                 continue
             save_result = self.save(db, table, pending)
             if save_result is not None:
                 self.show_save_result(db, table, save_result)
             # existing key is re-assigned only, so the dict size is unchanged
             self.data_to_save[table] = []
         self.mv_files(db, f, file_dir, out_dir)
Beispiel #8
0
 def show_save_result(db, table, save_result):
     '''
     log save result
     Save result as below:
     {'nUpserted': 0, 'nMatched': 200, 'upserted': [{_index: INDEX, _id: ID}], 'writeConcernErrors': [], 'nInserted': 0, 'nRemoved': 0, 'writeErrors': []}

     nUpserted, nMatched and a non-empty upserted list get dedicated
     sections; every other entry is appended as "key: repr(value)".
     The dict passed in is not modified.
     '''
     # Work on a shallow copy so the caller's dict keeps all of its keys.
     remaining = dict(save_result)
     parts = ["SAVE-INTO [%s.%s] " %(db, table)]
     if 'nUpserted' in remaining:
         parts.append("UPSERT-COUNT: [%d], " % remaining.pop('nUpserted'))
     if 'nMatched' in remaining:
         parts.append("SAVE-COUNT: [%d], " % remaining.pop('nMatched'))
     if remaining.get('upserted'):
         parts.append("UPSERT: ")
         for item in remaining.pop('upserted'):
             # %s formatting so an _id that is not a str does not raise.
             parts.append("%s, " % (item['_id'],))
     for key, value in remaining.items():
         parts.append("%s: %r " %(key, value))
     Log.info(''.join(parts))
Beispiel #9
0
 def __exit__(self, type, value, trace):
     '''
     Leave the context: store the end time in self.time_end, empty
     self.counter and, when at least one table was counted, log the begin
     time, one SAVE-INFO line per table and the end time.
     Returns None.
     '''
     self.time_end = datetime.datetime.now()
     summary_lines = []
     for table, count in self.counter.items():
         summary_lines.append("SAVE-INFO: %s: %d" % (table, count))
     self.counter.clear()
     table_count = '\n'.join(summary_lines)
     if table_count.strip() == '':
         return
     time_format = "%Y-%m-%d %H:%M:%S"
     Log.info("Begin at [" + self.time_begin.strftime(time_format) + "]")
     Log.info(table_count)
     Log.info("End at [" + self.time_end.strftime(time_format) + "]")
Beispiel #10
0
        if not os.path.exists(to_dir):
            os.makedirs(to_dir, 0755)
        Log.info("MV-FILE from [%s] to [%s]" %(filepath, to_path))
        mv_files = os.renames(filepath, to_path)
        return mv_files


    def __enter__(self):
        '''
        Enter the context; the object itself is what `with ... as` binds.
        '''
        return self

    def __exit__(self, type, value, trace):
        '''
        Leave the context: store the end time in self.time_end, empty
        self.counter and, when at least one table was counted, log the begin
        time, one SAVE-INFO line per table and the end time.
        Returns None.
        '''
        self.time_end = datetime.datetime.now()
        summary_lines = []
        for table, count in self.counter.items():
            summary_lines.append("SAVE-INFO: %s: %d" % (table, count))
        self.counter.clear()
        table_count = '\n'.join(summary_lines)
        if table_count.strip() == '':
            return
        time_format = "%Y-%m-%d %H:%M:%S"
        Log.info("Begin at [" + self.time_begin.strftime(time_format) + "]")
        Log.info(table_count)
        Log.info("End at [" + self.time_end.strftime(time_format) + "]")

def import_data():
    '''
    Create and return a new DataImport instance.
    '''
    importer = DataImport()
    return importer

if __name__ == '__main__':
    # Log the launch time once, then run scan passes forever.
    Log.info("Start at [%s]" % datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
    while True:
        # A fresh importer per pass; its __exit__ runs when the block ends.
        with import_data() as importer:
            importer.scan()
            sys.stdout.flush()
            # The pause stays inside the with-block, so __exit__ runs after it.
            time.sleep(1)