import logging
import time


def main_loop(s3, p, tmp_sql_file, table_name):
    global global_table_list
    global global_file_list
    task = config.task
    base_key_prefix = task['base_key_prefix']
    save_prefix = task['save_prefix']
    save_file_list = download_from_s3(s3, base_key_prefix, p, save_prefix)
    if not save_file_list or not isinstance(save_file_list, (list, tuple)):
        logging.error("[FAIL]main_loop():[%s]" % save_file_list)
        return None
    global_file_list += save_file_list
    logging.info("save_file_list:%s" % save_file_list)
    field_list = ['hash', 'event_type', 'arrival_time', 'client_id', 'uid',
                  'organic', 'lang', 'package', 'referrer', 'country',
                  'ip_country', 'sim_country', 'client_time']
    for save_file in save_file_list:
        event_list = parse_json(save_file)
        if not event_list:
            logging.error("main_loop():parse_json:event_list:[%s]"
                          % event_list)
            continue
        sql_list = []
        for m in event_list:
            # pick the partitioning timestamp: prefer client_time, fall
            # back to arrival_time, then to the current time
            the_time = m.get('client_time')
            if common.is_float(the_time):
                the_time = float(the_time)
            else:
                the_time = m.get('arrival_time')
                if common.is_float(the_time):
                    the_time = float(the_time)
                else:
                    the_time = time.time()
            # create the time-partitioned table if it does not exist yet
            new_table_name = get_table_name(table_name, the_time)
            if new_table_name not in global_table_list:
                global_table_list.append(new_table_name)
                create_table(tmp_sql_file, new_table_name)
            sql = common.mysql_gen_save_sql(new_table_name, m,
                                            field_list, field_list)
            sql_list.append(sql.encode('utf-8'))
        with open(tmp_sql_file, 'a') as f:
            big_sql = '\n'.join(sql_list)
            f.write(big_sql)
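
# main_loop() above leans on two project-local helpers defined elsewhere in
# this repo. Hypothetical sketches only, assuming monthly table partitioning
# and a template table named event_template (both are assumptions, not
# confirmed by the source):
def get_table_name(table_name, the_time):
    # e.g. ('event', 1500000000.0) -> 'event_201707'
    return '%s_%s' % (table_name,
                      time.strftime('%Y%m', time.gmtime(the_time)))


def create_table(tmp_sql_file, new_table_name):
    # append guarded DDL so replaying the generated SQL file is idempotent
    with open(tmp_sql_file, 'a') as f:
        f.write('CREATE TABLE IF NOT EXISTS %s LIKE event_template;\n'
                % new_table_name)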
import math


def replace_nan_or_string_values_with_max_in_dictionary(
        dictionary, reverse=False):
    non_numeric_keys = []
    numeric_values = []
    for key, value in dictionary.iteritems():
        if is_float(value):
            # coerce before the NaN test so numeric strings do not raise
            if math.isnan(float(value)):
                non_numeric_keys.append(key)
            else:
                numeric_values.append(value)
        else:
            non_numeric_keys.append(key)

    # replace non-numeric entries with a value just past the observed
    # extreme; guard against a dictionary with no numeric values at all
    non_numeric_values_replacer = 0
    if numeric_values:
        if reverse:
            non_numeric_values_replacer = min(numeric_values) - 1
        else:
            non_numeric_values_replacer = max(numeric_values) + 1

    for non_numeric_key in non_numeric_keys:
        dictionary[non_numeric_key] = non_numeric_values_replacer

    return dictionary
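
# is_float() is assumed by the function above but not shown in this repo;
# a minimal sketch, followed by a usage example:
def is_float(value):
    try:
        float(value)
    except (TypeError, ValueError):
        return False
    return True


scores = {'a': 1.0, 'b': float('nan'), 'c': 'missing', 'd': 3.0}
replace_nan_or_string_values_with_max_in_dictionary(scores)
# 'b' (NaN) and 'c' (string) are both replaced with max(1.0, 3.0) + 1 == 4.0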
import csv
import os

import progressbar


def _read_csv(self, key_column_name):
    path, filename = os.path.split(self.path_to_csv)
    print 'Reading file "' + filename + '"...'

    # count all lines (header included) so the progress bar has a denominator
    with open(self.path_to_csv) as f:
        self.row_num = len(f.readlines())

    with open(self.path_to_csv, 'rt') as input_file:
        reader = csv.reader(input_file, delimiter=self.delimiter)
        self.headers = next(reader)
        self.column_num = len(self.headers)

        if self.auto_detect_types:
            # assume every column is int/float until a row proves otherwise
            int_columns = [True] * len(self.headers)
            float_columns = [True] * len(self.headers)

        line_number = 0
        bar = progressbar.ProgressBar(maxval=1.0).start()
        for row in reader:
            if self.auto_detect_types:
                for x in range(0, self.column_num):
                    if int_columns[x]:
                        int_columns[x] = is_int(row[x])
                    if float_columns[x]:
                        float_columns[x] = is_float(row[x])
            row_dict = self.__row_to_dict(row)
            self.data.append(row_dict)
            if key_column_name:
                self.data_dict[row_dict[key_column_name]] = row_dict
            line_number += 1
            bar.update((line_number + 0.0) / self.row_num)
        bar.finish()

    if self.auto_detect_types:
        self.__format_data(int_columns, float_columns)
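
# Hypothetical shapes for the two private helpers _read_csv() calls; the
# real methods live on the same reader class and may differ:
class _ReaderSketch(object):
    def __row_to_dict(self, row):
        # pair each cell with its header:
        # ['1', 'ok'] -> {'id': '1', 'status': 'ok'}
        return dict(zip(self.headers, row))

    def __format_data(self, int_columns, float_columns):
        # cast the columns that stayed int- or float-parsable in every row
        for row_dict in self.data:
            for x, header in enumerate(self.headers):
                if int_columns[x]:
                    row_dict[header] = int(row_dict[header])
                elif float_columns[x]:
                    row_dict[header] = float(row_dict[header])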
def parse_label(label):
    """Parse a Newick label which may contain a support value, taxon,
    and/or auxiliary information.

    Parameters
    ----------
    label : str
        Internal label in a Newick tree.

    Returns
    -------
    float
        Support value specified by label, or None.
    str
        Taxon specified by label, or None.
    str
        Auxiliary information specified by label, or None.
    """
    support = None
    taxon = None
    auxiliary_info = None

    if label:
        label = label.strip()
        if '|' in label:
            # split on the first '|' only, so auxiliary info may itself
            # contain further '|' characters
            label, auxiliary_info = label.split('|', 1)

        if ':' in label:
            support, taxon = label.split(':', 1)
            support = float(support)
        elif is_float(label):
            support = float(label)
        elif label != '':
            taxon = label

    return support, taxon, auxiliary_info
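
# A quick usage sketch of parse_label() on the three label shapes it
# handles (support:taxon with auxiliary info, support only, taxon only):
print parse_label('95.2:p__Proteobacteria|extra')
# -> (95.2, 'p__Proteobacteria', 'extra')
print parse_label('87')
# -> (87.0, None, None)
print parse_label('g__Bacillus')
# -> (None, 'g__Bacillus', None)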
import gzip
import hashlib
import json
import logging


def parse_json(json_file):
    logging.info("parse_json():%s" % json_file)
    event_list = []
    _default_value = {
        'event_type': '',
        'client_time': '1000',
        'arrival_time': '1000',
        'client_id': '',
        'uid': '',
        'country': '',
        'ip_country': '',
        'sim_country': '',
        'lang': '',
        'referrer': '',
        'package': '',
        'organic': -1,
        'hash': ''
    }
    try:
        with gzip.open(json_file, 'rb') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                m = _default_value.copy()
                # hash the raw line so duplicate events can be detected
                m['hash'] = hashlib.md5(line).hexdigest()
                j = json.loads(line)
                m['event_type'] = j.get('event_type', '')
                m['client_time'] = j.get('event_timestamp', 1000)
                m['arrival_time'] = j.get('arrival_timestamp', 1000)
                application = j.get('application')
                if application:
                    m['package'] = application.get('package_name', '')
                client = j.get('client')
                if client:
                    m['client_id'] = client.get('client_id', '')
                device = j.get('device')
                if device:
                    locale = device.get('locale')
                    if locale:
                        m['country'] = locale.get('country', '')
                        m['lang'] = locale.get('language', '')
                attributes = j.get('attributes')
                if attributes:
                    m['uid'] = attributes.get('DeviceId', '')
                    m['referrer'] = attributes.get('Referrer', '')
                    m['ip_country'] = attributes.get('country', '')
                    m['sim_country'] = attributes.get('simCountry', '')
                    IsOrganic = attributes.get('IsOrganic', '')
                    if IsOrganic == 'true':
                        m['organic'] = 1
                    elif IsOrganic == 'false':
                        m['organic'] = 0
                    else:
                        m['organic'] = -1
                # timestamps arrive in milliseconds; convert to UTC strings
                for param in ['client_time', 'arrival_time']:
                    val = m[param]
                    if common.is_float(val):
                        val = float(val) / 1000
                    else:
                        val = 1
                    m[param] = common.format_utc_time(val)
                event_list.append(m)
    except Exception as e:
        logging.error("parse_json:Exception:%s" % e)
    return event_list
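
# common.format_utc_time() is assumed to turn epoch seconds into a UTC
# timestamp string; a hypothetical sketch (the real common module is not
# shown in this repo):
import time


def format_utc_time(epoch_seconds):
    # 1500000000.0 -> '2017-07-14 02:40:00'
    return time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime(epoch_seconds))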
import dendropy


def read_from_tree(self, tree, warnings=True):
    """Obtain the taxonomy for each extant taxon as specified by internal tree labels.

    Parameters
    ----------
    tree : str or dendropy.Tree
        Filename of newick tree or dendropy tree object.
    warnings : bool
        Flag indicating if invalid taxonomy strings should be logged.

    Returns
    -------
    dict : d[unique_id] -> [d__<taxon>, ..., s__<taxon>]
        Taxa indexed by unique ids.
    """
    if isinstance(tree, basestring):
        tree = dendropy.Tree.get_from_path(tree,
                                           schema='newick',
                                           rooting="force-rooted",
                                           preserve_underscores=True)

    taxonomy = {}
    for leaf in tree.leaf_node_iter():
        taxa = []

        # walk from the leaf to the root, collecting taxa from any
        # internal label that is not simply a support value
        node = leaf.parent_node
        while node:
            if node.label:
                taxa_str = node.label
                if ':' in taxa_str:
                    taxa_str = taxa_str.split(':')[1]

                if not is_float(taxa_str):
                    if taxa_str[-1] == ';':
                        taxa_str = taxa_str[:-1]

                    # check for concatenated ranks of the form:
                    # p__Crenarchaeota__c__Thermoprotei
                    for prefix in Taxonomy.rank_prefixes:
                        split_str = '__' + prefix
                        if split_str in taxa_str:
                            taxa_str = taxa_str.replace(split_str,
                                                        ';' + prefix)

                    # appears to be an internal label and not simply a
                    # support value
                    taxa = [x.strip() for x in taxa_str.split(';')] + taxa
            node = node.parent_node

        if warnings and len(taxa) > 7:
            self.logger.warning(
                'Invalid taxonomy string read from tree for taxon %s: %s'
                % (leaf.taxon.label, taxa))

        # check if genus name should be appended to species label
        if len(taxa) == 7:
            genus = taxa[5][3:]
            species = taxa[6][3:]
            if genus not in species:
                taxa[6] = 's__' + genus + ' ' + species

        taxa = self.fill_trailing_ranks(taxa)
        taxonomy[leaf.taxon.label] = taxa

    return taxonomy
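
# Usage sketch, assuming this method belongs to the Taxonomy class that
# rank_prefixes and fill_trailing_ranks() point to, and a Newick file whose
# internal nodes carry 'support:taxonomy' labels (both file name and
# constructor signature are assumptions):
taxonomy = Taxonomy().read_from_tree('decorated_tree.nwk')
for unique_id, taxa in taxonomy.iteritems():
    print unique_id, ';'.join(taxa)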