def extract_comments(filename, top_n_subreddits):
    # reading a bz2 archive
    with bzopen(filename, "r") as bzfin:
        """ Handle lines here """
        cbz = codecs.iterdecode(bzfin, "utf-8")
        reddit_submissions = []
        for i, line in enumerate(cbz):
            try:
                post_dict = json.loads(line.rstrip())
                '''
                {
                    "created_utc": 1506816000,
                    "link_id": "t3_73ieyz"
                }
                '''
                post_id, comment_date, subreddit_id = (
                    post_dict['link_id'], post_dict['created_utc'], post_dict['subreddit_id'])
                if i % 100000 == 0:
                    print(i, post_id, comment_date, subreddit_id)
                if subreddit_id in top_n_subreddits:
                    subreddits_noComments[subreddit_id] += 1
            except:
                continue
    return reddit_submissions
def update_relationships(self, year, month):
    fname = os.path.join(AS_REL_DATA_DIR, f'{year}{month}01.as-rel.txt.bz2')
    if not os.path.exists(fname):
        return
    with bzopen(fname, 'r') as f:
        for i, line in enumerate(f):
            line = line.decode('utf-8').strip()
            if line.startswith('#'):
                continue
            else:
                l = line.split('|')
                as1 = int(l[0])
                as2 = int(l[1])
                relationship_type = int(l[2])
                if relationship_type == -1:
                    r = 'c2p'
                elif relationship_type == 0:
                    r = 'p2p'
                else:
                    raise Exception('Invalid relationship type!')
                if (as1, as2) not in self.G.edges:
                    self.G.add_edge(as1, as2, label=r,
                                    timestamp=datetime.date(year=int(year), month=int(month), day=1))
def open_file_by_mimetype(filename, mode):
    """
    This function determines the compression MIME type of a file as gz, bz, or none,
    and returns an open file handle of the requested mode ('w', 'r', or 'a')
    """
    if mode != 'r' and mode != 'w' and mode != 'a':
        print("please specify a valid mode: w, r, a")
        return
    if guess_type(filename)[1] == 'gzip':
        try:
            fh = gzopen(filename, mode)
        except Exception as error:
            print("Error opening file ", filename, ": ", error)
            return
    elif guess_type(filename)[1] == 'bzip2':  # index [1] to compare the encoding, as above
        try:
            fh = bzopen(filename, mode)
        except Exception as error:
            print("Error opening file ", filename, ": ", error)
            return
    else:
        try:
            fh = open(filename, mode)
        except Exception as error:
            print("Error opening file ", filename, ": ", error)
            return
    return fh
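# Minimal usage sketch for open_file_by_mimetype() above. The import aliases
# are assumptions (they are not shown in the original snippet), and the file
# path is hypothetical.
from mimetypes import guess_type
from gzip import open as gzopen
from bz2 import BZ2File as bzopen

fh = open_file_by_mimetype("data/example.jsonl.bz2", "r")
if fh is not None:
    first = fh.readline()
    # compressed handles yield bytes; decode if text is needed
    if isinstance(first, bytes):
        first = first.decode("utf-8")
    print(first)
    fh.close()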
def extract_comments(filename, top_n_subreddits):
    post_commentTimes = defaultdict(list)
    # reading a bz2 archive
    with bzopen(filename, "r") as bzfin:
        """ Handle lines here """
        cbz = codecs.iterdecode(bzfin, "utf-8")
        # print(len(list(cbz)))
        for i, line in enumerate(cbz):
            if i % 10000000 == 0:
                print(i)
            try:
                post_dict = json.loads(line.rstrip())
                # {
                #     "created_utc": 1506816000,
                #     "link_id": "t3_73ieyz"
                # }
                post_id, comment_date, subreddit_id, comment_body = (
                    post_dict['link_id'], post_dict['created_utc'],
                    post_dict['subreddit_id'], post_dict['body'])
                if subreddit_id in top_n_subreddits and post_id in user_post_ids:
                    # print(comment_date, comment_body)
                    post_commentTimes[post_id].append([comment_date, comment_body])
            except:
                continue
    return post_commentTimes
def main():
    # <http://www.wikidata.org/entity/Q47133351> <http://www.wikidata.org/prop/direct/P356> "10.1002/EJP.1050" .
    REGEX = r'^<http:\/\/www\.wikidata\.org\/entity\/(Q\d+)> <http:\/\/www.wikidata.org\/prop\/direct\/(P\d+)> "(.*?)" \.$'
    manifest = ['P356', 'P698', 'P932', 'P2880']
    dump_location = '/public/dumps/public/wikidatawiki/entities/latest-truthy.nt.bz2'
    to_add = {x: [] for x in manifest}
    with bzopen(dump_location, 'r') as f:
        for line in f:
            line = line.decode('utf-8')
            match = re.match(REGEX, line)
            if match is None:
                continue
            wd_item = match.group(1)
            wd_prop = match.group(2)
            wd_value = match.group(3)
            if wd_prop in manifest:
                # print('Up to', wd_item, end='\r')
                to_add[wd_prop].append((wd_item, wd_value))
                if len(to_add[wd_prop]) >= 10000:
                    print('\nSaving to Redis')
                    wikidata_to_x = {x[0]: x[1] for x in to_add[wd_prop]}
                    x_to_wikidata = {x[1]: x[0] for x in to_add[wd_prop]}
                    REDIS.hmset('{0}_to_wikidata_{1}'.format(wd_prop, today), x_to_wikidata)
                    REDIS.hmset('wikidata_to_{0}_{1}'.format(wd_prop, today), wikidata_to_x)
                    to_add[wd_prop] = []
    # If there are leftovers
    for wd_prop, tuplelist in to_add.items():
        wikidata_to_x = {x[0]: x[1] for x in tuplelist}
        x_to_wikidata = {x[1]: x[0] for x in tuplelist}
        REDIS.hmset('{0}_to_wikidata_{1}'.format(wd_prop, today), x_to_wikidata)
        REDIS.hmset('wikidata_to_{0}_{1}'.format(wd_prop, today), wikidata_to_x)
    # Finalize
    for wd_prop in manifest:
        REDIS.rename('{0}_to_wikidata_{1}'.format(wd_prop, today), '{0}_to_wikidata'.format(wd_prop))
        REDIS.rename('wikidata_to_{0}_{1}'.format(wd_prop, today), 'wikidata_to_{0}'.format(wd_prop))
def main():
    call_ctr = 0
    sms_ctr = 0
    for day in days_of_the_year:
        print("Processing day:", day)
        commsfile = file_start + day + file_end
        with bzopen(commsfile) as infile:
            print(infile.readline())  # header line
            for entry in infile:
                # BZ2File yields bytes, so decode before splitting
                hour, tid, user, call, sms = entry.decode('utf-8').strip().split(',')
                call_ctr += int(call)
                sms_ctr += int(sms)
    print("Total comms for the year, calls:", call_ctr, "sms:", sms_ctr)
def main():
    options = opt_get()
    in_file = options.in_file
    out_file = options.out_file
    n = int(options.n)
    compression_type = options.compression_type
    if compression_type not in ["none", "b", "g"]:
        sys.exit("Wrong compression type used: valid options are 'none', 'b' and 'g'.")
    # pick the input handle by compression type
    if compression_type == "b":
        from bz2 import BZ2File as bzopen
        fh = bzopen(in_file, 'r')
    elif compression_type == "g":
        import gzip
        fh = gzip.open(in_file, 'r')
    else:
        fh = open(in_file, 'r')
    out = open(out_file, 'w')
    iters = 0
    lost = 0
    while True:
        fq_data = fq_read(fh)
        if fq_data == "none":
            print("Processed " + str(iters) + " reads.")
            print("Removed " + str(lost) + " reads (" +
                  str(float(lost) / float(iters) * 100) + "%)")
            break
        iters += 1
        cut_place = pA_find(fq_data['seq'])
        if cut_place == 0 or len(fq_data['seq'][:cut_place]) < n:
            lost += 1
            continue
        out.write(fq_data['header'])
        out.write(fq_data['seq'][:cut_place] + "\n")
        out.write(fq_data['spacer'])
        out.write(fq_data['qual'][:cut_place] + "\n")
        if iters % 1000 == 0:
            sys.stdout.write('%s\r' % iters)
            sys.stdout.flush()
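# Hedged sketch of the FASTQ helpers the script above assumes (opt_get,
# fq_read and pA_find are not shown in the original); the bodies below are
# illustrative assumptions and expect text-mode file handles.
import re

def fq_read(fh):
    # read one 4-line FASTQ record; return "none" once the file is exhausted
    header = fh.readline()
    if not header:
        return "none"
    return {
        'header': header,
        'seq': fh.readline().rstrip('\n'),
        'spacer': fh.readline(),
        'qual': fh.readline().rstrip('\n'),
    }

def pA_find(seq):
    # index where a trailing poly-A stretch starts, or 0 if there is none
    match = re.search(r'A{6,}$', seq)
    return match.start() if match else 0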
def main():
    with bzopen('assets/crossref_references.jsonl.bz2', 'r') as f:
        for line in f:
            print('. ', end='', flush=True)
            while threading.active_count() >= THREAD_LIMIT:
                time.sleep(0.25)
            mapping = json.loads(line)
            doi_x = list(mapping.keys())[0]
            lookup = [doi_x]
            for doi_y in mapping[doi_x]:
                lookup.append(doi_y)
            t = threading.Thread(target=process_bundle, args=(lookup, doi_x))
            t.daemon = True
            t.start()
def extract_submissions(filename, top_n_subreddits):
    # reading a bz2 archive
    with bzopen(filename, "r") as bzfin:
        """ Handle lines here """
        cbz = codecs.iterdecode(bzfin, "utf-8")
        reddit_submissions = []
        for i, line in enumerate(cbz):
            try:
                post_dict = json.loads(line.rstrip())
                # print(post_dict)
                '''
                {
                    'num_comments': 0,
                    'author': 'magicks',
                    'name': 't3_eut41',
                    'subreddit_id': 't5_2qh0u',
                    'created_utc': 1293944394,
                }
                n_topics, n_posts
                100, 220304
                200, 255879
                500, 306173
                1000, 338408
                '''
                # score = upvotes - downvotes
                post_id, subreddit_id, user, post_date, num_comments, score = (
                    post_dict['name'], post_dict['subreddit_id'],
                    post_dict['author'], post_dict['created_utc'],
                    post_dict['num_comments'], post_dict['score'])
                # print(post_id, num_comments, score)
                if subreddit_id in top_n_subreddits and user != '[deleted]':
                    reddit_submissions.append(
                        [post_id, subreddit_id, user, post_date, num_comments, score])
                    num_posts_per_subreddit[subreddit_id] += 1
            except:
                continue
    return reddit_submissions
def process_file(path, output_path):
    output_length = []
    with bzopen(path) as infile:  # unzips the file
        filename = path
        print(filename)
        for i, line in enumerate(infile):
            try:
                tweet = json.loads(line)
            except json.JSONDecodeError as e:
                print(f'Error in {infile} line {i}: {e}')
                continue  # skip lines that fail to parse
            if 'text' not in tweet:
                # this jumps to the next line in the json if there's no text
                print(f'Error in {infile} line {i}: No text element to read from')
                continue
            else:
                text_field = tweet['text']
                if contains_regex(tweet=text_field, regex=hillary):
                    # output.append(tweet)
                    with open(output_path, 'a+') as outfile:
                        pos1 = json.dumps(tweet)
                        outfile.write(pos1 + '\n')
    print('Wrote tweets to json. Next file.')
def process_jsonlines_hotpotqa(filename):
    """
    This is process_jsonlines method for intro-only processed_wikipedia file.
    The item example:
    {"id": "45668011", "url": "https://en.wikipedia.org/wiki?curid=45668011",
     "title": "Flouch Roundabout",
     "text": ["Flouch Roundabout is a roundabout near Penistone, South Yorkshire, England, where the A628 meets the A616."],
     "charoffset": [[[0, 6],...]]
     "text_with_links": ["Flouch Roundabout is a roundabout near <a href=\"Penistone\">Penistone</a>, <a href=\"South%20Yorkshire\">South Yorkshire</a>, England, where the <a href=\"A628%20road\">A628</a> meets the <a href=\"A616%20road\">A616</a>."],
     "charoffset_with_links": [[[0, 6], ... [213, 214]]]}
    """
    # item should be nested list
    extracted_items = []
    # with jsonlines.open(filename) as reader:
    with bzopen(filename, "r") as bzfin:
        for obj in jsonlines.Reader(bzfin):
            wiki_id = obj["id"]
            title = obj["title"]
            title_id = make_wiki_id(title, 0)
            plain_text = "\t".join(obj["text"])
            text_with_links = "\t".join(obj["text_with_links"])
            hyper_linked_titles = find_hyper_linked_titles(text_with_links)
            if len(hyper_linked_titles) > 0:
                hyper_linked_titles_text = "\t".join(hyper_linked_titles)
            else:
                hyper_linked_titles_text = ""
            extracted_items.append({
                "wiki_id": wiki_id,
                "title": title_id,
                "plain_text": plain_text,
                "hyper_linked_titles": hyper_linked_titles_text,
                "original_title": title
            })
    return extracted_items
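# Hedged sketch of the two helpers process_jsonlines_hotpotqa() relies on;
# the real implementations may differ, so treat these bodies as assumptions.
import re
from urllib.parse import unquote

def find_hyper_linked_titles(text_with_links):
    # collect the href targets of <a href="..."> anchors and undo %-encoding
    targets = re.findall(r'<a href="(.*?)">', text_with_links)
    return [unquote(t).replace('_', ' ') for t in targets]

def make_wiki_id(title, para_index):
    # combine the article title with a paragraph index into one identifier
    return '{0}_{1}'.format(title, para_index)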
def extract_comments(filename, top_n_subreddits):
    # reading a bz2 archive
    with bzopen(filename, "r") as bzfin:
        """ Handle lines here """
        cbz = codecs.iterdecode(bzfin, "utf-8")
        # print(len(list(cbz)))
        reddit_submissions = []
        for i, line in enumerate(cbz):
            try:
                post_dict = json.loads(line.rstrip())
                # {
                #     "created_utc": 1506816000,
                #     "link_id": "t3_73ieyz"
                # }
                post_id, comment_date, subreddit_id = (
                    post_dict['link_id'], post_dict['created_utc'], post_dict['subreddit_id'])
                # if i % 100000 == 0:
                #     print(i, post_id, comment_date, subreddit_id)
                if subreddit_id in top_n_subreddits:
                    post_commentTimes[post_id].append(comment_date)
            except:
                continue
    return reddit_submissions
def __init__(self, path):
    self.file = bzopen(path)
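# The snippet above is only a constructor; a minimal sketch of how such a
# wrapper might be completed (the class name and the extra methods below are
# assumptions, not part of the original).
from bz2 import BZ2File as bzopen

class Bz2LineReader:
    def __init__(self, path):
        self.file = bzopen(path)

    def __iter__(self):
        # yield decoded lines from the compressed file
        for raw in self.file:
            yield raw.decode('utf-8')

    def close(self):
        self.file.close()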
filename_out = str(sys.argv[2])
min_token_len = 1
satzzeichen = ',.?!:;<>()/\{}#"\'´`‚’‘_→[]-~«»'
exclude_zeichen = '*/=→[]."'
# markup commands to strip; raw strings keep the backslashes literal
exclude_sonstiges = (r'\textit', '\t', '\xa0', r'\small', r'\sharp', r'\markup',
                     r'\concat', r'\flat', r'\override', r'\translate', r'\set', r'\new')
sen_num = 0
lines_dropped = 0
with bzopen(filename) as bzin, open(filename_out, 'w') as txt_out:
    for line in bzin:
        try:
            line = line.decode('utf-8')
            # print(line)
            # if("<doc" in line or "</doc>" in line):
            #     continue
            try:
                line_json = json.loads(line)
                text = line_json['text']  # .replace('\n','')
            except:
                text = line
            for a in exclude_sonstiges:
                text = text.replace(a, ' ')
subreddits = {'subreddit': []}
for m in unique_months:
    threads[m] = []
    users[m + '_num_thread'] = []
    users[m + '_num_comment'] = []
    users[m + '_num_subreddit'] = []
    subreddits[m + '_num_thread'] = []
    subreddits[m + '_num_comment'] = []
    subreddits[m + '_num_user'] = []
    subreddits[m + '_num_unique_user'] = []

for f in comment_files:
    with bzopen(comment_dir + f, 'rb') as bzfin:
        month = f[:f.index('.')]
        print("\nStart working on month:", month)
        i = 0
        for line in bzfin.readlines():
            line = line.decode('utf-8')
            line = json.loads(line)
            if i % 10000 == 0:
                print(i, 'Start extracting attributes of COMMENTS (comments)')
            # comments attributes
            comment_id = line['id']
            comment = line['body']
            tokens = word_tokenize(comment)
            subreddit = line['subreddit']
def parser_target():
    global prefixes, attacks
    while True:
        try:
            now = dates.get(block=False, timeout=0.125)
        except:
            return
        filename = '../route-views/{}.csv.bz2'.format(now)
        printed = False
        while not os.path.isfile(filename):
            if not printed:
                print('{} waiting for parse to finish'.format(now))
                printed = True
            time.sleep(10)
        print('{} summarizing'.format(now))
        start_time = time.time()
        num_v4_prefixes = 0
        num_v6_prefixes = 0
        num_unique_v4_prefixes = 0
        num_unique_v6_prefixes = 0
        num_v4_isolated = 0
        num_v6_isolated = 0
        num_v4_distributed = 0
        num_v6_distributed = 0
        seen_prefixes = set()
        attacked_prefixes = set()
        with bzopen(filename, 'rt') as f:
            reader = csv.reader(f)
            next(reader)  # skip the header row
            for line in reader:
                ip_type = line[2]
                prefix = line[3]
                if ip_type == '4':
                    num_v4_prefixes += 1
                elif ip_type == '6':
                    num_v6_prefixes += 1
                seen_prefixes.add(prefix)
                thread_safe_add(prefix, ip_type)
                if line[5].rstrip() == 'I':
                    if prefix not in attacked_prefixes:
                        if ip_type == '4':
                            num_v4_isolated += 1
                        elif ip_type == '6':
                            num_v6_isolated += 1
                        thread_safe_update(prefix, 2)
                        attacked_prefixes.add(prefix)
                elif line[5].rstrip() == 'D':
                    if prefix not in attacked_prefixes:
                        if ip_type == '4':
                            num_v4_distributed += 1
                        elif ip_type == '6':
                            num_v6_distributed += 1
                        thread_safe_update(prefix, get_num_actors(line[4]))
                        attacked_prefixes.add(prefix)
        for prefix in seen_prefixes:
            with prefixes_lock:
                if prefix in prefixes:
                    if '.' in prefix:
                        num_unique_v4_prefixes += 1
                    elif ':' in prefix:
                        num_unique_v6_prefixes += 1
        end_time = time.time()
        print('{0} -- {1:.3f} sec'.format(now, end_time - start_time))
        record = '{0},{1},{2},{3},{4},{5},{6},{7},{8}\n'.format(
            now, num_v4_prefixes, num_unique_v4_prefixes, num_v6_prefixes,
            num_unique_v6_prefixes, num_v4_isolated, num_v6_isolated,
            num_v4_distributed, num_v6_distributed)
        outbox.put(record)
        del attacked_prefixes
        del seen_prefixes
from bz2 import BZ2File as bzopen

# IPython cell: "!" runs a shell command and captures its output as a list of paths
all_filenames = !find data -name "*.bz2"

with open('mega_file.json', 'wb') as f:
    for file in all_filenames:
        stream = bzopen(file, 'r')
        for line in stream:
            f.write(line)
        stream.close()
def generate_header_row(input_files):
    tmp_conditions = []
    for input_file in input_files:
        ff = os.path.basename(input_file)
        userid = mapping[ff.split('+', 1)[0]]
        count = 0
        tmpbuf = []
        with bzopen(input_file, "rb") as bzfin:
            """ Handle lines here """
            starttime = -1
            for i, line in enumerate(bzfin):
                ln = line.decode().rstrip()
                lsplits = ln.split(',', 2)
                try:
                    ema_json = json.loads(lsplits[2][1:-1])
                    tmpbuf.append(ema_json)
                    # ['status', 'current_time', 'timestamp', 'id', 'logSchedule', 'message', 'type', 'operation']
                except Exception as e:
                    print(e)
                    print(lsplits[2][1:-1])
                count += 1
        groupedbuf = []
        tmp = []
        srtts = -1
        ema_started = False
        for l in tmpbuf:
            if 'message' in l and l['message'] == 'true: datapoint not found':
                ema_started = True
                continue
            if ema_started:
                # if 'message' in l and l['message'] != 'false: some conditions are failed':
                #     if l['message'] != 'true: all conditions okay':
                tmp.append(l)
            if 'status' in l and (l['status'] == 'COMPLETED' or l['status'] == 'MISSED'
                                  or l['status'] == 'ABANDONED_BY_TIMEOUT'):
                ema_started = False
                groupedbuf.append(tmp)
                tmp = []
            if 'message' in l and l['message'] == 'false: some conditions are failed':
                ema_started = False
        if len(tmp):
            groupedbuf.append(tmp)
            tmp = []
        tab = ','
        for x in groupedbuf:
            for cond in x:
                if 'status' in cond:
                    continue
                condition = cond['type'] + '-' + cond['id']
                if condition not in tmp_conditions:
                    tmp_conditions.append(condition)
    for cd in tmp_conditions:
        if 'VALID_BLOCK' in cd:
            conditions.append(cd)
    conditions.append('BLOCK')
    for cd in tmp_conditions:
        if 'VALID_BLOCK' not in cd:
            conditions.append(cd)
def parse_log(input_file):
    ff = os.path.basename(input_file)
    userid = mapping[ff.split('+', 1)[0]]
    count = 0
    tmpbuf = []
    csvbuf = ''
    with bzopen(input_file, "rb") as bzfin:
        """ Handle lines here """
        starttime = -1
        for i, line in enumerate(bzfin):
            ln = line.decode().rstrip()
            lsplits = ln.split(',', 2)
            try:
                ema_json = json.loads(lsplits[2][1:-1])
                tmpbuf.append(ema_json)
                # ['status', 'current_time', 'timestamp', 'id', 'logSchedule', 'message', 'type', 'operation']
            except Exception as e:
                print(e)
                print(lsplits[2][1:-1])
            count += 1
    groupedbuf = []
    tmp = []
    tmptmp = []
    srtts = -1
    ema_started = False
    for l in tmpbuf:
        # "type": "PRIVACY", "id": "PRIVACY"}
        # print(l)
        if 'type' in l and l['type'] == 'PRIVACY' and l['id'] == 'PRIVACY':
            # print('EMA_STARTED')
            ema_started = True
            if len(tmp):
                for aa in tmp:
                    tmptmp.append(aa)
                tmp = []
            # continue
        if ema_started:
            # if 'message' in l and l['message'] != 'false: some conditions are failed':
            #     if l['message'] != 'true: all conditions okay':
            tmp.append(l)
        if 'status' in l and (l['status'] == 'COMPLETED' or l['status'] == 'MISSED'
                              or l['status'] == 'ABANDONED_BY_TIMEOUT'):
            # print('EMA_ENDED')
            ema_started = False
            if len(tmp):
                groupedbuf.append(tmp)
                tmp = []
            if len(tmptmp):
                tmptmp.append(l)
                groupedbuf.append(tmptmp)
                # print(tmptmp)
                tmptmp = []
        # if 'message' in l and l['message'] == 'false: some conditions are failed':
        if 'message' in l and 'false:' in l['message']:
            # print('EMA_ENDED_AAAAAAAAAAAAAAAAAAAA')
            ema_started = False
            if len(tmp):
                groupedbuf.append(tmp)
                tmp = []
    '''
    for x in groupedbuf:
        for y in x:
            print(y)
    '''
    tab = ','
    dup_list = []
    for x in groupedbuf:
        if not len(x):
            continue
        csv_entry = userid + tab
        if 'status' in x[-1]:
            csv_entry += x[0]['current_time'] + tab + x[-1]['id'] + tab + x[-1]['status']
        else:
            tmpid = x[-1]['id']
            if 'EMA' not in tmpid:
                for y in x:
                    if 'EMA' in y['id']:
                        tmpid = y['id']
                        if 'VALID_BLOCK_' in tmpid:
                            tmpid = tmpid.split('VALID_BLOCK_')[1]
                        break
            else:
                if 'VALID_BLOCK_' in tmpid:
                    tmpid = tmpid.split('VALID_BLOCK_')[1]
            csv_entry += x[-1]['current_time'] + tab + tmpid + tab + 'NOT_DELIVERED'
        allconds = {}
        for cond in x:
            if 'status' in cond:
                continue
            condition = cond['type'] + '-' + cond['id']
            allconds[condition] = cond['message']
        block = -1
        for acond in conditions:
            # print(acond)
            if acond in allconds:
                tmpstr = allconds[acond]
                if 'VALID_BLOCK' in acond:
                    splits = tmpstr.split(':', 1)
                    csv_entry += tab + splits[0].strip() + tab + splits[1].strip()
                    blocks = splits[1].split('block(')
                    if len(blocks) > 1:
                        block = splits[1].split('block(')[1][0]
                else:
                    splits = tmpstr.split(':', 1)
                    csv_entry += tab + splits[0].strip() + tab + splits[1].strip()
            elif acond == 'BLOCK':
                csv_entry += tab + str(block)
                continue
            else:
                csv_entry += tab + tab
        # print(repr(csv_entry))
        csv_entry += '\n'
        if csv_entry not in dup_list:
            csvbuf += csv_entry
            dup_list.append(csv_entry)
        else:
            print('D' * 50)
            # exit(1)
    return csvbuf
from bz2 import BZ2File as bzopen

## Find all IPs that are aliases with both sides of the SACS link in January 2019
ITDK_folders = ['/data/topology/ITDK/ITDK-2019-01/']

### Add list of potential IPs here
# Brazil
Potential_IPs_RB = ['170.238.233.58', '170.238.233.56', '170.238.232.66',
                    '170.238.232.82', '170.238.232.145']
# Angola
Potential_IPs_RA = ['170.238.232.146', '197.149.149.162', '170.238.232.150',
                    '170.238.232.155']

for ITDK_folder in ITDK_folders:
    # reading a bz2 archive
    with bzopen(ITDK_folder + "kapar-midar-iff.nodes.bz2", "r") as bzfin:
        """ Handle lines here """
        lines_RB = []
        List_IPs_RB = []
        lines_RA = []
        List_IPs_RA = []
        for i, line in enumerate(bzfin):
            line = line.decode('utf-8')  # BZ2File yields bytes; decode before substring checks
            """ look for nodes containing potential_IPs_RB """
            for link_IP in Potential_IPs_RB:
                if link_IP in line and line.rstrip() not in lines_RB:
                    lines_RB.append(line.rstrip())
            """ look for nodes containing potential_IPs_RA """
            for link_IP in Potential_IPs_RA:
def main():
    # <http://www.wikidata.org/entity/Q47133351> <http://www.wikidata.org/prop/direct/P356> "10.1002/EJP.1050" .
    REGEX = r'^<http:\/\/www\.wikidata\.org\/entity\/(Q\d+)> <http:\/\/www.wikidata.org\/prop\/direct\/(P\d+)> "(.*?)" \.$'
    manifest = ['P356', 'P698', 'P932', 'P2880']
    dump_location = 'https://dumps.wikimedia.org/wikidatawiki/entities/latest-truthy.nt.bz2'
    to_add = {x: [] for x in manifest}
    print("Downloading latest dump")
    with requests.get(dump_location, stream=True) as r:
        print("Saving dump")
        r.raise_for_status()
        with open('/tmp/latest-truthy.nt.bz2', 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
    print("Opening dump")
    with bzopen('/tmp/latest-truthy.nt.bz2', 'r') as f:
        for line in f:
            line = line.decode('utf-8')
            match = re.match(REGEX, line)
            if match is None:
                continue
            wd_item = match.group(1)
            wd_prop = match.group(2)
            wd_value = match.group(3)
            if wd_prop in manifest:
                # print('Up to', wd_item, end='\r')
                to_add[wd_prop].append((wd_item, wd_value))
                if len(to_add[wd_prop]) >= 10000:
                    print('\nSaving to Redis')
                    wikidata_to_x = {x[0]: x[1] for x in to_add[wd_prop]}
                    x_to_wikidata = {x[1]: x[0] for x in to_add[wd_prop]}
                    REDIS.hmset('{0}_to_wikidata_{1}'.format(wd_prop, today), x_to_wikidata)
                    REDIS.hmset('wikidata_to_{0}_{1}'.format(wd_prop, today), wikidata_to_x)
                    to_add[wd_prop] = []
    # If there are leftovers
    for wd_prop, tuplelist in to_add.items():
        wikidata_to_x = {x[0]: x[1] for x in tuplelist}
        x_to_wikidata = {x[1]: x[0] for x in tuplelist}
        REDIS.hmset('{0}_to_wikidata_{1}'.format(wd_prop, today), x_to_wikidata)
        REDIS.hmset('wikidata_to_{0}_{1}'.format(wd_prop, today), wikidata_to_x)
    # Finalize
    for wd_prop in manifest:
        REDIS.rename('{0}_to_wikidata_{1}'.format(wd_prop, today), '{0}_to_wikidata'.format(wd_prop))
        REDIS.rename('wikidata_to_{0}_{1}'.format(wd_prop, today), 'wikidata_to_{0}'.format(wd_prop))
filename_mask = "output/AA/wiki_%.2d.bz2" num_range = 44 regex = "([^ ]+ 1[0-9]{3} [^ ]+)" pres = defaultdict(int) posts = defaultdict(int) for x in range(num_range): if "%" in filename_mask: filename = filename_mask % x else: filename = filename_mask print('Filename:', filename) with bzopen(filename) as bzin: for line in bzin: try: line = line.decode('utf-8') if line[-1] == '/n': line = line[:-1] begin, end = finde_ausdruck(regex, line) if end: exp = line[begin:end] split = exp.split() pres[split[0]] += 1 posts[split[2]] += 1 except: print("Error in line:", line)
if max_files != -1:
    if files_processed > max_files:
        pickle.dump(charger_data, open("save_charger_data.p", "wb"))
        print("exiting: max files processed, saving pickle file")
        exit()
files_processed += 1

end = f_name.find('.charging-locations.kml.bz2')
time_str = f_name[end - 8:end]
date_str = f_name[end - 19:end - 9]
date_str = date_str.replace('_', '/')
# print(date_str + ' ' + time_str)

# reading a bz2 archive
with bzopen(f_name, "r") as bzfin:
    kml_data = bzfin.read()
try:
    kml_root = pykml.parser.fromstring(kml_data)
    place_list = kml_root.Document.Placemark
except:
    print('ERROR: Captured parse error for file ' + f_name)
    place_list = []
for i in place_list:
    name = str(i.name).encode('utf-8')
    coord = i.Point.coordinates
    desc = str(i.description).encode('utf-8')