def parse_actionlogs(logfile, metafile, output_dir):
    """ Convert actionlog to CSV format

    Args:
      logfile Path to an actionlog file; its name must start with the numeric user id (e.g. '42.actionlog')
      metafile Meta file passed to get_ui_dict_from_meta() to look up UI ids
      output_dir Directory in which to write <logfile name>.csv
    Returns:
    Raises:
    """
    logfile_name = basename(logfile)
    user_id = int(re.search(r'^(\d+)\.', logfile_name).group(1))
    sys.stderr.write('User id: %d%s' % (user_id, os.linesep))
    ui_dict = get_ui_dict_from_meta(metafile)
    with open(logfile) as in_file:
        with open('%s/%s.csv' % (output_dir, logfile_name), 'w') as out_file:
            out_csv = UnicodeWriter(out_file, quoting=csv.QUOTE_ALL)
            for i, line in enumerate(in_file):
                events = line.strip().split('|')
                ui_id = ui_dict[i]
                filtered_events = filter_events(events, user_id, i, ui_id)
                wrote_header = False
                for e in filtered_events:
                    if i == 0 and not wrote_header:
                        out_csv.writerow(list(e._fields))
                        wrote_header = True
                    out_csv.writerow([x for x in e._asdict().itervalues()])
            sys.stderr.write('Parsed %d event logs%s' % (i + 1, os.linesep))
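# Usage sketch (illustrative only; the paths below are hypothetical). The log
# file name must begin with the numeric user id, so a call such as the
# following would write 'csv_out/42.actionlog.csv':
#
#   parse_actionlogs('logs/42.actionlog', 'logs/42.meta', 'csv_out')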
def write_rows_to_csv(row_list, filename):
    """ Write a list of namedtuple objects to a (unicode) CSV file.

    Args:
      row_list A list of namedtuple objects
    Returns:
    Raises:
    """
    with open(filename, 'w') as out_file:
        csv_file = UnicodeWriter(out_file, quoting=csv.QUOTE_ALL)
        write_header = True
        for row in row_list:
            if write_header:
                write_header = False
                csv_file.writerow(list(row._fields))
            csv_file.writerow([x for x in row._asdict().itervalues()])
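# Usage sketch (illustrative only; ExampleRow and its fields are hypothetical
# and not defined elsewhere in this code):
#
#   from collections import namedtuple
#   ExampleRow = namedtuple('ExampleRow', ['userid', 'duration'])
#   rows = [ExampleRow(userid=u'1', duration=u'250'),
#           ExampleRow(userid=u'2', duration=u'410')]
#   write_rows_to_csv(rows, 'example_rows.csv')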
def make_frame(profile_dir, user_data_file, user_id_list):
    """ Converts the HTML profiles in profile_dir and the user data CSV in
    user_data_file to an R data frame written to stdout, restricted to the
    ids in user_id_list when that list is non-empty.

    Args:
    Returns:
    Raises:
    """
    userdata = ptm_file_io.load_raw_user_data(user_data_file)
    rows = []
    for root, dirs, files in os.walk(profile_dir):
        if len(files) > 0:
            has_html = reduce(lambda x, y: x or y,
                              map(lambda x: x.find('html') > 0, files))
            if has_html and root.find('svn') < 0:
                row = get_user_row(root, files, userdata)
                rows.append(row)

    # Output the data frame
    csv_file = UnicodeWriter(sys.stdout, quoting=csv.QUOTE_ALL)
    write_header = True
    for row in rows:
        if user_id_list and row.user_id not in user_id_list:
            continue
        if write_header:
            write_header = False
            csv_file.writerow(list(row._fields))
        csv_file.writerow([x for x in row._asdict().itervalues()])
def make_pause_file(actionlog, output_dir):
    """ Generates a pause duration file from an action log

    Args:
    Returns:
    Raises:
    """
    with open('%s/actionlog.pause.csv' % (output_dir), 'w') as out_file:
        out_csv = UnicodeWriter(out_file, quoting=csv.QUOTE_ALL)
        write_headers = True
        with open(actionlog) as in_file:
            last_event = None
            for event in map(Event._make, UnicodeReader(in_file)):
                if last_event is None or last_event.userid != event.userid:
                    # Skip start event
                    last_event = event
                    continue
                duration = int(event.time) - int(last_event.time)
                # Should probably discard below a set typing speed
                # What is the min pause between keystrokes?
                row = Row(sourceid=event.sourceid,
                          userid=event.userid,
                          duration=str(duration),
                          event_before=last_event.event_class,
                          event_after=event.event_class)
                if write_headers:
                    out_csv.writerow(list(row._fields))
                    write_headers = False
                out_csv.writerow([x for x in row._asdict().itervalues()])
                last_event = event
def generate_csv(pk, lpk, tsv_file):
    """ Convert tsv to csv. Column order is:
        header_row = ['id','lang_id','txt','doc','seg']

    Args:
    Returns:
    Raises:
    """
    in_file = codecs.open(tsv_file, encoding='utf-8')
    out_file_name = basename(tsv_file) + '.csv'
    out_file = open(out_file_name, 'w')
    csv_out = UnicodeWriter(out_file, quoting=csv.QUOTE_ALL)
    n_lines = 0
    for line in in_file:
        n_lines += 1
        (doc_id, seg_id, txt) = line.strip().split('\t')
        row = [str(pk), str(lpk), txt, doc_id, seg_id]
        csv_out.writerow(row)
        pk += 1
    in_file.close()
    out_file.close()
    return n_lines
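# Usage sketch (illustrative only; the file name and key values are hypothetical).
# Each input line must contain doc_id, seg_id and txt separated by tabs, e.g.
#
#   doc01<TAB>0<TAB>Some segment text
#
# Starting the primary key at 1 with language id 2 writes 'corpus.fr.tsv.csv'
# in the current directory and returns the number of segments converted:
#
#   n = generate_csv(1, 2, 'corpus.fr.tsv')
#   print 'Converted %d segments' % n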
def make_frame(directory, user_ids, out_prefix, ref_filename, ranking_file):
    """ Writes the rows returned by get_rows() for each user in user_ids to
    '<out_prefix>.<directory name>.csv'.

    Args:
    Returns:
    Raises:
    """
    rankings = ptm_file_io.load_ranking_file(ranking_file)
    with codecs.open(ref_filename, encoding='utf-8') as ref_infile:
        ref_file = [x.strip() for x in ref_infile.readlines()]
    dir_name = basename(dirname(directory))
    out_file_name = '%s.%s.csv' % (out_prefix, dir_name)
    with open(out_file_name, 'w') as out_file:
        csv_file = UnicodeWriter(out_file, quoting=csv.QUOTE_ALL)
        write_header = True
        for user_id in user_ids:
            for row in get_rows(directory, user_id, ref_file, rankings):
                if write_header:
                    write_header = False
                    csv_file.writerow(list(row._fields))
                csv_file.writerow([x for x in row._asdict().itervalues()])
def test(cursor):
    # Distinct sources referenced by flows, exchange_rates and the sources table
    cursor.execute("""SELECT distinct(source) FROM flows""")
    set_flow = set(cursor)
    print "distinct source in flow", len(set_flow)
    cursor.execute("""SELECT distinct(source) FROM exchange_rates""")
    set_ex = set(cursor)
    print "distinct source in exchange_rates", len(set_ex)
    cursor.execute("""SELECT slug FROM sources""")
    set_source = set(cursor)
    print "nb elem in source", len(set_source)

    # Sources referenced by flows but missing from the sources table
    missing_flow_source_list = set_flow - set_source
    writer = UnicodeWriter(open(os.path.join("../out_data", "missing_flow_source_list" + ".csv"), "wb"))
    writer.writerows(missing_flow_source_list)

    # Sources referenced by exchange_rates but missing from the sources table
    missing_ex_source_list = set_ex - set_source
    writer = UnicodeWriter(open(os.path.join("../out_data", "missing_ex_source_list" + ".csv"), "wb"))
    writer.writerows(missing_ex_source_list)
                next_date = dates[i_date + 1]
                if current_date == next_date - 1:
                    pass
                else:
                    periods[-1] = "%s-%s" % (periods[-1], current_date) \
                        if periods[-1] != current_date else str(current_date)
                    periods.append(next_date)
            else:
                # end 2: end of the date list
                periods[-1] = "%s-%s" % (periods[-1], current_date) \
                    if periods[-1] != current_date else str(current_date)
        row[2] = ",".join(periods)
        if row[1] is None:
            row[1] = "champs vide"  # i.e. "empty field"
    return table


writer = UnicodeWriter(
    open(os.path.join("out_data", 'report_by_sources_and_period.csv'), "wb"))
writer.writerow([description[0] for description in c.description])
data = dateByReportingBySource(c.fetchall())
for d in data:
    print d[2]
writer.writerows(data)
print "-------------------------------------------------------------------------" print "cleaning done" conn.commit() print "commited" print "-------------------------------------------------------------------------" ################################################################################ ## Export all tables in csv files ################################################################################ tables = [ "sources", "entity_names", "RICentities", "exchange_rates", "currencies", "expimp_spegen", "RICentities_groups", "flows" ] for item in tables: c.execute("select * from " + item) writer = UnicodeWriter(open(os.path.join("out_data", item + ".csv"), "wb")) writer.writerow([description[0] for description in c.description]) # c.fetchall() writer.writerows(c) print "export " + item + ".csv done" print "-------------------------------------------------------------------------"
def test(cursor):
    #
    # Get distinct sources in flows, exchange_rates and sources
    #
    cursor.execute("""SELECT distinct(source) FROM flows""")
    set_flow = set(_ for _ in cursor)
    print "distinct source in flow", len(set_flow)
    cursor.execute("""SELECT distinct(source) FROM exchange_rates""")
    set_ex = set(_ for _ in cursor)
    print "distinct source in exchange_rates", len(set_ex)
    cursor.execute("""SELECT distinct(slug) FROM sources""")
    set_source = set(_ for _ in cursor)
    print "nb elem in source", len(set_source)

    #
    # output missing sources in flows
    #
    missing_flow_source_list = set_flow - set_source
    print "flow sources missing in source table", len(missing_flow_source_list)
    with codecs.open(os.path.join("../out_data/logs", "missing_flow_source_list" + ".csv"), "wb", "UTF8") as f:
        for s in missing_flow_source_list:
            f.write((s[0] if s[0] is not None else u"") + u"\n")

    #
    # output missing sources in exchange_rates
    #
    missing_ex_source_list = set_ex - set_source
    print missing_ex_source_list
    print "Exchange rate sources missing in source table", len(missing_ex_source_list)
    with codecs.open(os.path.join("../out_data/logs", "missing_ex_source_list" + ".csv"), "wb", "utf8") as f:
        for s in list(missing_ex_source_list):
            f.write((s[0] if s[0] is not None else u"") + u"\n")

    #
    # output missing sources with id in flows
    #
    missing_flow_source_list_id = []
    flow_matching = 0
    for row in missing_flow_source_list:
        cursor.execute("""SELECT * FROM flows where source=?""", [row[0]])
        table = [list(r) for r in cursor]
        flow_matching += 1
        for row in table:
            missing_flow_source_list_id.append(row)
    unique_flow = []
    for r in missing_flow_source_list_id:
        if r not in unique_flow:
            unique_flow.append(r)
    writer = UnicodeWriter(open(os.path.join("../out_data/logs", "missing_flow_source_list_id" + ".csv"), "wb"))
    writer.writerows(unique_flow)

    #
    # output missing sources with id in exchange_rates
    #
    missing_ex_source_list_id = []
    ex_matching = 0
    for row in missing_ex_source_list:
        cursor.execute("""SELECT * FROM exchange_rates where source=?""", [row[0]])
        table = [list(r) for r in cursor]
        ex_matching += 1
        for row in table:
            missing_ex_source_list_id.append(row)
    unique_ex = []
    for r in missing_ex_source_list_id:
        if r not in unique_ex:
            unique_ex.append(r)
    writer = UnicodeWriter(open(os.path.join("../out_data/logs", "missing_ex_source_list_id" + ".csv"), "wb"))
    writer.writerows(unique_ex)
        next_source = table[i_source + 1][2]
    else:
        current_source = row[0]
        next_source = table[i_source + 1][0]

    if current_source == next_source:
        newCSV.append(row)
    else:
        # csvTitle = unicode(current_source, 'utf-8')
        print newSource
        nameStats.append([current_source, len(newCSV)])
        csvTitle = unicodedata.normalize('NFD', current_source).encode('ascii', 'ignore')
        csvTitle = csvTitle.replace(" ", "_")
        if len(csvTitle) > 255:
            csvTitle = csvTitle[:200]
        try:
            writer = UnicodeWriter(open(os.path.join("./out_data/sources", csvTitle + '.csv'), "w"))
            writer.writerow([description[0] for description in c.description])
            writer.writerows(newCSV)
            newCSV = []
        except IOError as e:
            print "I/O error({0}): {1}".format(e.errno, e.strerror)
            elem = csvTitle.encode('utf8')
            errors.append(elem)

errorsNameFormat = open('./out_data/errors/errorsNameFormat.txt', 'w')
for item in errors:
    print >> errorsNameFormat, item
print "errorsNameFormat.txt done"