def parse_actionlogs(logfile, metafile, output_dir):
    """ Convert actionlog to CSV format

    Args:
      logfile: path to an action log named '<userid>.<...>'; each line is
        a '|'-separated list of events.
      metafile: metadata file mapped to {line index: ui id} by
        get_ui_dict_from_meta.
      output_dir: directory that receives '<logfile basename>.csv'.
    Returns:
      None. Writes the CSV and logs progress to stderr.
    Raises:
      AttributeError: if the log filename does not start with a numeric id.
      KeyError: if a line index is missing from the metadata mapping.
    """
    logfile_name = basename(logfile)
    # The leading digits of the filename encode the user id.
    user_id = int(re.search(r'^(\d+)\.', logfile_name).group(1))
    sys.stderr.write('User id: %d%s' % (user_id, os.linesep))
    ui_dict = get_ui_dict_from_meta(metafile)
    n_lines = 0
    with open(logfile) as in_file:
        with open('%s/%s.csv' % (output_dir, logfile_name), 'w') as out_file:
            out_csv = UnicodeWriter(out_file, quoting=csv.QUOTE_ALL)
            for i, line in enumerate(in_file):
                n_lines = i + 1
                events = line.strip().split('|')
                ui_id = ui_dict[i]
                filtered_events = filter_events(events, user_id, i, ui_id)
                wrote_header = False
                for e in filtered_events:
                    # Emit the header once, from the first event's fields.
                    if i == 0 and not wrote_header:
                        out_csv.writerow(list(e._fields))
                        wrote_header = True
                    out_csv.writerow(list(e._asdict().values()))
    # BUG FIX: the original read loop variable 'i' here, which is unbound
    # (NameError) when the log file is empty; track the count explicitly.
    sys.stderr.write('Parsed %d event logs%s' % (n_lines, os.linesep))
# ---- Beispiel (Example) #2 -- scraped separator artifact ----
def write_rows_to_csv(row_list, filename):
    """
    Write a list of namedtuple objects to a (unicode) CSV file.

    The first namedtuple's _fields become the header row; an empty
    list produces an empty file.

    Args:
     row_list A list of namedtuple objects
    Returns:
    Raises:
    """
    with open(filename, 'w') as out_handle:
        writer = UnicodeWriter(out_handle, quoting=csv.QUOTE_ALL)
        first_row = True
        for record in row_list:
            if first_row:
                writer.writerow(list(record._fields))
                first_row = False
            writer.writerow([v for v in record._asdict().itervalues()])
def parse_actionlogs(logfile, metafile, output_dir):
    """ Convert actionlog to CSV format

    Args:
      logfile: action log path named '<userid>.<...>'; one '|'-separated
        event list per line.
      metafile: metadata file turned into {line index: ui id} by
        get_ui_dict_from_meta.
      output_dir: destination directory for '<logfile basename>.csv'.
    Returns:
      None. Writes the CSV file and reports progress on stderr.
    Raises:
      AttributeError: if the filename does not begin with a numeric id.
      KeyError: if a line index is absent from the metadata mapping.
    """
    logfile_name = basename(logfile)
    # User id is encoded as the leading digits of the log filename.
    user_id = int(re.search(r'^(\d+)\.', logfile_name).group(1))
    sys.stderr.write('User id: %d%s' % (user_id, os.linesep))
    ui_dict = get_ui_dict_from_meta(metafile)
    n_lines = 0
    with open(logfile) as in_file:
        with open('%s/%s.csv' % (output_dir, logfile_name), 'w') as out_file:
            out_csv = UnicodeWriter(out_file, quoting=csv.QUOTE_ALL)
            for i, line in enumerate(in_file):
                n_lines = i + 1
                events = line.strip().split('|')
                ui_id = ui_dict[i]
                filtered_events = filter_events(events, user_id, i, ui_id)
                wrote_header = False
                for e in filtered_events:
                    # Header is written once, from the first event's fields.
                    if i == 0 and not wrote_header:
                        out_csv.writerow(list(e._fields))
                        wrote_header = True
                    out_csv.writerow(list(e._asdict().values()))
    # BUG FIX: 'i' was referenced after the loop and is unbound (NameError)
    # for an empty input file; use an explicitly-tracked line count.
    sys.stderr.write('Parsed %d event logs%s' % (n_lines, os.linesep))
def make_frame(profile_dir, user_data_file, user_id_list):
    """ Converts HTML in profile_dir, CSV in id_list_file, and
    CSV in user_data_file to an R data frame.

    Args:
      profile_dir: root directory walked for per-user profile folders.
      user_data_file: raw user data CSV (loaded via ptm_file_io).
      user_id_list: optional collection of user ids to keep; falsy keeps all.
    Returns:
      None. Writes a fully-quoted CSV data frame to stdout.
    Raises:
      IOError: if user_data_file cannot be read.
    """
    userdata = ptm_file_io.load_raw_user_data(user_data_file)

    rows = []
    for root, dirs, files in os.walk(profile_dir):
        if files:
            # Idiom fix: any() replaces the reduce/map boolean fold (and
            # avoids the bare 'reduce' builtin, which is py2-only).
            # NOTE: '.find(...) > 0' (not >= 0) is preserved from the
            # original -- a filename that *starts* with 'html' is skipped.
            has_html = any(f.find('html') > 0 for f in files)
            if has_html and root.find('svn') < 0:
                rows.append(get_user_row(root, files, userdata))

    # Output the data frame; header comes from the first emitted row.
    csv_file = UnicodeWriter(sys.stdout, quoting=csv.QUOTE_ALL)
    write_header = True
    for row in rows:
        if user_id_list and row.user_id not in user_id_list:
            continue
        if write_header:
            write_header = False
            csv_file.writerow(list(row._fields))
        csv_file.writerow(list(row._asdict().values()))
def make_pause_file(actionlog, output_dir):
    """ Generates a pause duration file from an action log

    Args:
      actionlog: CSV of Event rows (one row per logged UI event).
      output_dir: directory that receives 'actionlog.pause.csv'.
    Returns:
      None. Writes one Row per consecutive same-user event pair, with the
      inter-event duration and the surrounding event classes.
    Raises:
      ValueError: if an event 'time' field is not an integer string.
    """
    with open('%s/actionlog.pause.csv' % output_dir, 'w') as out_file:
        out_csv = UnicodeWriter(out_file, quoting=csv.QUOTE_ALL)
        write_headers = True
        with open(actionlog) as in_file:
            # BUG FIX: the original created an unused UnicodeReader wrapped
            # around out_file (the *output* handle); the dead line is removed.
            last_event = None
            for event in map(Event._make, UnicodeReader(in_file)):
                if last_event is None or last_event.userid != event.userid:
                    # Skip start event (first event seen for each user)
                    last_event = event
                    continue
                duration = int(event.time) - int(last_event.time)
                # Should probably discard below a set typing speed
                # What is the min pause between keystrokes?
                row = Row(sourceid=event.sourceid,
                          userid=event.userid,
                          duration=str(duration),
                          event_before=last_event.event_class,
                          event_after=event.event_class)
                if write_headers:
                    out_csv.writerow(list(row._fields))
                    write_headers = False
                out_csv.writerow(list(row._asdict().values()))
                last_event = event
# ---- Beispiel (Example) #6 -- scraped separator artifact ----
def generate_csv(pk, lpk, tsv_file):
    """ Convert tsv to csv. Column order is:
        header_row = ['id','lang_id','txt','doc','seg']

    Args:
      pk: starting primary key; incremented once per output row.
      lpk: language primary key, repeated on every row.
      tsv_file: UTF-8 TSV with columns doc_id, seg_id, txt.
    Returns:
      The number of input lines converted.
    Raises:
      ValueError: if a line does not split into exactly three tab fields.
    """
    out_file_name = basename(tsv_file) + '.csv'
    n_lines = 0
    # Robustness fix: context managers close both handles even when a
    # malformed line raises mid-loop (the original leaked on that path).
    with codecs.open(tsv_file, encoding='utf-8') as in_file:
        with open(out_file_name, 'w') as out_file:
            csv_out = UnicodeWriter(out_file, quoting=csv.QUOTE_ALL)
            for line in in_file:
                n_lines += 1
                (doc_id, seg_id, txt) = line.strip().split('\t')
                csv_out.writerow([str(pk), str(lpk), txt, doc_id, seg_id])
                pk += 1
    return n_lines
def generate_csv(pk, lpk, tsv_file):
    """ Convert tsv to csv. Column order is:
        header_row = ['id','lang_id','txt','doc','seg']

    The output file '<basename>.csv' is created in the working directory.

    Args:
      pk: first primary key value, incremented for each row written.
      lpk: language primary key repeated on every row.
      tsv_file: UTF-8 TSV input (doc_id, seg_id, txt per line).
    Returns:
      Count of converted lines.
    Raises:
      ValueError: on a line without exactly three tab-separated fields.
    """
    source = codecs.open(tsv_file, encoding='utf-8')
    target_name = basename(tsv_file) + '.csv'
    target = open(target_name, 'w')
    writer = UnicodeWriter(target, quoting=csv.QUOTE_ALL)
    row_count = 0
    for record in source:
        row_count += 1
        doc_id, seg_id, txt = record.strip().split('\t')
        writer.writerow([str(pk), str(lpk), txt, doc_id, seg_id])
        pk += 1
    source.close()
    target.close()
    return row_count
def make_frame(directory, user_ids, out_prefix, ref_filename, ranking_file):
    """Write per-user rows for one experiment directory to a CSV file.

    Args:
      directory: experiment directory; its parent's basename names the CSV.
      user_ids: iterable of user ids to extract rows for.
      out_prefix: output filename prefix ('<prefix>.<dirname>.csv').
      ref_filename: UTF-8 reference file, one segment per line.
      ranking_file: ranking CSV loaded via ptm_file_io.
    Returns:
      None. The header row comes from the first emitted row's fields.
    Raises:
      IOError: if the reference or ranking file cannot be opened.
    """
    rankings = ptm_file_io.load_ranking_file(ranking_file)

    with codecs.open(ref_filename, encoding='utf-8') as handle:
        references = [segment.strip() for segment in handle.readlines()]

    frame_name = '%s.%s.csv' % (out_prefix, basename(dirname(directory)))
    with open(frame_name, 'w') as sink:
        writer = UnicodeWriter(sink, quoting=csv.QUOTE_ALL)
        wrote_header = False
        for user_id in user_ids:
            for row in get_rows(directory, user_id, references, rankings):
                if not wrote_header:
                    writer.writerow(list(row._fields))
                    wrote_header = True
                writer.writerow([field for field in row._asdict().itervalues()])
# ---- Beispiel (Example) #9 -- scraped separator artifact ----
def write_rows_to_csv(row_list, filename):
    """
    Write a list of namedtuple objects to a (unicode) CSV file.

    The field names of the first namedtuple become the header row;
    an empty list yields an empty file.

    Args:
     row_list A list of namedtuple objects
    Returns:
    Raises:
    """
    with open(filename, 'w') as sink:
        writer = UnicodeWriter(sink, quoting=csv.QUOTE_ALL)
        for index, record in enumerate(row_list):
            if index == 0:
                writer.writerow(list(record._fields))
            writer.writerow([v for v in record._asdict().itervalues()])
def make_frame(directory, user_ids, out_prefix, ref_filename, ranking_file):
    """Extract rows for each listed user into '<prefix>.<dirname>.csv'.

    Args:
      directory: experiment directory; its parent directory's basename is
        embedded in the output filename.
      user_ids: iterable of user ids whose rows are extracted.
      out_prefix: prefix for the output CSV filename.
      ref_filename: UTF-8 reference translations, one per line.
      ranking_file: ranking CSV consumed by ptm_file_io.
    Returns:
      None. Header row is taken from the first row's namedtuple fields.
    Raises:
      IOError: if an input file cannot be read.
    """
    rankings = ptm_file_io.load_ranking_file(ranking_file)

    with codecs.open(ref_filename, encoding='utf-8') as ref_handle:
        reference_lines = [entry.strip() for entry in ref_handle.readlines()]

    parent_name = basename(dirname(directory))
    with open('%s.%s.csv' % (out_prefix, parent_name), 'w') as out_handle:
        writer = UnicodeWriter(out_handle, quoting=csv.QUOTE_ALL)
        header_pending = True
        for user_id in user_ids:
            for row in get_rows(directory, user_id, reference_lines, rankings):
                if header_pending:
                    header_pending = False
                    writer.writerow(list(row._fields))
                writer.writerow([x for x in row._asdict().itervalues()])
# ---- Beispiel (Example) #11 -- scraped separator artifact ----
def test(cursor):
	# Referential-integrity check (Python 2 script style): every 'source'
	# value used in flows / exchange_rates should exist in sources.slug.
	# Missing values are dumped to CSV files under ../out_data.
	cursor.execute("""SELECT distinct(source) FROM flows""")
	set_flow= set(cursor)
	print "distinct source in flow", len(set_flow) 

	cursor.execute("""SELECT distinct(source) FROM exchange_rates""")
	set_ex = set(cursor)
	print "distinct source in exchange_rates", len(set_ex)

	cursor.execute("""SELECT slug FROM sources""")
	set_source = set(cursor)

	print "nb elem in source", len(set_source)
 
	# Set difference works because each set holds 1-tuples from the cursor.
	missing_flow_source_list = set_flow - set_source

	# NOTE(review): file handles opened inline are never closed explicitly;
	# they are released only when garbage-collected.
	writer = UnicodeWriter(open(os.path.join("../out_data", "missing_flow_source_list" + ".csv"), "wb"))
	writer.writerows(missing_flow_source_list)


	missing_ex_source_list = set_ex - set_source

	writer = UnicodeWriter(open(os.path.join("../out_data", "missing_ex_source_list" + ".csv"), "wb"))
	writer.writerows(missing_ex_source_list)
# ---- Beispiel (Example) #12 -- scraped separator artifact ----
                # NOTE(review): fragment -- the enclosing def/loops start
                # outside this chunk. Collapses sorted dates into period
                # strings: consecutive dates extend the open period; a gap
                # closes it as 'start-end' (or a bare date) and opens a new
                # period at next_date.
                next_date = dates[i_date + 1]
                if current_date == next_date - 1:
                    pass
                else:
                    periods[-1] = "%s-%s" % (
                        periods[-1], current_date
                    ) if periods[-1] != current_date else str(current_date)
                    periods.append(next_date)

            else:
                # end case 2: end of the list -- close the final open period.
                periods[-1] = "%s-%s" % (
                    periods[-1], current_date
                ) if periods[-1] != current_date else str(current_date)

        # Column 2 becomes the comma-joined period list; NULL values in
        # column 1 are replaced with the 'champs vide' placeholder.
        row[2] = ",".join(periods)
        if row[1] == None:
            row[1] = "champs vide"

    return table


# Export the per-source reporting-period frame (Python 2 script code).
# Column headers come from the most recently executed query on cursor 'c'.
writer = UnicodeWriter(
    open(os.path.join("out_data", 'report_by_sources_and_period.csv'), "wb"))
writer.writerow([description[0] for description in c.description])

# dateByReportingBySource collapses per-date rows into period strings
# (column index 2) before the frame is written out.
data = dateByReportingBySource(c.fetchall())
for d in data:
    print d[2]
writer.writerows(data)
# ---- Beispiel (Example) #13 -- scraped separator artifact ----
print "-------------------------------------------------------------------------"

# Persist the cleaning pass before exporting (Python 2 script code).
print "cleaning done"
conn.commit()
print "commited"
print "-------------------------------------------------------------------------"

################################################################################
##			Export all tables in csv files
################################################################################

# Every table listed here is dumped to out_data/<table>.csv with a header
# row taken from the cursor description.
tables = [
		"sources",
		"entity_names",
		"RICentities",
		"exchange_rates",
		"currencies",
		"expimp_spegen",
		"RICentities_groups",
		"flows"
		]

for item in tables:
	# Table names come from the fixed list above, so string concatenation
	# is safe here (no untrusted input reaches the SQL).
	c.execute("select * from " + item)
	writer = UnicodeWriter(open(os.path.join("out_data", item + ".csv"), "wb"))
	writer.writerow([description[0] for description in c.description])
	# c.fetchall()
	# Streaming the cursor directly avoids materializing the whole table.
	writer.writerows(c)
	print "export " + item + ".csv done"
	print "-------------------------------------------------------------------------"
# ---- Beispiel (Example) #14 -- scraped separator artifact ----
def test(cursor):
	# Referential-integrity audit (Python 2 script style): find 'source'
	# values used in flows / exchange_rates that are missing from sources,
	# then dump both the bare values and the full matching rows to CSV.

	#
	# Get distinct source in flows, exchange_rates and sources
	#
	cursor.execute("""SELECT distinct(source) FROM flows""")
	set_flow= set(_ for _ in cursor)
	print "distinct source in flow", len(set_flow) 

	cursor.execute("""SELECT distinct(source) FROM exchange_rates""")
	set_ex = set(_ for _ in cursor)
	print "distinct source in exchange_rates", len(set_ex)

	cursor.execute("""SELECT distinct(slug) FROM sources""")
	set_source = set(_ for _ in cursor)
	print "nb elem in source", len(set_source)
 
 	#
	# output missing source in flows
	#
	# Each set element is a 1-tuple; s[0] may be NULL/None in the data.
	missing_flow_source_list = set_flow - set_source
	print "flow sources missin in source table", len(missing_flow_source_list)
	with codecs.open(os.path.join("../out_data/logs", "missing_flow_source_list" + ".csv"), "wb","UTF8") as f:
		for s in missing_flow_source_list:
			f.write((s[0] if s[0] is not None else u"") +u"\n")

	#
	# output missing source in exchange_rates
	#
	missing_ex_source_list = set_ex - set_source
	print missing_ex_source_list
	print "Exchange rate missing in source table", len(missing_ex_source_list)
	with codecs.open(os.path.join("../out_data/logs", "missing_ex_source_list" + ".csv"), "wb","utf8") as f:
		for s in list(missing_ex_source_list):
			f.write((s[0]  if s[0] is not None else u"") + u"\n")

	#
	# output missing source with id in flows
	#
	missing_flow_source_list_id =[]
	flow_matching = 0

	# NOTE(review): the inner 'for row in table' reuses (shadows) the outer
	# loop variable 'row'; harmless here since the outer value is not read
	# again after the inner loop, but fragile. flow_matching counts queried
	# sources, not matched rows, and is never reported.
	for row in missing_flow_source_list:
		cursor.execute("""SELECT * FROM flows where source=?""",[row[0]])
		table = [list(r) for r in cursor]
		flow_matching+=1
		for row in table:
			missing_flow_source_list_id.append(row)

	# De-duplicate rows (lists are unhashable, so no set is used here;
	# this membership test is O(n^2) on the row count).
	unique_flow = []
	for r in missing_flow_source_list_id:
		if r not in unique_flow:
			unique_flow.append(r)

	writer = UnicodeWriter(open(os.path.join("../out_data/logs", "missing_flow_source_list_id" + ".csv"), "wb"))
	writer.writerows(unique_flow)

	#
	# output missing source with id in exchange_rates
	#
	missing_ex_source_list_id =[]
	ex_matching = 0

	# Same pattern as above, applied to exchange_rates.
	for row in missing_ex_source_list:
		cursor.execute("""SELECT * FROM exchange_rates where source=?""",[row[0]])
		table = [list(r) for r in cursor]
		ex_matching+=1
		for row in table:
			# print row
			missing_ex_source_list_id.append(row)

	unique_ex = []
	for r in missing_ex_source_list_id:
		if r not in unique_ex:
			unique_ex.append(r)

	writer = UnicodeWriter(open(os.path.join("../out_data/logs", "missing_ex_source_list_id" + ".csv"), "wb"))
	writer.writerows(unique_ex)
# ---- Beispiel (Example) #15 -- scraped separator artifact ----
            # NOTE(review): fragment -- the enclosing loop/def starts outside
            # this chunk. Appears to split a master table into one CSV per
            # source: rows accumulate in newCSV while the source is unchanged,
            # then the batch is flushed to ./out_data/sources/<source>.csv.
            next_source = table[i_source+1][2]
        else:
            current_source = row[0]
            next_source = table[i_source+1][0]
        if (current_source == next_source):
            newCSV.append(row);
        else:
        # csvTitle = unicode(current_source, 'utf-8')
            print newSource
            nameStats.append([current_source, len(newCSV)])
            # Build a filesystem-safe ASCII title: strip diacritics, replace
            # spaces, and truncate to stay under the 255-char filename limit.
            csvTitle = unicodedata.normalize('NFD', current_source).encode('ascii', 'ignore')
            csvTitle = csvTitle.replace(" ", "_")
            if len(csvTitle) > 255:
                csvTitle = csvTitle[:200]
            try:
                writer = UnicodeWriter(open(os.path.join("./out_data/sources", csvTitle +'.csv'), "w"))
                writer.writerow([description[0] for description in c.description])
                writer.writerows(newCSV)
                newCSV = []
            except IOError as e:
                # Record the failing name and continue with the next source.
                print "I/O error({0}): {1}".format(e.errno, e.strerror)
                elem = csvTitle.encode('utf8')
                errors.append(elem)
                pass


# Dump the names whose CSV export failed with an IOError, one per line
# (Python 2 'print >> file' redirection syntax).
errorsNameFormat = open('./out_data/errors/errorsNameFormat.txt', 'w')
for item in errors:
    print>>errorsNameFormat, item
print "errorsNameFormat.txt done"