# Process parameters
# Number of mandatory positional arguments; <skip-criterion> is optional.
PARAM_NUM = 3
if len(sys.argv) < PARAM_NUM + 1:
    # Every message is a single pre-formatted string, so the printed text is
    # the same whether `print` is a statement or a function.
    print("Expected %d parameters but was %d" % (PARAM_NUM, len(sys.argv) - 1))
    # NOTE(review): the placeholder for the second argument is now spelled
    # <xml-titles-file> everywhere, matching the descriptions below -- confirm
    # that this is the format actually written.
    print("Usage: %s <dbpedia-titles-file> <xml-titles-file> <titles-number> [<skip-criterion>]" % sys.argv[0])
    print("<dbpedia-titles-file>\tInput: DBPedia file with Wikipedia titles")
    print("<xml-titles-file>\tOutput: XML filenames file with 0.9*<titles-number> of entries. Two files will be written -- <xml-titles-file>.train and <xml-titles-file>.valid (0.1*<titles-number>)")
    print("<titles-number>\t\tNumber of titles out of all input titles")
    print("<skip-criterion>\tIf 0 all titles are kept, 1 with at least one space, 2 with at least two spaces, 3 ...")

    # Missing arguments are an error: exit with a non-zero status so calling
    # shells and scripts can detect the failure.
    sys.exit(1)

# Parse the positional command line arguments
dbpedia_fpath = sys.argv[1]
output_fpath = sys.argv[2]
titles_number = int(sys.argv[3])
if len(sys.argv) >= 5:
    # SKIP_CRITERION is only assigned here when a fourth argument is given
    SKIP_CRITERION = int(sys.argv[4])

# Temporary files, all derived from the output path
all_titles_fpath = output_fpath + ".all.csv"
titles_fpath = output_fpath + ".csv"
valid_csv_fpath = titles_fpath + ".valid"
train_csv_fpath = titles_fpath + ".train"

cleanup_dbpedia_titles(dbpedia_fpath, all_titles_fpath)
get_random_sample(all_titles_fpath, titles_fpath, titles_number)
split_csv_texts(titles_fpath, valid_csv_fpath, train_csv_fpath, 0.1)

# Lemmatize each split into its final output file: validation first, then train
for split_suffix, split_csv_fpath in ((".valid", valid_csv_fpath),
                                      (".train", train_csv_fpath)):
    lemmatize_file(split_csv_fpath, output_fpath + split_suffix, True, False)

print("Script has finished successfully.")
    print "<output-file>	XML file with lemmatized filenames (UTF8)"
    sys.exit()

# Read the command line arguments
xml_file = sys.argv[1]
output_file = sys.argv[2]

# Process the data
tmp_file_1 = output_file + ".tmp1"
tmp_file_2 = output_file + ".tmp2"
tmp_file_3 = output_file + ".csv"

escape_xml_file(xml_file, tmp_file_1)
extract_text(tmp_file_1, tmp_file_2)

# Remove duplicate lines from tmp_file_2 and write the unique ones to
# tmp_file_3. The context managers close both files even when reading or
# writing fails, so tmp_file_2 is no longer open when it is removed below.
with open(tmp_file_2) as extracted:
    # Newline-terminate every line before deduplicating: a final line without
    # a newline would otherwise be glued onto whichever line the (unordered)
    # set happens to emit after it.
    uniqlines = set(
        line if line.endswith("\n") else line + "\n" for line in extracted
    )
with open(tmp_file_3, "w") as deduplicated:
    deduplicated.writelines(uniqlines)

split_csv_texts(tmp_file_3, tmp_file_3 + ".valid", tmp_file_3 + ".train", 0.1)
lemmatize_file(tmp_file_3 + ".valid", output_file + ".valid", True, True)
lemmatize_file(tmp_file_3 + ".train", output_file + ".train", True, True)

os.remove(tmp_file_1)
os.remove(tmp_file_2)
# tmp_file_3 (the deduplicated CSV) is left on disk; its removal is disabled:
# os.remove(tmp_file_3)

print("Script has finished successfully.")