# Process parameters
#
# Command-line entry section: validates the argument count, reads the
# arguments and runs the title pipeline (clean-up -> random sample ->
# train/valid split -> lemmatization of both parts).
PARAM_NUM = 3
if len(sys.argv) < PARAM_NUM + 1:
    # Each print takes one parenthesised string, which produces the same text
    # under the Python 2 print statement and the Python 3 print function.
    # The message text itself is kept verbatim.
    # NOTE(review): the usage line names the second argument
    # <csv-titles-file>, while the descriptions call it <xml-titles-file> /
    # <xml-title-file>; confirm which name is intended.
    print("Expected %d parameters but was %d" % (PARAM_NUM, len(sys.argv) - 1))
    print(
        "Usage: %s <dbpedia-titles-file> <csv-titles-file> <titles-number> "
        "[<skip-criterion>]" % sys.argv[0]
    )
    print("<dbpedia-titles-file>\tInput: DBPedia file with Wikipedia titles")
    print(
        "<xml-titles-file>\tOuput: XML filenames file with 0.9*<titles-number> of entries. "
        "Two files will be written -- <xml-titles-file>.train and <xml-title-file>.valid "
        "(0.1*<titles-number>)"
    )
    print("<titles-number>\t\tNumber of titles out of all input titles")
    print(
        "<skip-criterion>\tIf 0 all titles are keps, 1 with at least one space, "
        "2 with at least two space, 3 ..."
    )
    # A usage error is a failure: exit with a non-zero status (2 is the
    # conventional code for command-line syntax errors) so that calling
    # scripts do not mistake it for a successful run.
    sys.exit(2)

# Read the command line arguments
dbpedia_fpath = sys.argv[1]
output_fpath = sys.argv[2]
titles_number = int(sys.argv[3])  # raises ValueError if not an integer
if len(sys.argv) > PARAM_NUM + 1:
    # Optional fourth argument. When it is absent SKIP_CRITERION is not
    # assigned here, so any default has to come from elsewhere in the file
    # (not visible in this section).
    SKIP_CRITERION = int(sys.argv[4])

# Temporary files (derived from the output path; this section does not
# delete them afterwards).
all_titles_fpath = output_fpath + ".all.csv"
titles_fpath = output_fpath + ".csv"

cleanup_dbpedia_titles(dbpedia_fpath, all_titles_fpath)
get_random_sample(all_titles_fpath, titles_fpath, titles_number)

# Per the usage text, the .valid part receives 0.1 of the sampled titles and
# the .train part the remaining 0.9.
split_csv_texts(titles_fpath, titles_fpath + ".valid", titles_fpath + ".train", 0.1)

# NOTE(review): the two trailing booleans are positional flags whose meaning
# is defined by lemmatize_file, which is not visible here.
lemmatize_file(titles_fpath + ".valid", output_fpath + ".valid", True, False)
lemmatize_file(titles_fpath + ".train", output_fpath + ".train", True, False)

print("Script has finished successfully.")
# Command-line entry section: escapes the input XML, extracts its text,
# removes duplicate lines, splits the result into train/valid parts and
# lemmatizes both parts.
#
# NOTE(review): this section begins part-way through the usage message; the
# header of the guard that enclosed the two statements below is not visible.
# The condition is reconstructed as the minimal precondition for the
# sys.argv[1] / sys.argv[2] reads further down -- confirm against the full
# script.
if len(sys.argv) < 3:
    print("<output-file> XML file with lemmatized filenames (UTF8)")
    # A usage error is a failure: exit with a non-zero status (2 is the
    # conventional code for command-line syntax errors) so that calling
    # scripts do not mistake it for a successful run.
    sys.exit(2)

# Read the command line arguments
xml_file = sys.argv[1]
output_file = sys.argv[2]

# Process the data
tmp_file_1 = output_file + ".tmp1"
tmp_file_2 = output_file + ".tmp2"
tmp_file_3 = output_file + ".csv"

escape_xml_file(xml_file, tmp_file_1)
extract_text(tmp_file_1, tmp_file_2)

# Remove duplicate lines from tmp_file_2 and write the unique ones to
# tmp_file_3. Both files are opened with `with` so the handles are closed
# even if reading or writing fails. A set has no defined iteration order, so
# the order of lines in tmp_file_3 is unspecified.
with open(tmp_file_2) as extracted_file:
    uniqlines = set(extracted_file)
with open(tmp_file_3, "w") as deduped_file:
    deduped_file.writelines(uniqlines)

split_csv_texts(tmp_file_3, tmp_file_3 + ".valid", tmp_file_3 + ".train", 0.1)

# NOTE(review): the two trailing booleans are positional flags whose meaning
# is defined by lemmatize_file, which is not visible here.
lemmatize_file(tmp_file_3 + ".valid", output_file + ".valid", True, True)
lemmatize_file(tmp_file_3 + ".train", output_file + ".train", True, True)

# Only the first two intermediate files are deleted; tmp_file_3 (and the
# .valid/.train files split from it) stay on disk.
os.remove(tmp_file_1)
os.remove(tmp_file_2)

print("Script has finished successfully.")