def main(args):
    """Build one XML training dataset from positive and negative text files.

    Three positional parameters are read from ``sys.argv``: the positive
    input CSV, the negative input CSV and the output dataset path. Both
    inputs are lemmatized into temporary XML files next to the output,
    the two files are combined with ``cat_files`` into the output path, and
    the temporary files are removed afterwards (also when a step fails).

    Args:
        args: Unused; the parameters are taken from ``sys.argv`` directly.

    Raises:
        SystemExit: With status 1 when fewer than three parameters are given.
    """
    # Process parameters
    PARAM_NUM = 3
    if len(sys.argv) < PARAM_NUM + 1:
        # Single-argument print(...) emits the same text on Python 2 and 3.
        print("This script creates a training dataset from a set of positive and negative texts")
        print("Expected %s parameters but was %s" % (PARAM_NUM, len(sys.argv) - 1))
        print("Usage: %s <input-positive> <input-negative> <output-dataset>" % sys.argv[0])
        print("<input-positive>\t\tAn input CSV file with one text per line (positive training examples)")
        print("<input-negative>\t\tAn input CSV file with one text per line (negative training examples)")
        print("<output-dataset>\t\tAn output file with the training dataset in XML format.")
        # A usage error must not look like success to the calling shell.
        sys.exit(1)

    # Read the command line arguments
    positive_fpath = sys.argv[1]
    negative_fpath = sys.argv[2]
    output_fpath = sys.argv[3]

    positive_xml = output_fpath + ".positive.xml"
    negative_xml = output_fpath + ".negative.xml"

    try:
        # Lemmatize positive and negative texts. The last argument is True for
        # the positive file and False for the negative one -- presumably the
        # class label; confirm against lemmatize_file.
        lemmatize_file(positive_fpath, positive_xml, True, True)
        lemmatize_file(negative_fpath, negative_xml, True, False)

        # Cat xml files
        cat_files(positive_xml, negative_xml, output_fpath, "<texts>", "</texts>")
    finally:
        # Remove temporary files, even if a step above raised. A file that
        # was never created is skipped so the original error is not masked.
        for tmp_fpath in (positive_xml, negative_xml):
            if os.path.exists(tmp_fpath):
                os.remove(tmp_fpath)

    print("Script has finished successfully.")
def main(args):
    """Create ``train.xml`` / ``valid.xml`` datasets in an output directory.

    Parameters are read from ``sys.argv``: the positive input CSV, the
    negative input CSV, the output directory and an optional split fraction
    (default 0.9). The output directory is created if needed and three
    resource files are copied into it. Both inputs are then split with
    ``split_csv_texts``, every part is lemmatized, the parts are combined
    with ``cat_files`` and all intermediate files are removed (also when a
    step fails).

    Args:
        args: Unused; the parameters are taken from ``sys.argv`` directly.

    Raises:
        SystemExit: With status 1 when fewer than three parameters are given.
    """
    # Process parameters
    PARAM_NUM = 3
    if len(sys.argv) < PARAM_NUM + 1:
        # Single-argument print(...) emits the same text on Python 2 and 3.
        print("This script creates train and test datasets from a set of positive and negative texts samples")
        print("Expected %s parameters but was %s" % (PARAM_NUM, len(sys.argv) - 1))
        # The usage line now names the single output directory the code reads.
        print("Usage: %s <input-positive> <input-negative> <output-dir> [<split-fraction>]" % sys.argv[0])
        print("<input-positive>\t\tAn input CSV file with one text per line (positive training examples)")
        print("<input-negative>\t\tAn input CSV file with one text per line (negative training examples)")
        print("<output-dir>\t\tAn output directory with a dataset ready to train.")
        print("<split-fraction>\tSplit fraction of the texts (in (0;1), default 0.9 => 10/90)")
        # A usage error must not look like success to the calling shell.
        sys.exit(1)

    # Read the command line arguments
    positive_fpath = sys.argv[1]
    negative_fpath = sys.argv[2]
    output_fpath = sys.argv[3]
    # NOTE(review): the "test" parts are written to train.xml and the "train"
    # parts to valid.xml. This mapping is kept exactly as it was; whether it
    # is intended depends on how split_csv_texts distributes the fraction --
    # confirm before renaming anything.
    test_fpath = os.path.join(output_fpath, "train.xml")
    train_fpath = os.path.join(output_fpath, "valid.xml")
    if len(sys.argv) > PARAM_NUM + 1:
        SPLIT_PERCENT = float(sys.argv[PARAM_NUM + 1])
    else:
        SPLIT_PERCENT = 0.9

    # Initialize the directory (makedirs also creates missing parents)
    if not os.path.exists(output_fpath):
        os.makedirs(output_fpath)
    # The resource paths are relative to the current working directory.
    for resource in ("stopos.csv", "stopwords.csv", "relations.csv"):
        shutil.copy2("./../data/test/" + resource, os.path.join(output_fpath, resource))

    # Intermediate files, keyed by (label, split, extension).
    parts = {}
    for label in ("positive", "negative"):
        for split in ("test", "train"):
            for ext in ("csv", "xml"):
                parts[label, split, ext] = os.path.join(
                    output_fpath, "%s.%s.%s" % (label, split, ext))

    try:
        # Split positive and negative texts
        split_csv_texts(positive_fpath, parts["positive", "train", "csv"],
                        parts["positive", "test", "csv"], SPLIT_PERCENT)
        split_csv_texts(negative_fpath, parts["negative", "train", "csv"],
                        parts["negative", "test", "csv"], SPLIT_PERCENT)

        # Lemmatize positive and negative texts. The last argument is True
        # for positive parts and False for negative ones -- presumably the
        # class label; confirm against lemmatize_file.
        for label, label_flag in (("positive", True), ("negative", False)):
            for split in ("test", "train"):
                lemmatize_file(parts[label, split, "csv"],
                               parts[label, split, "xml"], True, label_flag)

        # Cat test files
        cat_files(parts["positive", "test", "xml"], parts["negative", "test", "xml"],
                  test_fpath, "<texts>", "</texts>")
        cat_files(parts["positive", "train", "xml"], parts["negative", "train", "xml"],
                  train_fpath, "<texts>", "</texts>")
    finally:
        # Remove temporary files, even if a step above raised. A file that
        # was never created is skipped so the original error is not masked.
        for tmp_fpath in parts.values():
            if os.path.exists(tmp_fpath):
                os.remove(tmp_fpath)

    print("Script has finished successfully.")
# Script fragment: sample DBPedia titles and write lemmatized
# <output>.valid / <output>.train files.
#
# Reads from sys.argv: the DBPedia titles file, the output path prefix, the
# number of titles to sample and an optional skip criterion.

# Process parameters
PARAM_NUM = 3
if len(sys.argv) < PARAM_NUM + 1:
    # Single-argument print(...) emits the same text on Python 2 and 3.
    print("Expected %s parameters but was %s" % (PARAM_NUM, len(sys.argv) - 1))
    # The usage line and the description now use the same parameter name.
    print("Usage: %s <dbpedia-titles-file> <xml-titles-file> <titles-number> [<skip-criterion>]" % sys.argv[0])
    print("<dbpedia-titles-file>\tInput: DBPedia file with Wikipedia titles")
    print("<xml-titles-file>\tOutput: XML filenames file with 0.9*<titles-number> of entries. Two files will be written -- <xml-titles-file>.train and <xml-titles-file>.valid (0.1*<titles-number>)")
    print("<titles-number>\t\tNumber of titles out of all input titles")
    print("<skip-criterion>\tIf 0 all titles are kept, 1 with at least one space, 2 with at least two spaces, 3 ...")
    # A usage error must not look like success to the calling shell.
    sys.exit(1)

# Read the command line arguments
dbpedia_fpath = sys.argv[1]
output_fpath = sys.argv[2]
titles_number = int(sys.argv[3])
# NOTE(review): SKIP_CRITERION is only assigned when the optional parameter
# is given and is not used by the code visible here -- presumably it is read
# elsewhere (e.g. by cleanup_dbpedia_titles); confirm it has a default there.
if len(sys.argv) > 4:
    SKIP_CRITERION = int(sys.argv[4])

# Temporary files
all_titles_fpath = output_fpath + ".all.csv"
titles_fpath = output_fpath + ".csv"

cleanup_dbpedia_titles(dbpedia_fpath, all_titles_fpath)
get_random_sample(all_titles_fpath, titles_fpath, titles_number)
split_csv_texts(titles_fpath, titles_fpath + ".valid", titles_fpath + ".train", 0.1)
lemmatize_file(titles_fpath + ".valid", output_fpath + ".valid", True, False)
lemmatize_file(titles_fpath + ".train", output_fpath + ".train", True, False)

print("Script has finished successfully.")
# Script fragment: split the positive and negative CSV inputs, lemmatize
# every part and combine the parts into the test and train XML files named
# on the command line.

# Read the command line arguments
positive_fpath = sys.argv[1]
negative_fpath = sys.argv[2]
test_fpath = sys.argv[3]
train_fpath = sys.argv[4]
if len(sys.argv) > 5:
    SPLIT_PERCENT = float(sys.argv[5])
else:
    SPLIT_PERCENT = 0.1

# Every intermediate file written next to the inputs. The negative XML files
# are included: the earlier cleanup removed only the positive ones and left
# the negative ones behind.
tmp_fpaths = [
    positive_fpath + ".test",
    positive_fpath + ".train",
    negative_fpath + ".test",
    negative_fpath + ".train",
    positive_fpath + ".test.xml",
    positive_fpath + ".train.xml",
    negative_fpath + ".test.xml",
    negative_fpath + ".train.xml",
]

try:
    # Split positive and negative texts
    split_csv_texts(positive_fpath, positive_fpath + ".test", positive_fpath + ".train", SPLIT_PERCENT)
    split_csv_texts(negative_fpath, negative_fpath + ".test", negative_fpath + ".train", SPLIT_PERCENT)

    # Lemmatize positive and negative texts. The last argument is True for
    # positive parts and False for negative ones -- presumably the class
    # label; confirm against lemmatize_file.
    lemmatize_file(positive_fpath + ".test", positive_fpath + ".test.xml", True, True)
    lemmatize_file(positive_fpath + ".train", positive_fpath + ".train.xml", True, True)
    lemmatize_file(negative_fpath + ".test", negative_fpath + ".test.xml", True, False)
    lemmatize_file(negative_fpath + ".train", negative_fpath + ".train.xml", True, False)

    # Cat test files
    cat_files(positive_fpath + ".test.xml", negative_fpath + ".test.xml", test_fpath, "<texts>", "</texts>")
    cat_files(positive_fpath + ".train.xml", negative_fpath + ".train.xml", train_fpath, "<texts>", "</texts>")
finally:
    # Remove temporary files, even if a step above raised. A file that was
    # never created is skipped so the original error is not masked.
    for tmp_fpath in tmp_fpaths:
        if os.path.exists(tmp_fpath):
            os.remove(tmp_fpath)
# Script fragment: escape an XML file, extract its text, drop duplicate
# lines and write lemmatized <output-file>.valid / <output-file>.train files.

# NOTE(review): the fragment begins with the tail of a usage message whose
# argument-count check is not part of the visible code. Without a guard the
# exit below is unconditional and nothing after it can run, so a check for
# the two parameters read below is restored here -- confirm against the
# original usage block.
if len(sys.argv) < 3:
    # Single-argument print(...) emits the same text on Python 2 and 3.
    print("<output-file> XML file with lemmatized filenames (UTF8)")
    # A usage error must not look like success to the calling shell.
    sys.exit(1)

# Read the command line arguments
xml_file = sys.argv[1]
output_file = sys.argv[2]

# Process the data
tmp_file_1 = output_file + ".tmp1"
tmp_file_2 = output_file + ".tmp2"
tmp_file_3 = output_file + ".csv"

escape_xml_file(xml_file, tmp_file_1)
extract_text(tmp_file_1, tmp_file_2)

# Remove duplicate lines from tmp_file_2 and write the unique lines to
# tmp_file_3. First-occurrence order is kept so the result is deterministic
# (iterating a set gives an arbitrary order), and both files are closed even
# if writing fails.
seen_lines = set()
with open(tmp_file_2) as src:
    with open(tmp_file_3, "w") as dst:
        for line in src:
            if line not in seen_lines:
                seen_lines.add(line)
                dst.write(line)

split_csv_texts(tmp_file_3, tmp_file_3 + ".valid", tmp_file_3 + ".train", 0.1)
lemmatize_file(tmp_file_3 + ".valid", output_file + ".valid", True, True)
lemmatize_file(tmp_file_3 + ".train", output_file + ".train", True, True)

os.remove(tmp_file_1)
os.remove(tmp_file_2)
# tmp_file_3 (the de-duplicated CSV) is left on disk; its removal was
# already disabled in the original code.

print("Script has finished successfully.")