Exemple #1
0
def main(args):
    """Create one XML training dataset from positive and negative texts.

    The command-line arguments are read from ``sys.argv`` (the ``args``
    parameter is accepted but not used by the body):

    1. ``<input-positive>``  -- CSV file with positive examples
    2. ``<input-negative>``  -- CSV file with negative examples
    3. ``<output-dataset>``  -- path of the resulting XML dataset

    Each input is passed to ``lemmatize_file``, which writes the temporary
    files ``<output-dataset>.positive.xml`` and
    ``<output-dataset>.negative.xml``.  ``cat_files`` then combines them
    into ``<output-dataset>`` (it is given ``"<texts>"`` and ``"</texts>"``),
    and the temporary files are deleted.

    Prints the usage text and exits with status 1 when fewer than three
    arguments are supplied.
    """
    # Process parameters
    PARAM_NUM = 3
    if len(sys.argv) < PARAM_NUM + 1:
        # A single-argument print(...) produces the same output under
        # Python 2 (print statement) and Python 3 (print function).
        usage_lines = [
            "This script creates a training dataset from a set of positive and negative texts",
            "Expected %d parameters but was %d" % (PARAM_NUM, len(sys.argv) - 1),
            "Usage: %s <input-positive> <input-negative> <output-dataset>" % sys.argv[0],
            "<input-positive>\t\tAn input CSV file with one text per line (positive training examples)",
            # Both CSV files are inputs; the help used to call this one an output.
            "<input-negative>\t\tAn input CSV file with one text per line (negative training examples)",
            "<output-dataset>\t\tAn output file with the training dataset in XML format.",
        ]
        print("\n".join(usage_lines))
        # Non-zero status so that calling scripts can detect the usage error.
        sys.exit(1)

    # Read the command line arguments
    positive_fpath = sys.argv[1]
    negative_fpath = sys.argv[2]
    output_fpath = sys.argv[3]

    # Temporary per-class XML files
    positive_xml_fpath = output_fpath + ".positive.xml"
    negative_xml_fpath = output_fpath + ".negative.xml"

    # Lemmatize positive and negative texts.  The last flag is True for the
    # positive file and False for the negative one (presumably the class
    # label -- confirm against lemmatize_file).
    lemmatize_file(positive_fpath, positive_xml_fpath, True, True)
    lemmatize_file(negative_fpath, negative_xml_fpath, True, False)

    # Cat xml files
    cat_files(positive_xml_fpath, negative_xml_fpath, output_fpath, "<texts>", "</texts>")

    # Remove temporary files
    os.remove(positive_xml_fpath)
    os.remove(negative_xml_fpath)

    print("Script has finished successfully.")
Exemple #2
0
def main(args):
    """Create train and test XML datasets inside an output directory.

    The command-line arguments are read from ``sys.argv`` (the ``args``
    parameter is accepted but not used by the body):

    1. ``<input-positive>``  -- CSV file with positive examples
    2. ``<input-negative>``  -- CSV file with negative examples
    3. ``<output-dir>``      -- directory to create/fill
    4. ``<split-fraction>``  -- optional float, defaults to 0.9

    Steps: create the directory if needed, copy three resource CSV files
    into it, split each input with ``split_csv_texts``, run
    ``lemmatize_file`` on every part, combine the positive and negative
    parts with ``cat_files`` into ``train.xml`` and ``valid.xml``, and
    delete the intermediate ``*.csv`` / ``*.xml`` part files.

    Prints the usage text and exits with status 1 when fewer than three
    arguments are supplied.
    """
    # Process parameters
    PARAM_NUM = 3
    if len(sys.argv) < PARAM_NUM + 1:
        # A single-argument print(...) produces the same output under
        # Python 2 (print statement) and Python 3 (print function).
        usage_lines = [
            "This script creates train and test datasets from a set of positive and negative texts samples",
            "Expected %d parameters but was %d" % (PARAM_NUM, len(sys.argv) - 1),
            # The usage line now matches what the code reads: one output
            # directory, not separate <output-test>/<output-train> files.
            "Usage: %s <input-positive> <input-negative> <output-dir> [<split-fraction>]" % sys.argv[0],
            "<input-positive>\t\tAn input CSV file with one text per line (positive training examples)",
            "<input-negative>\t\tAn input CSV file with one text per line (negative training examples)",
            "<output-dir>\t\tAn output directory with a dataset ready to train.",
            # NOTE(review): wording kept as-is; the exact meaning of the
            # fraction depends on split_csv_texts -- confirm and reword.
            "<split-fraction>\tPercent of texts in the <output-test> (in (0;1), default 0.9 => 10/90)",
        ]
        print("\n".join(usage_lines))
        # Non-zero status so that calling scripts can detect the usage error.
        sys.exit(1)

    # Read the command line arguments
    positive_fpath = sys.argv[1]
    negative_fpath = sys.argv[2]
    output_fpath = sys.argv[3]
    # NOTE(review): the "test" parts end up in train.xml and the "train"
    # parts in valid.xml.  Kept exactly as before -- confirm that this
    # mapping is intended.
    test_fpath = output_fpath + "/train.xml"
    train_fpath = output_fpath + "/valid.xml"
    if len(sys.argv) > PARAM_NUM + 1:
        SPLIT_PERCENT = float(sys.argv[PARAM_NUM + 1])
    else:
        SPLIT_PERCENT = 0.9

    def part_fpath(label, part, ext):
        # Path of an intermediate file, e.g. <output-dir>/positive.test.csv
        return output_fpath + "/" + label + "." + part + "." + ext

    # Initialize the directory
    if not os.path.exists(output_fpath):
        os.mkdir(output_fpath)
    # Resource files are looked up relative to the current working directory.
    for resource in ("stopos.csv", "stopwords.csv", "relations.csv"):
        shutil.copy2("./../data/test/" + resource, output_fpath + "/" + resource)

    # Split positive and negative texts
    for label, source_fpath in (("positive", positive_fpath), ("negative", negative_fpath)):
        split_csv_texts(
            source_fpath,
            part_fpath(label, "train", "csv"),
            part_fpath(label, "test", "csv"),
            SPLIT_PERCENT,
        )

    # Lemmatize positive and negative texts.  The last flag is True for the
    # positive files and False for the negative ones (presumably the class
    # label -- confirm against lemmatize_file).
    for label, positive_flag in (("positive", True), ("negative", False)):
        for part in ("test", "train"):
            lemmatize_file(
                part_fpath(label, part, "csv"),
                part_fpath(label, part, "xml"),
                True,
                positive_flag,
            )

    # Cat test and train files
    for part, merged_fpath in (("test", test_fpath), ("train", train_fpath)):
        cat_files(
            part_fpath("positive", part, "xml"),
            part_fpath("negative", part, "xml"),
            merged_fpath,
            "<texts>",
            "</texts>",
        )

    # Remove temporary files (all CSV parts first, then all XML parts)
    for ext in ("csv", "xml"):
        for label in ("positive", "negative"):
            for part in ("test", "train"):
                os.remove(part_fpath(label, part, ext))

    print("Script has finished successfully.")
# Script body: take a random sample of DBPedia titles, split it into
# validation and training parts, and run lemmatize_file on each part.
#
# Steps, as performed by the calls below:
#   1. cleanup_dbpedia_titles: <dbpedia-titles-file> -> <output>.all.csv
#   2. get_random_sample:      <output>.all.csv      -> <output>.csv
#   3. split_csv_texts:        <output>.csv -> <output>.csv.valid / .train
#   4. lemmatize_file:         each part -> <output>.valid / <output>.train

# Process parameters
PARAM_NUM = 3
if len(sys.argv) < PARAM_NUM + 1:
    # A single-argument print(...) produces the same output under Python 2
    # (print statement) and Python 3 (print function).
    print("Expected %d parameters but was %d" % (PARAM_NUM, len(sys.argv) - 1))
    # The second placeholder now matches the name used in its description.
    print("Usage: %s <dbpedia-titles-file> <xml-titles-file> <titles-number> [<skip-criterion>]" % sys.argv[0])
    print("<dbpedia-titles-file>\tInput: DBPedia file with Wikipedia titles")
    print("<xml-titles-file>\tOutput: XML filenames file with 0.9*<titles-number> of entries. Two files will be written -- <xml-titles-file>.train and <xml-titles-file>.valid (0.1*<titles-number>)")
    print("<titles-number>\t\tNumber of titles out of all input titles")
    print("<skip-criterion>\tIf 0 all titles are kept, 1 with at least one space, 2 with at least two spaces, 3 ...")

    # Non-zero status so that calling scripts can detect the usage error.
    sys.exit(1)

# Read the command line arguments
dbpedia_fpath = sys.argv[1]
output_fpath = sys.argv[2]
titles_number = int(sys.argv[3])
if len(sys.argv) > 4:
    # NOTE(review): SKIP_CRITERION is only defined when the optional fourth
    # argument is given and is not referenced by the statements below --
    # confirm where (or whether) it is consumed.
    SKIP_CRITERION = int(sys.argv[4])

# Temporary files
all_titles_fpath = output_fpath + ".all.csv"
titles_fpath = output_fpath + ".csv"

cleanup_dbpedia_titles(dbpedia_fpath, all_titles_fpath)
get_random_sample(all_titles_fpath, titles_fpath, titles_number)
split_csv_texts(titles_fpath, titles_fpath + ".valid", titles_fpath + ".train", 0.1)
lemmatize_file(titles_fpath + ".valid", output_fpath + ".valid", True, False)
lemmatize_file(titles_fpath + ".train", output_fpath + ".train", True, False)

print("Script has finished successfully.")
# Script body: split positive and negative text files into test and train
# parts, run lemmatize_file on every part, and combine the results into
# one test file and one train file.
# NOTE(review): sys.argv[1..4] are indexed directly; any argument-count
# check is not visible in this part of the file -- confirm it exists.

# Read the command line arguments
positive_fpath = sys.argv[1]
negative_fpath = sys.argv[2]
test_fpath = sys.argv[3]
train_fpath = sys.argv[4]
if len(sys.argv) > 5:
    SPLIT_PERCENT = float(sys.argv[5])
else:
    SPLIT_PERCENT = 0.1

# Split positive and negative texts
split_csv_texts(positive_fpath, positive_fpath + ".test", positive_fpath + ".train", SPLIT_PERCENT)
split_csv_texts(negative_fpath, negative_fpath + ".test", negative_fpath + ".train", SPLIT_PERCENT)

# Lemmatize positive and negative texts.  The last flag is True for the
# positive files and False for the negative ones (presumably the class
# label -- confirm against lemmatize_file).
lemmatize_file(positive_fpath + ".test", positive_fpath + ".test.xml", True, True)
lemmatize_file(positive_fpath + ".train", positive_fpath + ".train.xml", True, True)
lemmatize_file(negative_fpath + ".test", negative_fpath + ".test.xml", True, False)
lemmatize_file(negative_fpath + ".train", negative_fpath + ".train.xml", True, False)

# Cat test and train files
cat_files(positive_fpath + ".test.xml", negative_fpath + ".test.xml", test_fpath, "<texts>", "</texts>")
cat_files(positive_fpath + ".train.xml", negative_fpath + ".train.xml", train_fpath, "<texts>", "</texts>")

# Remove temporary files.  The negative *.xml parts are included so that
# every intermediate file created above is cleaned up, not only the
# positive ones.
temporary_fpaths = [
    positive_fpath + ".test",
    positive_fpath + ".train",
    negative_fpath + ".test",
    negative_fpath + ".train",
    positive_fpath + ".test.xml",
    positive_fpath + ".train.xml",
    negative_fpath + ".test.xml",
    negative_fpath + ".train.xml",
]
for temporary_fpath in temporary_fpaths:
    os.remove(temporary_fpath)
    print "<output-file>	XML file with lemmatized filenames (UTF8)"
    sys.exit()

# Script body: escape an XML file, extract its text, drop duplicate lines,
# split the result into validation and training parts, and run
# lemmatize_file on each part.

# Read the command line arguments
xml_file = sys.argv[1]
output_file = sys.argv[2]

# Process the data
tmp_file_1 = output_file + ".tmp1"
tmp_file_2 = output_file + ".tmp2"
tmp_file_3 = output_file + ".csv"

escape_xml_file(xml_file, tmp_file_1)
extract_text(tmp_file_1, tmp_file_2)

# Remove duplicate lines from tmp_file_2 and write uniq to tmp_file_3.
# Both files are opened with "with" so they are closed even if reading or
# writing fails.  A set is used, so the order of lines is not preserved.
with open(tmp_file_2) as extracted_file:
    uniqlines = set(extracted_file)
with open(tmp_file_3, "w") as unique_file:
    unique_file.writelines(uniqlines)

split_csv_texts(tmp_file_3, tmp_file_3 + ".valid", tmp_file_3 + ".train", 0.1)
lemmatize_file(tmp_file_3 + ".valid", output_file + ".valid", True, True)
lemmatize_file(tmp_file_3 + ".train", output_file + ".train", True, True)

os.remove(tmp_file_1)
os.remove(tmp_file_2)
# NOTE(review): tmp_file_3 and its .valid/.train splits are left on disk;
# the removal of tmp_file_3 was already disabled here -- confirm whether
# these files should be kept.

# A single-argument print(...) produces the same output under Python 2
# (print statement) and Python 3 (print function).
print("Script has finished successfully.")