Example #1
0
def main():
    """Join all input tomes and write the first ``n`` triples of the result.

    Arguments come from ``utils.argsdirs``: input directory, output
    directory, the value forwarded to ``group_sum`` and the extra ``n``
    option (converted to ``int``).  Every file listed by
    ``utils.filenames`` becomes a ``triple.Tome``; the joined tome is
    grouped/summed, sorted and cut with ``first(n)``, then written through
    the writer of an output tome named via
    ``utils.new_filename(out_d, "most_frequent.gz")``.
    """
    source_dir, target_dir, group_members, limit = utils.argsdirs(
        "Most frequent triples", ["n"]
    )
    limit = int(limit)

    input_tomes = []
    for path_in in utils.filenames(source_dir):
        input_tomes.append(triple.Tome(path_in))

    # The output tome is created before any processing, as in the
    # original statement order.
    path_out = utils.new_filename(target_dir, "most_frequent.gz")
    output_tome = triple.Tome(path_out)

    print("joining the tomes..")
    merged = triple.Tome(input_tomes)

    print("grouping/summing (again)..")
    merged = merged.group_sum(group_members)

    print("sorting the tomes (again)..")
    merged = merged.sort()

    print("getting the first %d.." % limit)
    merged = merged.first(limit)

    print("writing everything down..")
    emit = output_tome.writer()
    for record in merged:
        emit(record)

    print("done.")
Example #2
0
def main():
    """Sort each input file into a same-named file of the output directory.

    Directories come from ``utils.argsdirs("Sorting")``; the third value
    it returns is ignored.  For every file listed by ``utils.filenames``
    the tome is loaded, sorted with ``Tome.sort`` and each resulting
    triple is passed to the writer of the output tome.
    """
    source_dir, target_dir, _ = utils.argsdirs("Sorting")

    for path_in in utils.filenames(source_dir):
        source_tome = triple.Tome(path_in)

        # Output name is derived from the input name by utils.new_filename.
        path_out = utils.new_filename(target_dir, path_in)
        target_tome = triple.Tome(path_out)
        emit = target_tome.writer()

        for record in source_tome.sort():
            emit(record)
Example #3
0
def main():
    """Group/sum the triples of each input file into the output directory.

    ``utils.argsdirs("Counting the triples")`` supplies the input
    directory, the output directory and the value handed to
    ``Tome.group_sum``.  Each input file is processed independently and
    its grouped triples are written to an output tome whose name is
    produced by ``utils.new_filename``.
    """
    source_dir, target_dir, group_members = utils.argsdirs(
        "Counting the triples"
    )

    for path_in in utils.filenames(source_dir):
        print("processing file %s.." % path_in)
        source_tome = triple.Tome(path_in)

        path_out = utils.new_filename(target_dir, path_in)
        print("writing to %s.." % path_out)
        target_tome = triple.Tome(path_out)

        emit = target_tome.writer()
        for record in source_tome.group_sum(group_members):
            emit(record)
def prepare_tomes(in_d):
    """Return a ``triple.TomeVoc`` built from every file found in ``in_d``.

    One ``triple.Tome`` is created per name yielded by
    ``utils.filenames(in_d)``; the number of tomes is printed before the
    ``TomeVoc`` is constructed from the full list.
    """
    found_tomes = []
    for path_in in utils.filenames(in_d):
        found_tomes.append(triple.Tome(path_in))

    print("number of tomes found: %d" % len(found_tomes))

    vocabulary = triple.TomeVoc(found_tomes)
    # NOTE(review): the value read here is never used; the attribute access
    # is kept in case ``indexes`` is a property with side effects - confirm
    # against TomeVoc before removing.
    _word_indexes = vocabulary.indexes
    return vocabulary
Example #5
0
def main():
    """Run ``preprocess`` on every file of a data folder.

    Command-line options (both mandatory):
        --data-folder    folder whose files are listed with
                         ``utils.filenames``.
        --output-folder  parent folder; results are written to its
                         ``dataset`` sub-folder, which is created when it
                         does not exist yet.

    For each input file the output path is computed with
    ``utils.new_filename`` and both paths are handed to ``preprocess``.

    Raises:
        SystemExit: raised by argparse (status 2) when an option is
            missing or malformed.
    """
    # The text is a description, not a program name: the first positional
    # parameter of ArgumentParser is ``prog``, which would put this text in
    # the "usage:" line, so it is passed by keyword.
    commandline_parser = argparse.ArgumentParser(
        description="Pre-processing of data")

    # required=True turns a missing option into a regular usage error
    # instead of a TypeError when the parsed value (None) is indexed below.
    commandline_parser.add_argument(
        "--data-folder", nargs=1, required=True,
        help="Specifies the path of the folder containing the data.")

    commandline_parser.add_argument(
        "--output-folder", nargs=1, required=True,
        help="Specifies the path of the output folder.")

    args = vars(commandline_parser.parse_args())
    # nargs=1 yields one-element lists, hence the [0].
    data_folder = args["data_folder"][0]
    output_folder = args["output_folder"][0]
    output_folder = path.join(output_folder, 'dataset')
    if not path.exists(output_folder):
        makedirs(output_folder)
    files = utils.filenames(data_folder)
    for file_path in files:
        output_path = utils.new_filename(output_folder, file_path)
        preprocess(file_path, output_path)