def main():
    global index_dir, fp_id_title, max_file
    wiki_dump = sys.argv[1]
    index_dir = sys.argv[2]

    if not os.path.isdir(index_dir):
        os.mkdir(index_dir)

    parser = xml.sax.make_parser()  # SAX parser
    handler = WikiHandler()
    parser.setContentHandler(handler)
    start = time.time()
    fp_id_title = open_id_title(id_to_title)
    parser.parse(wiki_dump)
    write_remaining()
    # print("Number of iterations : ", len(os.listdir(index_dir)))
    fp_id_title.close()
    # itr = len(os.listdir(index_dir)) - 1
    # print("Number of iterations : ", itr)
    merge.merge_files(total_itr, index_dir, max_file)
    clear_directory(total_itr, index_dir)
    end = time.time()

    total_docs = open(os.path.join(index_dir, "total_docs.txt"), "w")
    total_docs.write(str(nod))
    total_docs.close()

    # total_folders = open(os.path.join(index_dir, "total_folders.txt"), "w")
    # total_folders.write(str(total_itr))
    # total_folders.close()
    print("Time taken : ", end - start)
Example no. 2
def main():
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s [%(levelname)s] %(message)s",
        handlers=[logging.FileHandler("debug.log"),
                  logging.StreamHandler()])

    logger = logging.getLogger(__name__)
    logger.info('starting the program')

    logger.info('getting the urls')
    file_url = open_file()
    logger.info('got the urls, ready to download the sounds')

    file_download = sheet_url()

    logger.info(f'got {file_url}, ready to download')
    download_videos = download_youtube_files(logger, file_download)
    logger.info(f'got {download_videos}, ready to merge')

    list_for_csv = download_videos[0]
    make_file_csv(list_for_csv)
    insert_row_col(list_for_csv)
    insert_info(logger, list_for_csv)
    logger.info('finished making the csv file')

    list_for_mananger = download_videos[1]
    mananger_sounds(list_for_mananger)
    merge_files(logger, list_for_mananger, 'merge_alarm.mp3')
    logger.info('finished merging the sounds')

    upload_tweet(list_for_mananger)

    logger.info('end of the program')
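merge_files(logger, list_for_mananger, 'merge_alarm.mp3') is the step that joins the downloaded sounds into a single mp3. A plausible sketch with pydub (the library choice is an assumption; the original helper is not shown):

from pydub import AudioSegment  # assumption: pydub backs the audio merge

def merge_files(logger, filenames, out_name):
    combined = AudioSegment.empty()
    for name in filenames:
        combined += AudioSegment.from_mp3(name)  # append each clip in order
    combined.export(out_name, format="mp3")
    logger.info('wrote %s', out_name)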
Example no. 3
    def _merge(self):
        # Build the output list sorted the way we want
        new_data = sorted(self._data, key=itemgetter('start'))
        # Merge the chunk files in order
        file_list = [self._out_filename + '/' + d['out'] for d in new_data]
        merge_files(file_list, self._out_filename + '.new')
        # Remove the temporary directory
        shutil.rmtree(self._out_filename)
        # Move the merged file into the directory's old name
        os.rename(self._out_filename + '.new', self._out_filename)
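A minimal sketch of the merge_files(file_list, out_path) helper used above, assuming it simply concatenates the chunk files in the given order (the signature is inferred from the call site):

import shutil

def merge_files(file_list, out_path):
    """Hypothetical helper: concatenate files in order into out_path."""
    with open(out_path, 'wb') as out:
        for name in file_list:
            with open(name, 'rb') as part:
                shutil.copyfileobj(part, out)  # stream each chunk into the output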
Example no. 4
def merge_pdfs():
    fn = ""
    if verify_output_file_not_exist:
        messagebox.showerror(
            "File exists",
            "Error. File already exists. Try again with a different filename.")
        return
    if not e.get().endswith(".pdf"):
        fn = e.get() + ".pdf"
    else:
        fn = e.get()
    mg.merge_files(listbox.get(0, END), fn)
    messagebox.showinfo("File merged", "Saved output file " + fn)
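The actual page-level merge happens inside mg.merge_files. A sketch of what such a helper could look like with pypdf (the backend is an assumption; the mg module is not shown):

from pypdf import PdfWriter  # assumption: pypdf backs the PDF merge

def merge_files(paths, out_name):
    writer = PdfWriter()
    for path in paths:
        writer.append(path)  # append all pages of each input PDF
    with open(out_name, "wb") as fp:
        writer.write(fp)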
Example no. 5
    def test_ordered_input_files(self):
        input_filenames = [
            'input/a.csv',
            'input/b.csv',
            'input/c.csv',
        ]
        generate_input_files(input_filenames, 15)

        # Check that each input file was actually created on disk;
        # asserting on the (non-empty) name string would always pass
        for input_filename in input_filenames:
            self.assertTrue(os.path.exists(input_filename))

        output_filename = 'output/bar.csv'
        with open(output_filename, 'w') as output_fp:
            merge_files(input_filenames, output_fp.write)

        self.assertTrue(os.path.exists(output_filename))
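The test drives a merge_files(input_filenames, write) that takes a write callback instead of an output path. A sketch consistent with that call shape, assuming each input file is individually sorted (suggested by the test name):

import heapq
from contextlib import ExitStack

def merge_files(input_filenames, write):
    """Hypothetical: k-way merge of sorted text files through a write callback."""
    with ExitStack() as stack:
        files = [stack.enter_context(open(name)) for name in input_filenames]
        for line in heapq.merge(*files):
            write(line)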
Example no. 6

if __name__ == '__main__':
    # First, let's handle the arguments
    parser = argparse.ArgumentParser(description='Sort a huge file.')
    parser.add_argument('--input', help='File to sort')
    parser.add_argument('--output', help='Output file')
    parser.add_argument('--tempfile', help='Temporary file prefix for the splits (default: output)', default='output')
    parser.add_argument('--splitsize', help='Number of bytes in each split (default: 10000)', type=int, default=10000)
    args = parser.parse_args()

    # Split the input into manageable smaller files
    splitted_files = split_file(args.input, '%s_{0:04d}.txt' % args.tempfile, args.splitsize)

    # Sort each individual file (the loop variable must not shadow the split_file function)
    for filename in splitted_files:
        sort(filename, "%s_sorted" % filename)

    splitted_files_sorted = ["%s_sorted" % filename for filename in splitted_files]

    # Merge all the files together again
    merge_files(args.output, splitted_files_sorted)

    # Clean up the temporary files we created
    for filename in splitted_files + splitted_files_sorted:
        os.remove(filename)

    # Tada
    print("success")

Example no. 7
    tmp_files = []
    # filter the "producao" and "prod-autor" CSV files by PPG ID,
    # for both "artpe" and "anais" publication types
    for file_type in ["producao", "prod-autor"]:
        for prod_type in ["artpe", "anais"]:
            inputfile = f"data/{file_type}-2017a2020-{prod_type}.csv"
            outputfile = f"data/{file_type}-2017a2020-{prod_type}-{ppg['ENTIDADE_ENSINO']}-{ppg['ACRONYM']}.csv"
            filter.filter_file(inputfile, "CD_PROGRAMA_IES",
                               ppg["CD_PROGRAMA_IES"], outputfile, True)
            tmp_files.append(outputfile)

    # merge filtered files joining "artpe" and "anais" publication types
    for file_type in ["producao", "prod-autor"]:
        basefile = f"data/{file_type}-2017a2020-anais-{ppg['ENTIDADE_ENSINO']}-{ppg['ACRONYM']}.csv"
        otherfile = f"data/{file_type}-2017a2020-artpe-{ppg['ENTIDADE_ENSINO']}-{ppg['ACRONYM']}.csv"
        outputfile = f"data/{file_type}-2017a2020-{ppg['ENTIDADE_ENSINO']}-{ppg['ACRONYM']}.csv"
        merge.merge_files(basefile, otherfile, outputfile, True)

    # normalize author names
    inputfile = f"data/prod-autor-2017a2020-{ppg['ENTIDADE_ENSINO']}-{ppg['ACRONYM']}.csv"
    outputfile = f"data/prod-autor-2017a2020-{ppg['ENTIDADE_ENSINO']}-{ppg['ACRONYM']}-normalized.csv"
    normalize.normalize_names(inputfile, outputfile)
    os.replace(outputfile, inputfile)

    # create graph from merged files
    authorsfile = f"data/prod-autor-2017a2020-{ppg['ENTIDADE_ENSINO']}-{ppg['ACRONYM']}.csv"
    papersfile = f"data/producao-2017a2020-{ppg['ENTIDADE_ENSINO']}-{ppg['ACRONYM']}.csv"
    outputfile = f"data/graph-{ppg['ENTIDADE_ENSINO']}-{ppg['ACRONYM']}-2017-2020.json"
    sucupira.export_graph(authorsfile, papersfile, outputfile)

    # cleaning up
    for f in tmp_files:
        os.remove(f)
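merge.merge_files(basefile, otherfile, outputfile, True) joins two CSV files of the same shape. A sketch under the assumption that the trailing flag means the inputs carry a header row (the real merge module is not shown):

def merge_files(basefile, otherfile, outputfile, has_header):
    """Hypothetical: append otherfile to basefile, keeping one header row."""
    with open(outputfile, 'w') as out:
        with open(basefile) as base:
            out.writelines(base)  # copy the first file verbatim, header included
        with open(otherfile) as other:
            if has_header:
                next(other)  # skip the duplicate header row
            out.writelines(other)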
Example no. 8
#from merge import merge_files

import merge

x = merge.merge_files('a1.txt',
                      'a2.txt',
                      'a3.txt',
                      'a33.txt',
                      output='output.txt')
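The call above implies a variadic signature, merge_files(*filenames, output=...). A sketch of such a helper (the returned value x and its meaning are assumptions):

def merge_files(*filenames, output):
    """Hypothetical: concatenate any number of text files; return the output name."""
    with open(output, 'w') as out:
        for name in filenames:
            with open(name) as part:
                out.write(part.read())
    return output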