import os
import csv
import subprocess
from multiprocessing import Pool

import textract
from bs4 import BeautifulSoup
from tqdm import tqdm

import path_utilities


def convert_tsv(valid_list, out_dir, num_processes):
    encoded_names = []
    out_path = []
    for path in valid_list:
        encoded_names.append(path_utilities.str_encode(path))
        # output will look like <encoded_filepath_w/_extension>.csv.<i>
        out_path.append(
            os.path.join(out_dir, path_utilities.str_encode(path) + ".csv"))
    print("Converting", len(valid_list),
          "tsvs to .csv... This may take a while.")
    with Pool(num_processes) as p:
        p.starmap(tsv_action, zip(valid_list, out_path))
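# tsv_action is the per-file worker consumed by Pool.starmap above, but it is
# not defined in this section. The sketch below is an assumption that mirrors
# the sequential convert_tsv further down; the real implementation may differ.
def tsv_action(path, out_path):
    # Read the tab-delimited source and rewrite it as a comma-delimited csv.
    try:
        with open(path, "r", newline='') as fin, \
                open(out_path, "w", newline='') as fout:
            csv.writer(fout).writerows(csv.reader(fin, delimiter='\t'))
    except (UnicodeDecodeError, MemoryError):
        # Skip files that cannot be decoded or are too large to buffer.
        pass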
def convert_tsv(valid_list, out_dir):
    # Sequential version of the tsv conversion. Note it shares a name with
    # the multiprocessing version above and will shadow it if both are
    # defined in the same module.
    for path in tqdm(valid_list):
        # output will look like <encoded_filepath_w/_extension>.csv
        encoded_filename = path_utilities.str_encode(path)
        out_path = os.path.join(out_dir, encoded_filename)
        # skip files that have already been converted
        if not os.path.isfile(out_path + ".csv"):
            print("out_path: ", out_path)
            print("converting")
            try:
                # Use 'with' so the files are closed even if conversion
                # fails, and newline='' so the csv module handles line
                # terminators itself on every platform.
                with open(path, "r", newline='') as in_file, \
                        open(out_path + ".csv", "w", newline='') as out_file:
                    in_txt = csv.reader(in_file, delimiter='\t')
                    out_csv = csv.writer(out_file)
                    out_csv.writerows(in_txt)
                if not os.path.isfile(out_path + ".csv"):
                    print("Did not save converted .tsv correctly. ")
            except UnicodeDecodeError:
                continue
            except MemoryError:
                print("Memory error, skipping this file. ")
                continue
    return
def pdf_action(path, output_dir):
    transformed_path = path_utilities.str_encode(path)
    if not os.path.isfile(os.path.join(output_dir, transformed_path + ".txt")):
        # Convert the pdf to plain text with calibre's ebook-convert,
        # discarding the tool's console output.
        subprocess.call(
            ["ebook-convert", path,
             os.path.join(output_dir, transformed_path + ".txt")],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.STDOUT,
            close_fds=True)
def docs_action(path, output_dir):
    transformed_path = path_utilities.str_encode(path)
    try:
        out_file = os.path.join(output_dir, transformed_path + ".txt")
        if not os.path.isfile(out_file):
            # Extract the document text with textract and collapse newlines
            # before writing, so a failed extraction never leaves behind an
            # empty output file.
            contents = textract.process(path).decode("UTF-8").replace("\n", " ")
            with open(out_file, "w") as f:
                f.write(contents)
    except textract.exceptions.ShellError:
        print("File skipped due to error")
def convert_pdfs(pdf_paths, dest):
    num_pdfs = len(pdf_paths)
    print(num_pdfs, " pdfs for conversion")
    for path in tqdm(pdf_paths):
        output_dir = os.path.join(dest, "pdf")
        if not os.path.isdir(output_dir):
            os.mkdir(output_dir)
        transformed_path = path_utilities.str_encode(path)
        if not os.path.isfile(
                os.path.join(output_dir, transformed_path + ".txt")):
            # Call ebook-convert with an argument list (rather than a shell
            # string) so paths containing spaces are handled correctly.
            subprocess.call([
                "ebook-convert", path,
                os.path.join(output_dir, transformed_path + ".txt")
            ])
def convert_tabular(valid_list, out_dir):
    for path in tqdm(valid_list):
        # output will look like <encoded_filepath_w/_extension>.csv.<i>
        encoded_filename = path_utilities.str_encode(path)
        out_path = os.path.join(out_dir, encoded_filename)
        if not os.path.isfile(out_path + ".csv.0"):
            print("out_path: ", out_path)
            print("converting")
            # ssconvert's -S flag exports each sheet to its own file,
            # producing files like out_path.csv.0, out_path.csv.1, ... for
            # multi-sheet workbooks; the tool's output is silenced.
            os.system("ssconvert -S " + path + " " + out_path +
                      ".csv > /dev/null 2>&1")
    return
def mls_action(filetype, path, output_dir):
    try:
        with open(path, 'r', errors="backslashreplace") as content_file:
            contents = content_file.read()
        # Parse markup-language sources with the appropriate parser.
        if filetype == "html":
            soup = BeautifulSoup(contents, 'html.parser')
        else:
            soup = BeautifulSoup(contents, 'xml')
        transformed_path = path_utilities.str_encode(path)
        out_file = os.path.join(output_dir, transformed_path + ".txt")
        if not os.path.isfile(out_file):
            with open(out_file, "w") as f:
                f.write(soup.get_text())
    except Exception:
        print("File skipped due to error.")
def convert_xml(xml_paths, dest):
    num_xmls = len(xml_paths)
    print(num_xmls, " xmls for conversion")
    for path in tqdm(xml_paths):
        with open(path, 'r', errors="backslashreplace") as content_file:
            contents = content_file.read()
        soup = BeautifulSoup(contents, 'xml')
        output_dir = os.path.join(dest, "xml")
        if not os.path.isdir(output_dir):
            os.mkdir(output_dir)
        transformed_path = path_utilities.str_encode(path)
        out_file = os.path.join(output_dir, transformed_path + ".txt")
        # Write to the full output path rather than chdir-ing into
        # output_dir, which would break relative input paths on later
        # iterations.
        if not os.path.isfile(out_file):
            with open(out_file, "w") as f:
                f.write(soup.get_text())
def convert_docx(docx_paths, dest):
    num_docxs = len(docx_paths)
    print(num_docxs, " docxs for conversion")
    for path in tqdm(docx_paths):
        try:
            output_dir = os.path.join(dest, "docx")
            if not os.path.isdir(output_dir):
                os.mkdir(output_dir)
            transformed_path = path_utilities.str_encode(path)
            out_file = os.path.join(output_dir, transformed_path + ".txt")
            if not os.path.isfile(out_file):
                # Decode the bytes textract returns so we write real text
                # rather than the repr of a bytes object.
                contents = textract.process(path).decode("UTF-8")
                with open(out_file, "w") as f:
                    f.write(contents)
        except textract.exceptions.ShellError:
            continue
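# A hypothetical driver showing how the conversion routines above might be
# wired together. The extension-grouped path lists and the "csv" output
# subdirectory are assumptions for illustration; the real caller is not shown
# in this section.
def convert_all(dest, pdf_paths, xml_paths, docx_paths,
                tabular_paths, tsv_paths):
    # Ensure each per-format output directory exists before converting.
    for subdir in ("pdf", "xml", "docx", "csv"):
        os.makedirs(os.path.join(dest, subdir), exist_ok=True)
    convert_pdfs(pdf_paths, dest)
    convert_xml(xml_paths, dest)
    convert_docx(docx_paths, dest)
    convert_tabular(tabular_paths, os.path.join(dest, "csv"))
    convert_tsv(tsv_paths, os.path.join(dest, "csv"))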
def get_header_dict(csv_dir, csv_path_list, fill_threshold, converted_status):
    # maps filenames to csv header lists
    print("Generating structured header dictionary. ")
    header_dict = {}
    # number of files with no valid header
    bad_files = 0
    # number of decoding errors while reading csvs
    decode_probs = 0

    # This code is rather confusing because the function has to handle both
    # kinds of input: lists of file paths and plain directory locations.
    # CASE 1:
    # If we're reading in converted files, we only need the csv_dir
    # argument, so we get a list of the filenames from that directory.
    # These filenames are in the form:
    # "@home@ljung@pub8@oceans@some_file.csv"
    if converted_status:
        dir_list = os.listdir(csv_dir)
    # CASE 2:
    # Otherwise, we are reading in a list of the true, original
    # locations of files that were csvs to begin with in the dataset.
    else:
        dir_list = csv_path_list

    # CASE 1: "path" looks like: "@home@ljung@pub8@oceans@some_file.csv"
    # CASE 2: "path" is literally the path of that file in the original
    # dataset as a string.
    for path in tqdm(dir_list):
        if converted_status:
            # get the new location of the current file in "csv_dir",
            # i.e. not in the original dataset.
            filename = path
            path = os.path.join(csv_dir, path)
        else:
            # convert to "@home@ljung@pub8@oceans@some_file.csv" form.
            filename = path_utilities.str_encode(path)
        # So now in both cases, filename has the "@"s, and path is
        # the location of some copy of the file.
        with open(path, "r") as f:
            # read the csv and get the header as a list
            reader = csv.reader(f)
            header_list = []
            try:
                header_list = next(reader)
                # if the header is empty, try the next line
                if len(header_list) == 0:
                    header_list = next(reader)
                # number of nonempty attribute strings
                num_nonempty = 0
                for attribute in header_list:
                    if attribute != "":
                        num_nonempty += 1
                # if the header is still empty, skip this file
                if len(header_list) == 0:
                    continue
                fill_ratio = num_nonempty / len(header_list)
                # keep checking lines until you get one where there
                # are enough nonempty attributes
                while fill_ratio <= fill_threshold:
                    # if there's only one nonempty attribute, it's
                    # probably just a descriptor of the table, so try
                    # the next line.
                    header_list = next(reader)
                    num_nonempty = 0
                    for attribute in header_list:
                        if attribute != "":
                            num_nonempty += 1
                    if len(header_list) == 0:
                        fill_ratio = -1
                    else:
                        fill_ratio = num_nonempty / len(header_list)
                    #=======================================================
                    # Here we've hardcoded some information about
                    # scientific data to work better with CDIAC.
                    # Feel free to remove it.
                    # People seem to denote pre-header rows with a *.
                    for attribute in header_list:
                        if attribute != "" and attribute[-1] == "*":
                            fill_ratio = -1
                    if len(header_list) > 3:
                        if header_list[0] == "Year" and header_list[2] != "":
                            break
                        if header_list[0] == "Citation":
                            fill_ratio = -1
                    #=======================================================
            except UnicodeDecodeError:
                decode_probs += 1
            except StopIteration:
                bad_files += 1
                #os.system("cp " + path + " ~/bad_csvs/")
                continue
        # throw a key-value pair in the dict, with filename as key
        if header_list == []:
            continue
        header_dict.update({filename: header_list})

    print("Files thrown out because no row had more than",
          fill_threshold * 100, "% nonempty cells:", bad_files)
    print("Number of UnicodeDecodeErrors: ", decode_probs)
    print("Dictionary generated. ")
    return header_dict
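# path_utilities.str_encode is used throughout this module but not shown in
# this section. Based on the example filenames in the comments above
# ("@home@ljung@pub8@oceans@some_file.csv"), it presumably flattens a path
# into a single filename by replacing path separators with "@". A minimal
# sketch under that assumption:
def str_encode_sketch(path):
    # Hypothetical stand-in for path_utilities.str_encode: replace "/" with
    # "@" so an absolute path becomes a legal flat filename.
    return path.replace("/", "@")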