import csv
import glob
import os
import subprocess
from multiprocessing import Pool

import textract
from bs4 import BeautifulSoup
from tqdm import tqdm

import path_utilities  # local helper module providing str_encode()


# Parallel variant, renamed so it does not shadow the single-process
# convert_tsv defined below.
def convert_tsv_parallel(valid_list, out_dir, num_processes):
    out_paths = []
    for path in valid_list:
        encoded = path_utilities.str_encode(path)
        out_paths.append(os.path.join(out_dir, encoded + ".csv"))
    # output will look like <encoded_filepath_w/_extension>.csv
    print("Converting", len(valid_list),
          "tsvs to .csv... This may take a while.")
    with Pool(num_processes) as p:
        p.starmap(tsv_action, zip(valid_list, out_paths))
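

# `tsv_action` is the worker handed to Pool.starmap above but is not
# defined in this module. A minimal sketch of what it presumably does,
# mirroring the single-process convert_tsv below (the body here is an
# assumption, not the original implementation):
def tsv_action(path, out_path):
    try:
        with open(path, "r") as tsv_file, \
                open(out_path, "w", newline="") as csv_file:
            csv.writer(csv_file).writerows(
                csv.reader(tsv_file, delimiter="\t"))
    except (UnicodeDecodeError, MemoryError):
        # skip unreadable or oversized files, as the serial version does
        pass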
def convert_tsv(valid_list, out_dir):

    for path in tqdm(valid_list):

        # output will look like <encoded_filepath_w/_extension>.csv
        encoded_filename = path_utilities.str_encode(path)
        out_path = os.path.join(out_dir, encoded_filename + ".csv")
        if not os.path.isfile(out_path):
            print("out_path: ", out_path)
            print("converting")
            try:

                # 'with' closes both files even if conversion fails;
                # newline="" keeps the csv module from translating line
                # terminators (matters on Windows, harmless elsewhere)
                with open(path, "r") as tsv_file, \
                        open(out_path, "w", newline="") as csv_file:
                    in_txt = csv.reader(tsv_file, delimiter='\t')
                    out_csv = csv.writer(csv_file)
                    out_csv.writerows(in_txt)
                if not os.path.isfile(out_path):
                    print("Did not save converted .tsv correctly. ")
            except UnicodeDecodeError:
                continue
            except MemoryError:
                print("Memory error, skipping this file. ")
                continue
    return
def pdf_action(path, output_dir):
    transformed_path = path_utilities.str_encode(path)
    out_file = os.path.join(output_dir, transformed_path + ".txt")
    if not os.path.isfile(out_file):
        # subprocess.DEVNULL avoids leaving a dangling os.devnull handle
        subprocess.call(["ebook-convert", path, out_file],
                        stdout=subprocess.DEVNULL,
                        stderr=subprocess.STDOUT,
                        close_fds=True)
def docs_action(path, output_dir):
    transformed_path = path_utilities.str_encode(path)
    out_file = os.path.join(output_dir, transformed_path + ".txt")
    try:
        if not os.path.isfile(out_file):
            contents = textract.process(path).decode("UTF-8").replace(
                "\n", " ")
            with open(out_file, "w") as f:
                f.write(contents)
    except textract.exceptions.ShellError:
        print("File skipped due to error")
def convert_pdfs(pdf_paths, dest):
    num_pdfs = len(pdf_paths)
    print(num_pdfs, " pdfs for conversion")
    for path in tqdm(pdf_paths):
        output_dir = os.path.join(dest, "pdf")
        if not os.path.isdir(output_dir):
            os.mkdir(output_dir)
        transformed_path = path_utilities.str_encode(path)
        out_file = os.path.join(output_dir, transformed_path + ".txt")
        if not os.path.isfile(out_file):
            # list-form subprocess.call sidesteps the shell-quoting
            # problems os.system has with spaces in paths
            subprocess.call(["ebook-convert", path, out_file])
def convert_tabular(valid_list, out_dir):

    for path in tqdm(valid_list):

        # output will look like <encoded_filepath_w/_extension>.csv.<i>
        encoded_filename = path_utilities.str_encode(path)
        out_path = os.path.join(out_dir, encoded_filename)
        if not os.path.isfile(out_path + ".csv.0"):
            print("out_path: ", out_path)
            print("converting")
            # -S makes ssconvert export one file per sheet, which yields
            # the <out_path>.csv.<i> names checked above
            subprocess.call(["ssconvert", "-S", path, out_path + ".csv"],
                            stdout=subprocess.DEVNULL,
                            stderr=subprocess.STDOUT)
    return
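

# One spreadsheet can yield several per-sheet shards under the
# <out_path>.csv.<i> naming above. A minimal sketch for gathering them
# (collect_sheets is a hypothetical helper, not used elsewhere here):
def collect_sheets(out_path):
    # e.g. ["<out_path>.csv.0", "<out_path>.csv.1"], sorted by name
    return sorted(glob.glob(out_path + ".csv.*"))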
def mls_action(filetype, path, output_dir):
    try:
        with open(path, 'r', errors="backslashreplace") as content_file:
            contents = content_file.read()
            if filetype == "html":
                soup = BeautifulSoup(contents, 'html.parser')
            else:
                soup = BeautifulSoup(contents, 'xml')
            transformed_path = path_utilities.str_encode(path)
            out_file = os.path.join(output_dir, transformed_path + ".txt")
            if not os.path.isfile(out_file):
                with open(out_file, "w") as f:
                    f.write(soup.get_text())
    except Exception as e:
        # catch Exception rather than a bare except, so that
        # KeyboardInterrupt and SystemExit still propagate
        print("File skipped due to error:", e)
def convert_xml(xml_paths, dest):
    num_xmls = len(xml_paths)
    print(num_xmls, " xmls for conversion")
    for path in tqdm(xml_paths):
        with open(path, 'r', errors="backslashreplace") as content_file:
            contents = content_file.read()
            soup = BeautifulSoup(contents, 'xml')
            output_dir = os.path.join(dest, "xml")
            if not os.path.isdir(output_dir):
                os.mkdir(output_dir)
            transformed_path = path_utilities.str_encode(path)
            out_file = os.path.join(output_dir, transformed_path + ".txt")

            # write to the absolute path instead of os.chdir-ing into
            # output_dir, which would leak a working-directory change
            if not os.path.isfile(out_file):
                with open(out_file, "w") as f:
                    f.write(soup.get_text())
def convert_docx(docx_paths, dest):
    num_docxs = len(docx_paths)
    print(num_docxs, " docxs for conversion")
    for path in tqdm(docx_paths):
        try:
            output_dir = os.path.join(dest, "docx")
            if not os.path.isdir(output_dir):
                os.mkdir(output_dir)
            transformed_path = path_utilities.str_encode(path)
            out_file = os.path.join(output_dir, transformed_path + ".txt")
            if not os.path.isfile(out_file):
                # decode the bytes textract returns; str() on bytes
                # would write a "b'...'" literal into the file
                contents = textract.process(path).decode("UTF-8")
                with open(out_file, "w") as f:
                    f.write(contents)
        except textract.exceptions.ShellError:
            continue
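

# A minimal driver sketch tying the converters together (the per-type
# path lists and the `dest`/"csv" layout are assumptions, not part of
# this module):
#
#     convert_pdfs(pdf_paths, dest)
#     convert_docx(docx_paths, dest)
#     convert_xml(xml_paths, dest)
#     convert_tabular(xls_paths, os.path.join(dest, "csv"))
#     convert_tsv(tsv_paths, os.path.join(dest, "csv"))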
def get_header_dict(csv_dir, csv_path_list, fill_threshold, converted_status):

    # maps filenames to csv header lists
    print("Generating structured header dictionary. ")
    header_dict = {}

    # number of files with no valid header
    bad_files = 0

    # number of decoding errors while reading csvs
    decode_probs = 0

    # This function handles two kinds of input: a list of file paths,
    # or just a directory location.

    # CASE 1:
    # If we're reading in converted files, we only need the csv_dir
    # argument, so we get a list of the filenames from that directory.
    # These filenames are in the form:
    # "@home@ljung@pub8@oceans@some_file.csv"
    if (converted_status):
        dir_list = os.listdir(csv_dir)

    # CASE 2:
    # Otherwise, we are reading in a list of the true, original
    # locations of files that were csvs to begin with in the dataset.
    else:
        dir_list = csv_path_list

    # CASE 1: "path" looks like:"@home@ljung@pub8@oceans@some_file.csv"
    # CASE 2: "path" is literally the path of that file in the original
    # dataset as a string.
    for path in tqdm(dir_list):
        if (converted_status):

            # get the new location of the current file in "csv_dir",
            # i.e. not in original dataset.
            filename = path
            path = os.path.join(csv_dir, path)
        else:

            # convert to "@home@ljung@pub8@oceans@some_file.csv" form.
            filename = path_utilities.str_encode(path)

        # So now in both cases, filename has the "@"s, and path is
        # the location of some copy of the file.
        with open(path, "r") as f:

            # read csv and get the header as a list
            reader = csv.reader(f)
            header_list = []
            try:
                header_list = next(reader)

                # if the header is empty, try the next line
                if (len(header_list) == 0):
                    header_list = next(reader)

                # number of nonempty attribute strings
                num_nonempty = 0
                for attribute in header_list:
                    if not (attribute == ""):
                        num_nonempty = num_nonempty + 1

                # if the header is still empty, skip file
                if (len(header_list) == 0):
                    continue

                fill_ratio = num_nonempty / len(header_list)

                # keep checking lines until you get one where there
                # are enough nonempty attributes
                while (fill_ratio <= fill_threshold):

                    # if there's only one nonempty attribute, it's
                    # probably just a descriptor of the table, so try
                    # the next line.
                    header_list = next(reader)
                    num_nonempty = 0
                    for attribute in header_list:
                        if not (attribute == ""):
                            num_nonempty = num_nonempty + 1
                    if (len(header_list) == 0):
                        fill_ratio = -1
                    else:
                        fill_ratio = num_nonempty / len(header_list)

                    #===================================================
                    # Here we've hardcoded some information about
                    # scientific data to work better with CDIAC.
                    # feel free to remove it.

                    # people seem to denote pre-header stuff with a *
                    for attribute in header_list:
                        if (attribute != "" and attribute[-1] == "*"):
                            fill_ratio = -1
                    if (len(header_list) > 3):
                        if (header_list[0] == "Year" and header_list[2] != ""):
                            break
                        if (header_list[0] == "Citation"):
                            fill_ratio = -1
                    #===================================================

            except UnicodeDecodeError:
                decode_probs = decode_probs + 1
                continue
            except StopIteration:
                bad_files = bad_files + 1
                #os.system("cp " + path + " ~/bad_csvs/")
                continue

            # throw a key value pair in the dict, with filename as key
            if header_list == []:
                continue
            header_dict.update({filename: header_list})
    print("Throwing out this number of files, all have less than ",
          fill_threshold * 100, "% nonempty cells in every row: ", bad_files)
    print("Number of UnicodeDecodeErrors: ", decode_probs)
    print("Dictionary generated. ")
    return header_dict
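

# A minimal usage sketch covering both input modes (the arguments shown
# are assumptions):
#
#     # CASE 1: headers from already-converted csvs in csv_dir
#     headers = get_header_dict("converted/csv", [], 0.4, True)
#
#     # CASE 2: headers from original csv locations in the dataset
#     headers = get_header_dict("", csv_path_list, 0.4, False)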