def make_subsets(path_data, path_subsets, proportions=None, copy=False):
    """Split the per-chromosome files into Test/Train/Valid subsets.

    Files are copied instead of moved when ``copy`` is True; the default
    proportions come from ``settings``.
    """
    if proportions is None:
        proportions = {
            "test": settings.PROPTEST,
            "train": settings.PROPTRAIN,
            "valid": settings.PROPVALID}
    if copy:
        moving = shutil.copy
    else:
        moving = shutil.move

    list_chroms = gt.list_elements(path_data, type_="dir")
    list_chroms_names = [os.path.basename(i).split(".")[0] for i in list_chroms]

    create_subsets_dirs(path_subsets, list_chroms_names)

    for chrom, chrom_name in zip(list_chroms, list_chroms_names):

        files = gt.list_elements(
            chrom,
            type_="file",
            extension=".txt.gz",
            exception=[os.path.join(chrom, "_meta.txt.gz")])

        subsets = gt.random_chunks(files, (
            proportions["test"],
            proportions["train"],
            proportions["valid"]))

        test_files, train_files, valid_files = subsets

        test_files_out = [os.path.join(
            path_subsets,
            "Test",
            chrom_name) for _ in range(len(test_files))]
        train_files_out = [os.path.join(
            path_subsets,
            "Train",
            chrom_name) for _ in range(len(train_files))]
        valid_files_out = [os.path.join(
            path_subsets,
            "Valid",
            chrom_name) for _ in range(len(valid_files))]

        for in_, out_ in zip(
                test_files + train_files + valid_files,
                test_files_out + train_files_out + valid_files_out):
            moving(in_, out_)
        shutil.move(
            os.path.join(chrom, "_meta.txt.gz"),
            os.path.join(path_subsets, "_meta_"+chrom_name+".txt.gz"))
        shutil.move(
            os.path.join(chrom, "_comments.txt"),
            os.path.join(path_subsets, "_comments_"+chrom_name+".txt"))
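A minimal usage sketch for make_subsets; the paths and proportions below are hypothetical:

# Hypothetical paths; assumes one sub-directory per chromosome under
# the input directory, each holding the per-sample .txt.gz files.
make_subsets(
    "/data/encoded_files",
    "/data/Subsets",
    proportions={"test": 0.2, "train": 0.7, "valid": 0.1},
    copy=True)  # copy instead of move, keeping the originals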
def test_simple_ls(self):
    # shallow copy of the class attribute
    files_in_current_folder = self.total_elements[:]
    files_in_current_folder.sort()
    results = gt.list_elements(self.path_to_playground,
                               sort="alphanumeric")
    self.assertEqual(results, files_in_current_folder)
def test_ls_with_type_file(self):
    files_in_current_folder = self.files["total"][:]
    files_in_current_folder = gt._natural_sort(files_in_current_folder)
    results = gt.list_elements(self.path_to_playground,
                               type_="file",
                               sort="natural")
    self.assertEqual(results, files_in_current_folder)
def example_2(VERBOSE=True):
    subprocess.call("python " +
                    os.path.join(os.path.dirname(__file__), "setup_ex_env.py"),
                    shell=True)
    list_chrs = gt.list_elements(PATH_TO_PLAYGROUND,
                                 type_="file",
                                 extension=".vcf.gz")
    list_chrs = [os.path.basename(i).split(".")[0] for i in list_chrs]

    vcf.split_vcf_files(PATH_TO_PLAYGROUND, verbose=False)

    for chr_to_be_processed in list_chrs:
        print("###########################################\n" +
              "Processing chr {}".format(chr_to_be_processed))
        path_to_data = os.path.join(PATH_TO_PLAYGROUND, "split_by_chr",
                                    str(chr_to_be_processed))
        encoding.encode_file_positions(chr_to_be_processed,
                                       path_to_data,
                                       PATH_TO_PLAYGROUND,
                                       verbose=VERBOSE)
        encoding.verify_decoding(os.path.join(PATH_TO_PLAYGROUND,
                                              "split_by_chr"),
                                 os.path.join(PATH_TO_PLAYGROUND,
                                              "encoded_files"),
                                 str(chr_to_be_processed),
                                 nb_of_tests_per_file=10,
                                 max_nb_of_files_to_test=100,
                                 verbose=VERBOSE)
def test_ls_with_extensions(self):
    files_in_current_folder = self.files[".py"]
    files_in_current_folder = gt._natural_sort(files_in_current_folder)

    results = gt.list_elements(self.path_to_playground,
                               sort="natural",
                               extension=".py")
    self.assertEqual(results, files_in_current_folder)
def test_ls_with_exception(self):
    files_in_current_folder = self.total_elements[:]
    files_in_current_folder = gt._natural_sort(files_in_current_folder)
    exception = random.sample(files_in_current_folder,
                              len(files_in_current_folder) // 5)
    files_in_current_folder = [
        x for x in files_in_current_folder if x not in exception
    ]
    results = gt.list_elements(self.path_to_playground,
                               sort="natural",
                               exception=exception)
    self.assertEqual(results, files_in_current_folder)
Example #7
def _concat_meta(path_in, path_out):
    # Concatenate the per-chromosome _meta files, writing the header only once.
    meta_files = gt.list_elements(path_in, type_="file", extension=".txt.gz")
    header_written = False
    with gzip.open(os.path.join(path_out, "_meta.txt.gz"), 'wb') as out_file:
        for file in meta_files:
            with gzip.open(file, "rb") as in_file:
                for line in in_file:
                    try:
                        is_comment = line.startswith('#')
                    except TypeError:
                        is_comment = line.startswith(b'#')
                    if not is_comment or not header_written:
                        out_file.write(line)
                        header_written = True
Example #8
def sort_chr_files_by_sample(
        path_to_data_by_chr,
        make_copy=False,
        force=False):
    if (
            not path_to_data_by_chr.endswith("split_by_chr") and
            not path_to_data_by_chr[:-1].endswith("split_by_chr") and
            not force):
        raise ValueError(
            "Directory name not recognized, directory name should be "
            "split_by_chr")
    path_to_data_by_sample = os.path.join(os.path.dirname(
        path_to_data_by_chr),
        "split_by_sample")
    if not os.path.isdir(path_to_data_by_sample):
        os.mkdir(path_to_data_by_sample)
    else:
        shutil.rmtree(path_to_data_by_sample)
        os.mkdir(path_to_data_by_sample)
    chromosomes_dirs = gt.list_elements(path_to_data_by_chr, type_="dir")
    for folder in chromosomes_dirs:
        dir_name = os.path.basename(folder)
        files = gt.list_elements(folder, type_="file")
        for file in files:
            file_name = os.path.basename(file)
            destination = os.path.join(
                path_to_data_by_sample,
                file_name.split(".")[0])
            if not os.path.isdir(destination):
                os.mkdir(destination)
            new_name = os.path.join(destination, "chr"+dir_name+"_"+file_name)
            if make_copy:
                shutil.copy(file, new_name)
            else:
                shutil.move(file, new_name)
    if not make_copy:
        shutil.rmtree(path_to_data_by_chr)
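A minimal usage sketch, with a hypothetical path; make_copy=True keeps the original split_by_chr tree intact:

# Hypothetical path; the directory must be named split_by_chr
# (or pass force=True to bypass the name check).
sort_chr_files_by_sample("/data/split_by_chr", make_copy=True)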
Example #9
def concatenate_split_files(
        path_to_input,
        tree_structure="by_sample",
        make_copy=False,
        force=False):
    if not force:
        if tree_structure not in ("by_sample", "by_chr"):
            raise ValueError(
                "{0} is not managed by this function".format(tree_structure))
        if (
                (
                    not path_to_input.endswith("split_by_chr") and
                    not path_to_input[:-1].endswith("split_by_chr") and
                    tree_structure == "by_chr") or
                (
                    not path_to_input.endswith("split_by_sample") and
                    not path_to_input[:-1].endswith("split_by_sample") and
                    tree_structure == "by_sample")):
            raise ValueError(
                "Inconsistent combination of path and tree structure.\n"
                "Received {0} and {1}".format(path_to_input, tree_structure))

    path_to_concat_data = os.path.join(os.path.dirname(
        path_to_input),
        "split_and_concat_data")
    if not os.path.isdir(path_to_concat_data):
        os.mkdir(path_to_concat_data)
    else:
        shutil.rmtree(path_to_concat_data)
        os.mkdir(path_to_concat_data)

    list_users = gt.list_elements(
        path_to_input,
        type_="dir",
        exception=[os.path.join(
            path_to_input, "_meta"),
            os.path.join(path_to_input, "_comments")])

    _concat_meta(
        os.path.join(path_to_input, "_meta"),
        path_to_concat_data)

    _copy_comment(
        os.path.join(path_to_input, "_comments"),
        path_to_concat_data)

    for user in list_users:

        split_files = gt.list_elements(user, type_="file", extension=".txt.gz")
        name_user = os.path.basename(split_files[0])
        name_user = name_user.replace(
            re.sub(r"[^a-z\d]", "", re.search(r"^[^_]*", name_user).group(0)),
            "allchr")
        with gzip.open(
                os.path.join(path_to_concat_data, name_user), 'wb') as outfile:
            for file in split_files:
                with gzip.open(file, "rb") as infile:
                    outfile.write(infile.read())
        nb_lines_out = gt.get_nb_lines_file(
            os.path.join(path_to_concat_data, name_user))
        nb_lines_in = 0
        for file in split_files:
            nb_lines_in += gt.get_nb_lines_file(file)
        if nb_lines_in != nb_lines_out:
            sys.stderr.write(
                "Line counts of the original files and the concatenated\n"
                "file do not match.\n"
                "Origin: {0}, Concat: {1}\n".format(nb_lines_in, nb_lines_out))
        if not make_copy:
            shutil.rmtree(user)
    if not make_copy:
        shutil.rmtree(path_to_input)
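A minimal usage sketch with a hypothetical path; tree_structure has to match the directory name unless force=True:

# Hypothetical path; concatenates each sample's per-chromosome files
# into one file under split_and_concat_data.
concatenate_split_files(
    "/data/split_by_sample",
    tree_structure="by_sample",
    make_copy=True)  # keep the split files after concatenation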
Example #10
def _copy_comment(path_in, path_out):
    # gt.list_elements returns full paths, so the first match is copied as-is.
    comment_files = gt.list_elements(path_in, extension=".txt")
    shutil.copy(
        comment_files[0],
        os.path.join(path_out, "_comments.txt"))
Example #11
def verify_decoding(
        path_to_original_data,
        path_to_encoded_data,
        chromosome_verified,
        max_nb_of_files_to_test=100,
        nb_of_tests_per_file=100,
        verbose=True,
        printing=True,
        logging=False):

    print_parameters = {
        "verbose": verbose,
        "printing": printing,
        "logging": logging,
        "in_loop": False
    }

    errors_file = []
    errors_sup_pos = []
    errors_real_pos = []
    errors_type = []
    errors_prev_pos = []
    errors_next_pos = []

    gt.custom_output(
        "Function {0} started at {1}".format(
            verify_decoding.__name__,
            str(datetime.datetime.now())) +
        "\nTesting files in {0}:".format(
            path_to_encoded_data),
        **print_parameters)
    timer = gt.time_since_first_call()
    next(timer)

    _meta = pd.read_csv(
        os.path.join(
            path_to_original_data,
            chromosome_verified,
            "_meta.txt.gz"),
        sep="\t",
        index_col=False).drop(
        ["#CHROM", "ID", "QUAL", "FILTER", "INFO", "FORMAT"], axis=1)

    files = gt.list_elements(
        os.path.join(
            path_to_encoded_data,
            chromosome_verified),
        extension=".txt.gz")

    print_parameters["in_loop"] = True
    for j in range(min(max_nb_of_files_to_test, len(files))):
        testfile = random.choice(files)
        name = testfile.split("/")[-1].split(".")[0]
        _meta["originaldata"] = pd.read_csv(
            os.path.join(
                path_to_original_data,
                chromosome_verified,
                name+"_"+name+".txt.gz"),
            index_col=None, header=None)
        _meta["totest"] = pd.read_csv(testfile, index_col=None, header=None)
        for i in range(nb_of_tests_per_file):
            to_test = random.choice(_meta.totest.tolist())
            allele_1, allele_2, position = decode_position_int(to_test)

            if position == -1:
                index = _meta.loc[
                        (_meta.totest == to_test), :].index.tolist()[0]
                errors_file.append(testfile)
                errors_sup_pos.append(position)
                errors_real_pos.append(_meta.iloc[max(index, 0), 0])
                errors_type.append("Impossible to decode")
                errors_prev_pos.append(_meta.iloc[max(index - 1, 0), 0])
                errors_next_pos.append(
                    _meta.iloc[min(index + 1, _meta.shape[0] - 1), 0])

                gt.custom_output("{0}/{1} files tested. Date : {2}".format(
                    j+1,
                    min(max_nb_of_files_to_test, len(files)),
                    str(datetime.datetime.now())),
                    **print_parameters)
                continue
            original_alleles =\
                _meta.loc[(_meta.totest == to_test), :]["originaldata"].tolist()[0].split("/")
            original_pos =\
                _meta.loc[(_meta.totest == to_test), :]["POS"].tolist()[0]
            ref = _meta.loc[(_meta.totest == to_test), :]["REF"].tolist()[0]
            alt = _meta.loc[(_meta.totest == to_test), :]["ALT"].tolist()[0]

            if position != original_pos:

                index = _meta.loc[
                        (_meta.totest == to_test), :].index.tolist()[0]
                errors_file.append(testfile)
                errors_sup_pos.append(position)
                errors_real_pos.append(_meta.iloc[max(index, 0), 0])
                errors_type.append("Position")
                errors_prev_pos.append(_meta.iloc[max(index - 1, 0), 0])
                errors_next_pos.append(
                    _meta.iloc[min(index + 1, _meta.shape[0] - 1), 0])
            if ((original_alleles[0] == "0") and (allele_1 != ref)) or\
                    ((original_alleles[0] == "1") and (allele_1 != alt)):
                index = _meta.loc[
                        (_meta.totest == to_test), :].index.tolist()[0]
                errors_file.append(testfile)
                errors_sup_pos.append(position)
                errors_real_pos.append(_meta.iloc[max(index, 0), 0])
                errors_type.append("Allele 1")
                errors_prev_pos.append(_meta.iloc[max(index - 1, 0), 0])
                errors_next_pos.append(
                    _meta.iloc[min(index + 1, _meta.shape[0] - 1), 0])
            if ((original_alleles[-1] == "0") and (allele_2 != ref)) or\
                    ((original_alleles[-1] == "1") and (allele_2 != alt)):

                index = _meta.loc[
                        (_meta.totest == to_test), :].index.tolist()[0]
                errors_file.append(testfile)
                errors_sup_pos.append(position)
                errors_real_pos.append(_meta.iloc[max(index, 0), 0])
                errors_type.append("Allele 2")
                errors_prev_pos.append(_meta.iloc[max(index - 1, 0), 0])
                errors_next_pos.append(
                    _meta.iloc[min(index + 1, _meta.shape[0] - 1), 0])

        h, m, s = next(timer)
        gt.custom_output(
            "{0}/{1} files tested ".format(
                j+1,
                min(max_nb_of_files_to_test, len(files))) +
            "after  {0}h{1}m{2}s. ".format(h, m, s) +
            "Date : {0}".format(str(datetime.datetime.now())),
            **print_parameters)

    print_parameters["in_loop"] = False

    errors = pd.DataFrame({"File": errors_file,
                           "Supposed_position": errors_sup_pos,
                           "Real_position": errors_real_pos,
                           "Error_type": errors_type,
                           "Previous_positions": errors_prev_pos,
                           "Next_position": errors_next_pos})
    if not errors.empty:
        errors_al_1 = errors.loc[(errors.Error_type == "Allele 1"), :].shape[0]
        errors_al_2 = errors.loc[(errors.Error_type == "Allele 2"), :].shape[0]
        errors_pos = errors.loc[(errors.Error_type == "Position"), :].shape[0]
        impossible_to_decode =\
            errors.loc[
                (errors.Error_type == "Impossible to decode"), :].shape[0]
        total_error = errors.shape[0]
        gt.custom_output(
            "\nAllele 1 errors: {}".format(errors_al_1) +
            "\nAllele 2 errors: {}".format(errors_al_2) +
            "\nPosition errors: {}".format(errors_pos) +
            "\nImpossible to decode: {}".format(impossible_to_decode) +
            "\nTotal errors: {}".format(total_error) +
            "\nIn total: {}% errors !\n".format(
                100*total_error/(nb_of_tests_per_file*min(
                    max_nb_of_files_to_test,
                    len(files)))),
            **print_parameters
        )
        print("Date : {}".format(str(datetime.datetime.now())))
        errors.to_csv(
            "Errors_found_in{}.csv".format(chromosome_verified), sep="\t")
    else:
        gt.custom_output("\nNo error found !", **print_parameters)
def test_sorted_ls(self):
    files_in_current_folder = self.total_elements[:]
    files_in_current_folder = gt._natural_sort(files_in_current_folder)
    results = gt.list_elements(self.path_to_playground, sort="natural")
    self.assertEqual(results, files_in_current_folder)
Example #13
def mask_data(path_data,
              fraction_pass,
              path_output=None,
              prefix_subset=None,
              verbose=False,
              logging=False):
    """ fraction_pass = nb between 0 and 1
    This function builds the output directory based on the name of the input dir
    and the prefix.
    """
    print("Starting to filter data from {0} at {1}. ({2} pass)".format(
        path_data, datetime.datetime.now(), fraction_pass))

    if prefix_subset is None:
        prefix_subset = str(int(100 * fraction_pass)) + "PER_"
    out_dir_name = prefix_subset + os.path.basename(path_data)
    if path_output is None:
        path_output = os.path.join(os.path.dirname(path_data), out_dir_name)
    copy_output_tree_struct(path_data, path_output)

    i = 0

    chromosomes = gt.list_elements(path_data,
                                   type_='dir',
                                   exception=[
                                       os.path.join(path_data, "floatfiles"),
                                       os.path.join(path_data, "encodeddata"),
                                       os.path.join(path_data, "Subsets")
                                   ])

    for chrom in chromosomes:

        chrom_name = os.path.basename(chrom)
        files = gt.list_elements(chrom, extension='.txt.gz')

        for sample in files:

            name_sample = sample.split("/")[-1].split(".")[0].split("_")[-1]
            nb_lines = gt.get_nb_lines_file(sample)
            subset_of_lines = random.sample(
                range(nb_lines), int(math.floor(nb_lines * fraction_pass)))

            with gzip.open(sample, "rt") as infile,\
                open(os.path.join(
                        path_output,
                        chrom_name,
                        prefix_subset+name_sample+".txt"),
                     "w") as outfile:

                lines = infile.readlines()

                for index in subset_of_lines:
                    outfile.write(lines[index])
            subprocess.call("gzip {}".format(
                os.path.join(path_output, chrom_name,
                             prefix_subset + name_sample + ".txt")),
                            shell=True)

            i += 1
            if not logging:
                gt.print_progress(i,
                                  len(chromosomes) * len(files),
                                  decimals=3)
            elif verbose:
                print("{0}/{1} files masked. Date : {2}".format(
                    i,
                    len(chromosomes) * len(files),
                    str(datetime.datetime.now())))

    print("\nData from {0} filtered at {1}. ({2} pass)".format(
        path_data, datetime.datetime.now(), fraction_pass))
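A minimal usage sketch with a hypothetical path; this keeps roughly 20% of the lines of every sample file:

# Hypothetical path; output goes to a sibling directory named
# 20PER_<input directory name>.
mask_data("/data/encoded_files", fraction_pass=0.2, verbose=True)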
import os
import shutil
import sys

# Assumption: cmd_subfolder starts from this file's directory before
# walking one level up.
cmd_subfolder = os.path.abspath(os.path.dirname(__file__))
cmd_subfolder = os.path.dirname(cmd_subfolder)
try:
    from pydeepgenomics.tools import generaltools as gt
except ImportError:
    if cmd_subfolder not in sys.path:
        sys.path.append(cmd_subfolder)
    from pydeepgenomics.tools import generaltools as gt

if __name__ == "__main__":
    print("Initializating the playground ...")
    PATH_TO_VCF = os.path.join(
        cmd_subfolder,
        "alltests",
        "sim_data",
        "vcf_files")
    PATH_TO_OUTPUT = os.path.join(
        os.path.dirname(os.path.abspath(__file__)),
        "playground")

    if not os.path.isdir(PATH_TO_OUTPUT):
        os.mkdir(PATH_TO_OUTPUT)
    else:
        shutil.rmtree(PATH_TO_OUTPUT)
        os.mkdir(PATH_TO_OUTPUT)

    vcf_files = gt.list_elements(PATH_TO_VCF, type_="file", extension=".vcf.gz")
    for file in vcf_files:
        shutil.copy(file, PATH_TO_OUTPUT)
        name_file = os.path.basename(file)
        copied_file = os.path.join(PATH_TO_OUTPUT, name_file)
Example #15
def extract_snps_from_file(
        path_to_vcf_files,
        ref_snps_file,
        sep=",",
        chr_nb_prefix="",
        chr_nb_suffix=".vcf.gz"):

    """
    Load list of snp in a dataframe
    (column1 = snp ref name, column2 = chr, column3 = position)
    Also convert snpref to strings
    """

    # Clean previous results
    subprocess.call(
        "rm -rf " + os.path.join(path_to_vcf_files, "SelectionofSNPs"),
        shell=True)

    ref_snps = pd.read_csv(ref_snps_file, sep=sep)

    # Looking at the files present in the directory before working on them
    vcf_files = gt.list_elements(
        path_to_vcf_files,
        type_="file",
        extension=".vcf.gz")

    # Find the reference SNPs in the actual vcf files
    matching_references, matching_positions = find_matches(vcf_files, ref_snps)

    # Save the list of SNPs found
    if not os.path.isdir(os.path.join(path_to_vcf_files, "SelectionofSNPs")):
        os.mkdir(os.path.join(path_to_vcf_files, "SelectionofSNPs"))
    matching_positions.to_csv(
        path_or_buf=os.path.join(
            path_to_vcf_files,
            "SelectionofSNPs",
            "MatchingPositions.csv"),
        sep="\t",
        index=False)
    matching_references.to_csv(
        path_or_buf=os.path.join(
            path_to_vcf_files,
            "SelectionofSNPs",
            "MatchingReferences.csv"),
        sep="\t",
        index=False)

    for files in vcf_files:

        print("Starting to process file : {}".format(files))
        chrnb = int(re.sub(
            chr_nb_prefix,
            '',
            re.sub(chr_nb_suffix, '', os.path.basename(files))))

        if not os.path.isdir(os.path.join(
                path_to_vcf_files, "SelectionofSNPs", "ID")):
            os.mkdir(os.path.join(path_to_vcf_files, "SelectionofSNPs", "ID"))

        if not os.path.isdir(os.path.join(
                path_to_vcf_files, "SelectionofSNPs", "POS")):
            os.mkdir(os.path.join(path_to_vcf_files, "SelectionofSNPs", "POS"))

        output_file_id = os.path.join(
            path_to_vcf_files,
            "SelectionofSNPs",
            "ID",
            str(chrnb)+"_subset_ID.vcf")
        output_file_pos = os.path.join(
            path_to_vcf_files,
            "SelectionofSNPs",
            "POS",
            str(chrnb) + "_subset_POS.vcf")

        # Copy header and column labels in a subset file
        # (<chrnb>.subset_<type of selection criteria>.vcf) in
        # PATH/SelectionofSNPs

        with gzip.open(files, "r") as fi:
            with open(output_file_id, "w") as fo:
                for line in fi:
                    if line.startswith("#"):
                        fo.write(line)
                    else:
                        break
        subprocess.call(
            "cp {0} {1}".format(output_file_id, output_file_pos),
            shell=True)

        # Filter dataframe to have only snps corresponding to the file

        filtered_match_pos = matching_positions[
            matching_positions[VCF_HEADER[0]] == chrnb]
        filtered_match_ref = matching_references[
            matching_references[VCF_HEADER[0]] == chrnb]

        with gzip.open(files, "rt") as origin_vcf:
            lines_from_origin_vcf = origin_vcf.readlines()
        out_pos = open(output_file_pos, 'a')
        out_id = open(output_file_id, 'a')
        it_positions = 0
        it_ids = 0

        print("Extract corresponding positions")
        for line_nb in filtered_match_pos["Corresponding row in vcf file"]:

            out_pos.write(lines_from_origin_vcf[line_nb-1])
            it_positions += 1
            print("Information on {0}/{1} matching positions copied".format(
                it_positions,
                matching_positions.shape[0]))

        print("Extract corresponding references")
        for line_nb in filtered_match_ref["Corresponding row in vcf file"]:
            out_id.write(lines_from_origin_vcf[line_nb-1])
            it_ids += 1
            print("Information on {0}/{1} matching references copied".format(
                it_ids,
                matching_references.shape[0]))

        out_pos.close()
        out_id.close()
        # compress the output file to .gz
        subprocess.call("gzip {}".format(output_file_pos), shell=True)
        subprocess.call("gzip {}".format(output_file_id), shell=True)
def test_ls_with_type_dir(self):
    dirs_in_current_folder = self.dirs
    dirs_in_current_folder = gt._natural_sort(dirs_in_current_folder)
    results = gt.list_elements(self.path_to_playground, type_="dir")
    self.assertEqual(results, dirs_in_current_folder)
Example #17
import gzip
import math
import os
import sys

try:
    from pydeepgenomics.tools import generaltools as gt
except ImportError:
    cmd_dir = os.path.abspath(os.path.dirname(__file__)).split("alltests")[0]
    if cmd_dir not in sys.path:
        sys.path.append(cmd_dir)
    from pydeepgenomics.tools import generaltools as gt

if __name__ == "__main__":

    prob_rich_region_to_normal = 0.03
    prob_normal_to_rich_region = 0.03

    files = gt.list_elements(os.path.abspath("."), extension=".vcf")
    header_size = 6

    for index_f, file in enumerate(files):

        size = gt.get_nb_lines_file(file) - header_size
        size_chromosome_a_priori = 200000 / math.pow(1.1, index_f)
        mean_distance = size_chromosome_a_priori / size

        with open(file, "r") as in_file, gzip.open(file + ".gz",
                                                   "wb") as out_file:

            in_rich = False
            position = 0
            for index_l, line in enumerate(in_file):
Example #18
def encode_file_positions(
        chr_to_be_processed,
        path_to_data,
        path_to_output,
        name_output_dir="encoded_files",
        verbose=True,
        printing=True,
        logging=False):

    # Start timer
    timer = gt.time_since_first_call()
    next(timer)
    print_parameters = {
        "verbose": verbose,
        "printing": printing,
        "logging": logging,
        "in_loop": False
    }

    gt.custom_output(
        "Function {0} started at {1}".format(
            encode_file_positions.__name__,
            str(datetime.datetime.now())) +
        "\nProcessing files in {0}:".format(path_to_data), **print_parameters)

    chromosome_name = str(chr_to_be_processed)
    _build_output_tree_structure(
        path_to_output,
        name_output_dir,
        chromosome_name)

    # load the meta data in a pandas data frame
    _meta = pd.read_csv(
        os.path.join(path_to_data, "_meta.txt.gz"),
        sep="\t",
        index_col=False)
    list_files = gt.list_elements(
        path_to_data,
        extension=".txt.gz",
        exception=[
            os.path.join(path_to_data, "_meta.txt.gz"),
            os.path.join(path_to_data, "_comments.txt.gz")])

    nb_processed_files = 0
    batch_iter = 0
    list_ = []
    df = _meta.drop(
        ["#CHROM", "ID", "QUAL", "FILTER", "INFO", "FORMAT"], axis=1)

    for files in list_files:
        print_parameters["in_loop"] = True
        sample_name = files.split("/")[-1].split(".")[0].split("_")[-1]
        list_.append(sample_name)

        df[sample_name] = pd.read_csv(files, index_col=None, header=None)

        if (batch_iter < settings.FILEBATCHSIZE - 1) and \
                (files is not list_files[-1]):
            batch_iter += 1
        else:
            # Encode the accumulated batch, write it out, reset accumulators
            encoded_data = do_conversion(df, list_, output_conversion="to_int")
            write_encoded_output(
                path_to_output,
                chromosome_name,
                encoded_data,
                list_,
                namedir=name_output_dir)
            batch_iter = 0
            list_ = []
            df = _meta.drop(
                ["#CHROM", "ID", "QUAL", "FILTER", "INFO", "FORMAT"], axis=1)
            nb_processed_files += settings.FILEBATCHSIZE

            h, m, s = next(timer)
            gt.custom_output(
                "\r{0}/{1}".format(nb_processed_files, len(list_files)) +
                " files processed after {0}h{1}m{2}s.".format(h, m, s) +
                " Date: {}".format(str(datetime.datetime.now())),
                **print_parameters)

    print_parameters["in_loop"] = False
    sys.stdout.write("\n")
    h, m, s = next(timer)
    gt.custom_output(
        "Finished after {0}h{1}m{2}s.\n".format(h, m, s),
        **print_parameters)
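A minimal usage sketch mirroring the call in example_2 above; the paths are hypothetical:

# Hypothetical paths; encoded files land in /data/output/encoded_files/21.
encode_file_positions(
    "21",                     # chromosome to process
    "/data/split_by_chr/21",  # per-chromosome input files
    "/data/output",
    verbose=True)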