コード例 #1
0
ファイル: DartPrepare.py プロジェクト: esteinig/dartQC
    def _reindex(self):
        """Shift every scheme value up by one and log the result for the user.

        Each value in ``self.scheme`` is replaced by ``int(value + 1)``; the
        entries are then logged in ascending order of their new value.
        """

        shifted = {}
        for key, index in self.scheme.items():
            shifted[key] = int(index + 1)
        self.scheme = shifted

        # Log in ascending index order so the listing follows the file layout.
        ordered = sorted(shifted.items(), key=operator.itemgetter(1))
        for key, index in ordered:
            stamp(key, "=", index)

        stamp("Please check these values in your data to ensure correct input for DartQC.")
コード例 #2
0
    def __init__(self, data, attributes):
        """Initialise the module on top of the QualityControl base state.

        :param data: forwarded unchanged to ``QualityControl.__init__``
        :param attributes: forwarded unchanged to ``QualityControl.__init__``
        """

        QualityControl.__init__(self, data, attributes)

        self.name = "individual"

        # Register an empty entry for this module under its own name.
        module_registry = self.attributes["modules"]
        module_registry[self.name] = {}

        self._set_log()

        stamp("Inititating Sample Module.")
コード例 #3
0
    def _reindex(self):
        """Shift every scheme value up by one and log the result for the user.

        Each value in ``self.scheme`` is replaced by ``int(value + 1)``; the
        entries are then logged in ascending order of their new value.
        """

        shifted = {}
        for key, index in self.scheme.items():
            shifted[key] = int(index + 1)
        self.scheme = shifted

        # Log in ascending index order so the listing follows the file layout.
        ordered = sorted(shifted.items(), key=operator.itemgetter(1))
        for key, index in ordered:
            stamp(key, "=", index)

        stamp("Please check these values in your data to ensure correct input for DartQC.")
コード例 #4
0
ファイル: DartModules.py プロジェクト: esteinig/dartQC
    def __init__(self, data, attributes):
        """Initialise the module on top of the QualityControl base state.

        :param data: forwarded unchanged to ``QualityControl.__init__``
        :param attributes: forwarded unchanged to ``QualityControl.__init__``
        """

        QualityControl.__init__(self, data, attributes)

        self.name = "individual"

        # Register an empty entry for this module under its own name.
        module_registry = self.attributes["modules"]
        module_registry[self.name] = {}

        self._set_log()

        stamp("Inititating Sample Module.")
コード例 #5
0
    def _get_pop_results(self):
        """Read the Population Module setting and removal result from the attributes.

        :return: tuple ``(params, removed)``; both are dicts with the single key
                 "monomorphic". Both values are None when any expected key is
                 missing from ``self.attributes``.
        """

        try:
            population = self.attributes["modules"]["population"]
            removed_value = population["results"]["removed"]
            setting_value = population["settings"]["value"]
        except KeyError:
            stamp("Could not detect results for Population Module, skipping...")
            return {"monomorphic": None}, {"monomorphic": None}

        return {"monomorphic": setting_value}, {"monomorphic": removed_value}
コード例 #6
0
ファイル: DartModules.py プロジェクト: esteinig/dartQC
    def _get_pop_results(self):
        """Read the Population Module setting and removal result from the attributes.

        :return: tuple ``(params, removed)``; both are dicts with the single key
                 "monomorphic". Both values are None when any expected key is
                 missing from ``self.attributes``.
        """

        try:
            population = self.attributes["modules"]["population"]
            removed_value = population["results"]["removed"]
            setting_value = population["settings"]["value"]
        except KeyError:
            stamp("Could not detect results for Population Module, skipping...")
            return {"monomorphic": None}, {"monomorphic": None}

        return {"monomorphic": setting_value}, {"monomorphic": removed_value}
コード例 #7
0
ファイル: DartPrepare.py プロジェクト: esteinig/dartQC
    def _write_scheme(self):
        """Dump ``self.scheme`` as indented JSON into ``self.output_path``.

        The file is named ``<prefix>_scheme.json``, where the prefix is
        ``self.output_name`` or, when that is None, the base name of
        ``self.file_path`` without its extension.
        """

        prefix = self.output_name
        if prefix is None:
            prefix = os.path.splitext(os.path.basename(self.file_path))[0]

        out_file = os.path.join(self.output_path, prefix + "_scheme.json")

        stamp("Writing scheme to:", out_file)

        with open(out_file, "w") as handle:
            json.dump(self.scheme, handle, indent=4)
コード例 #8
0
    def _write_scheme(self):
        """Dump ``self.scheme`` as indented JSON into ``self.output_path``.

        The file is named ``<prefix>_scheme.json``, where the prefix is
        ``self.output_name`` or, when that is None, the base name of
        ``self.file_path`` without its extension.
        """

        prefix = self.output_name
        if prefix is None:
            prefix = os.path.splitext(os.path.basename(self.file_path))[0]

        out_file = os.path.join(self.output_path, prefix + "_scheme.json")

        stamp("Writing scheme to:", out_file)

        with open(out_file, "w") as handle:
            json.dump(self.scheme, handle, indent=4)
コード例 #9
0
    def _get_preprocessing_results(self):
        """Read the Preprocessing Module settings and results from the attributes.

        :return: tuple ``(params, removed)``; both are dicts keyed by
                 "preprocess", "calls" and "missing". All values are None when
                 any expected key is missing from ``self.attributes``.
        """

        keys = ("preprocess", "calls", "missing")

        try:
            settings = self.attributes["modules"]["preprocessor"]["settings"]
            # The results are read from inside the settings entry.
            results = settings["results"]

            params = {
                "preprocess": settings["read_count_sum_threshold"],
                "calls": results["total_calls"],
                "missing": results["before_missing"],
            }
            # Every "removed" entry carries the same replaced-calls value.
            removed = dict.fromkeys(keys, results["replaced_calls"])
        except KeyError:
            stamp("Could not detect results for Preprocessing Module, skipping...")
            params = dict.fromkeys(keys)
            removed = dict.fromkeys(keys)

        return params, removed
コード例 #10
0
ファイル: DartModules.py プロジェクト: esteinig/dartQC
    def _get_preprocessing_results(self):
        """Read the Preprocessing Module settings and results from the attributes.

        :return: tuple ``(params, removed)``; both are dicts keyed by
                 "preprocess", "calls" and "missing". All values are None when
                 any expected key is missing from ``self.attributes``.
        """

        keys = ("preprocess", "calls", "missing")

        try:
            settings = self.attributes["modules"]["preprocessor"]["settings"]
            # The results are read from inside the settings entry.
            results = settings["results"]

            params = {
                "preprocess": settings["read_count_sum_threshold"],
                "calls": results["total_calls"],
                "missing": results["before_missing"],
            }
            # Every "removed" entry carries the same replaced-calls value.
            removed = dict.fromkeys(keys, results["replaced_calls"])
        except KeyError:
            stamp("Could not detect results for Preprocessing Module, skipping...")
            params = dict.fromkeys(keys)
            removed = dict.fromkeys(keys)

        return params, removed
コード例 #11
0
ファイル: DartModules.py プロジェクト: esteinig/dartQC
    def _get_sample_results(self):
        """Read the Sample Module (MIND) setting and results from the attributes.

        :return: tuple ``(params, removed)``; both are dicts keyed by "mind" and
                 "samples". All values are None when any expected key is
                 missing from ``self.attributes``.
        """

        try:
            # NOTE(review): this reads the top-level "individual" key, whereas the
            # other result getters read under "modules" - confirm this is intended.
            individual = self.attributes["individual"]
            mind_results = individual["results"]["mind"]

            removed_samples = mind_results["removed_samples"]
            removed = {"mind": removed_samples, "samples": removed_samples}

            original_names = individual["states"]["mind"]["sample_names_original"]
            params = {"mind": mind_results["value"], "samples": len(original_names)}
        except KeyError:
            stamp("Could not detect results for Sample Module, skipping...")

            params = {"mind": None, "samples": None}
            removed = {"mind": None, "samples": None}

        return params, removed
コード例 #12
0
ファイル: DartModules.py プロジェクト: esteinig/dartQC
    def _get_redundancy_results(self):
        """Read the Redundancy Module settings and removal results from the attributes.

        :return: tuple ``(params, removed)``; both are dicts keyed by "clusters",
                 "duplicates" and "identity". ``removed["identity"]`` is always
                 None. All values are None when any expected key is missing
                 from ``self.attributes``.
        """

        try:
            redundancy = self.attributes["modules"]["redundancy"]

            settings = redundancy["settings"]
            # The key is "identity" (no trailing colon) so that the success path
            # returns the same keys as the fallback below and as `removed`.
            params = {"clusters": settings["clusters"], "duplicates": settings["duplicates"],
                      "identity": settings["identity"]}

            results = redundancy["results"]
            removed = {"clusters": results["clusters"]["removed"], "duplicates": results["duplicates"]["removed"],
                       "identity": None}
        except KeyError:
            stamp("Could not detect results for Redundancy Module, skipping...")
            params = {"clusters": None, "duplicates": None, "identity": None}
            removed = {"clusters": None, "duplicates": None, "identity": None}

        return params, removed
コード例 #13
0
    def _get_sample_results(self):
        """Read the Sample Module (MIND) setting and results from the attributes.

        :return: tuple ``(params, removed)``; both are dicts keyed by "mind" and
                 "samples". All values are None when any expected key is
                 missing from ``self.attributes``.
        """

        try:
            # NOTE(review): this reads the top-level "individual" key, whereas the
            # other result getters read under "modules" - confirm this is intended.
            individual = self.attributes["individual"]
            mind_results = individual["results"]["mind"]

            removed_samples = mind_results["removed_samples"]
            removed = {"mind": removed_samples, "samples": removed_samples}

            original_names = individual["states"]["mind"]["sample_names_original"]
            params = {"mind": mind_results["value"], "samples": len(original_names)}
        except KeyError:
            stamp("Could not detect results for Sample Module, skipping...")

            params = {"mind": None, "samples": None}
            removed = {"mind": None, "samples": None}

        return params, removed
コード例 #14
0
    def _get_redundancy_results(self):
        """Read the Redundancy Module settings and removal results from the attributes.

        :return: tuple ``(params, removed)``; both are dicts keyed by "clusters",
                 "duplicates" and "identity". ``removed["identity"]`` is always
                 None. All values are None when any expected key is missing
                 from ``self.attributes``.
        """

        try:
            redundancy = self.attributes["modules"]["redundancy"]

            settings = redundancy["settings"]
            # The key is "identity" (no trailing colon) so that the success path
            # returns the same keys as the fallback below and as `removed`.
            params = {"clusters": settings["clusters"], "duplicates": settings["duplicates"],
                      "identity": settings["identity"]}

            results = redundancy["results"]
            removed = {"clusters": results["clusters"]["removed"], "duplicates": results["duplicates"]["removed"],
                       "identity": None}
        except KeyError:
            stamp("Could not detect results for Redundancy Module, skipping...")
            params = {"clusters": None, "duplicates": None, "identity": None}
            removed = {"clusters": None, "duplicates": None, "identity": None}

        return params, removed
コード例 #15
0
ファイル: DartMessages.py プロジェクト: esteinig/dartQC
    def get_cdhit_message(self, identity, time=True):
        """Announce a CD-HIT run at the given identity threshold.

        :param identity: identity threshold; multiplied by 100 for display
        :param time: when true, log a single line through ``stamp``; otherwise
                     print the multi-line banner to stdout
        """

        percent = identity * 100

        # Format first, dedent afterwards (same order as the banner was always built).
        banner = """
                  CLUSTERING
        -------------------------------

        Running CDHIT-EST...

        Threshold: {0}%

        -------------------------------
        """.format(percent)
        banner = textwrap.dedent(banner)

        if not time:
            print(banner)
            return

        stamp("Running CD-HIT at nucleotide identity {0}%".format(percent))
コード例 #16
0
ファイル: DartModules.py プロジェクト: esteinig/dartQC
    def _get_snp_results(self):
        """Extract the SNP Module parameters and removal results from the attributes.

        :return: tuple ``(params, removed)``. ``params`` maps the first element
                 of every parameter entry to its second element; ``removed``
                 maps every result name to that result's "removed" value. When
                 an expected key is missing, both dicts carry None for "maf",
                 "hwe", "call_rate" and "rep_average".
        """

        try:
            snp = self.attributes["modules"]["snp"]
            outcome = snp["results"]
            entries = snp["settings"]["parameters"]

            params = {}
            for entry in entries:
                params[entry[0]] = entry[1]

            removed = {}
            for name, result in outcome.items():
                removed[name] = result["removed"]

        except KeyError:
            stamp("Could not detect results for SNP Module, skipping...")

            params = {"maf": None, "hwe": None, "call_rate": None, "rep_average": None}
            removed = {"maf": None, "hwe": None, "call_rate": None, "rep_average": None}

        return params, removed
コード例 #17
0
    def get_cdhit_message(self, identity, time=True):
        """Announce a CD-HIT run at the given identity threshold.

        :param identity: identity threshold; multiplied by 100 for display
        :param time: when true, log a single line through ``stamp``; otherwise
                     print the multi-line banner to stdout
        """

        percent = identity * 100

        # Format first, dedent afterwards (same order as the banner was always built).
        banner = """
                  CLUSTERING
        -------------------------------

        Running CDHIT-EST...

        Threshold: {0}%

        -------------------------------
        """.format(percent)
        banner = textwrap.dedent(banner)

        if not time:
            print(banner)
            return

        stamp("Running CD-HIT at nucleotide identity {0}%".format(percent))
コード例 #18
0
    def _get_snp_results(self):
        """Extract the SNP Module parameters and removal results from the attributes.

        :return: tuple ``(params, removed)``. ``params`` maps the first element
                 of every parameter entry to its second element; ``removed``
                 maps every result name to that result's "removed" value. When
                 an expected key is missing, both dicts carry None for "maf",
                 "hwe", "call_rate" and "rep_average".
        """

        try:
            snp = self.attributes["modules"]["snp"]
            outcome = snp["results"]
            entries = snp["settings"]["parameters"]

            params = {}
            for entry in entries:
                params[entry[0]] = entry[1]

            removed = {}
            for name, result in outcome.items():
                removed[name] = result["removed"]

        except KeyError:
            stamp("Could not detect results for SNP Module, skipping...")

            params = {"maf": None, "hwe": None, "call_rate": None, "rep_average": None}
            removed = {"maf": None, "hwe": None, "call_rate": None, "rep_average": None}

        return params, removed
コード例 #19
0
ファイル: DartPrepare.py プロジェクト: esteinig/dartQC
    def _convert_excel(self):
        """Convert the configured Excel sheet to CSV and point ``self.file_path`` at it.

        The CSV is written to ``self.output_path`` under the base name of the
        Excel file with a ``.csv`` extension, UTF-8 encoded and without the
        index column.
        """

        source = self.file_path
        sheet = self.excel_sheet

        stamp("Converting from Excel")
        stamp("File is", source)
        stamp("Sheet is", sheet)

        frame = pandas.read_excel(source, sheet, index_col=None)

        stem = os.path.splitext(os.path.basename(source))[0]
        csv_path = os.path.join(self.output_path, stem + ".csv")

        stamp("Writing to file", csv_path)
        frame.to_csv(csv_path, encoding='utf-8', index=False)

        # Later steps read the converted CSV instead of the Excel file.
        self.file_path = csv_path
コード例 #20
0
    def _get_row_indices(self):
        """
        Locate the header/sample row and the first data row in ``self.top``.

        The first row whose first column is not exactly "*" is taken as the
        header; this assumes:

            - leading rows start with "*"
            - the data starts on the row after the header row
            - samples are specified in the header row (above calls or raw counts)

        Sets ``self.header``, ``self.sample_row`` and ``self.data_row`` and stores
        the two row indices in ``self.scheme``. Nothing is set when every row
        starts with "*".
        """

        stamp("Guessing data configuration:")

        for index, row in self.top.iterrows():
            if row[0] == "*":
                continue

            self.header = row
            self.sample_row = index
            self.data_row = index + 1
            self.scheme["sample_row"] = self.sample_row
            self.scheme["data_row"] = self.data_row
            break
コード例 #21
0
    def get_redundancy_message(self, type, initial, removed, retained, time=True):
        """Report the outcome of one redundancy step.

        :param type: name of the redundancy step; shown upper-cased
        :param initial: value shown as "Initial" (banner only)
        :param removed: value shown as "Removed"
        :param retained: value shown as "Retained"
        :param time: when true, log three lines through ``stamp``; otherwise
                     print the multi-line banner to stdout
        """

        label = type.upper()

        # Format first, dedent afterwards (same order as the banner was always built).
        banner = """
                  REDUNDANCY
        -------------------------------

        {0}

        Initial:    {1}
        Removed:    {2}
        Retained:   {3}

        -------------------------------
        """.format(label, initial, removed, retained)
        banner = textwrap.dedent(banner)

        if not time:
            print(banner)
            return

        stamp("Redundancy module {0}".format(label))
        stamp("Removed {0} SNPs".format(removed))
        stamp("Retained {0} SNPs".format(retained))
コード例 #22
0
    def get_filter_message(self, filter, threshold, initial, removed, retained, time=True):
        """Report the outcome of one SNP filter.

        :param filter: name of the filter; shown upper-cased
        :param threshold: threshold shown next to the filter name
        :param initial: value shown as "Initial" (banner only)
        :param removed: value shown as "Removed"
        :param retained: value shown as "Retained"
        :param time: when true, log three lines through ``stamp``; otherwise
                     print the multi-line banner to stdout
        """

        label = filter.upper()

        # Format first, dedent afterwards (same order as the banner was always built).
        banner = """
                SNP Filter
        -------------------------------

        {0} at {1}

        Initial:    {2}
        Removed:    {3}
        Retained:   {4}

        -------------------------------
        """.format(label, threshold, initial, removed, retained)
        banner = textwrap.dedent(banner)

        if not time:
            print(banner)
            return

        stamp("Filtered {0} at {1}".format(label, threshold))
        stamp("Removed {0} SNPs".format(removed))
        stamp("Retained {0} SNPs".format(retained))
コード例 #23
0
ファイル: DartPrepare.py プロジェクト: esteinig/dartQC
    def _get_row_indices(self):

        """
        Locate the header/sample row and the first data row in ``self.top``.

        The first row whose first column is not exactly "*" is taken as the
        header; this assumes:

            - leading rows start with "*"
            - the data starts on the row after the header row
            - samples are specified in the header row (above calls or raw counts)

        Sets ``self.header``, ``self.sample_row`` and ``self.data_row`` and stores
        the two row indices in ``self.scheme``. Nothing is set when every row
        starts with "*".
        """

        stamp("Guessing data configuration:")

        for index, row in self.top.iterrows():
            if row[0] == "*":
                continue

            self.header = row
            self.sample_row = index
            self.data_row = index + 1
            self.scheme["sample_row"] = self.sample_row
            self.scheme["data_row"] = self.data_row
            break
コード例 #24
0
ファイル: DartModules.py プロジェクト: esteinig/dartQC
    def _run_cdhit(self, fasta_path, identity=0.95, word_size=5, description_length=0, cdhit_path=None):

        """
        Run CDHIT-EST for sequences, install with sudo apt install cd-hit on Ubuntu.

        :param fasta_path: input file passed to ``-i``
        :param identity: identity threshold passed to ``-c``; also part of the output file name
        :param word_size: value passed to ``-n``
        :param description_length: value passed to ``-d``
        :param cdhit_path: executable to call; defaults to "cd-hit-est"
        :return: path of the ``.clstr`` file next to the ``-o`` output in ``self.tmp_path``.
                 The path is returned even when the program exits with an error;
                 in that case a message is logged.
        """

        self.messages.get_cdhit_message(identity)

        if cdhit_path is None:
            cdhit_path = "cd-hit-est"

        file_name = self.project + "_IdentityClusters_" + str(identity)

        out_file = os.path.join(self.tmp_path, file_name)
        cluster_path = os.path.join(self.tmp_path, file_name + '.clstr')

        # Build the argument list once so the logged command is exactly what is run.
        command = [cdhit_path, "-i", fasta_path, "-o", out_file, "-c", str(identity), "-n", str(word_size),
                   "-d", str(description_length)]

        stamp("Calling cd-hit-est: " + " ".join(command))

        # The program's stdout is discarded.
        with open(os.devnull, "w") as devnull:
            exit_status = call(command, stdout=devnull)

        # Report a failed run instead of silently ignoring the exit status.
        if exit_status != 0:
            stamp("cd-hit-est exited with non-zero status", exit_status)

        return cluster_path
コード例 #25
0
    def _convert_excel(self):
        """Convert the configured Excel sheet to CSV and point ``self.file_path`` at it.

        The CSV is written to ``self.output_path`` under the base name of the
        Excel file with a ``.csv`` extension, UTF-8 encoded and without the
        index column.
        """

        source = self.file_path
        sheet = self.excel_sheet

        stamp("Converting from Excel")
        stamp("File is", source)
        stamp("Sheet is", sheet)

        frame = pandas.read_excel(source, sheet, index_col=None)

        stem = os.path.splitext(os.path.basename(source))[0]
        csv_path = os.path.join(self.output_path, stem + ".csv")

        stamp("Writing to file", csv_path)
        frame.to_csv(csv_path, encoding='utf-8', index=False)

        # Later steps read the converted CSV instead of the Excel file.
        self.file_path = csv_path
コード例 #26
0
ファイル: DartModules.py プロジェクト: esteinig/dartQC
    def filter_data(self, mind=0.2, recalculate=True):

        """
        Remove samples whose missing-data value exceeds `mind`.

        TODO: re-write with Pandas.

        :param mind: threshold; samples whose value from ``_calculate_mind()`` is
                     greater than this are removed. None returns the data
                     unfiltered.
        :param recalculate: when true, the filtered data are passed through
                            ``SNPModule(...).get_data(threshold=None)`` before
                            being returned.
        :return: tuple ``(data, attributes)``
        """

        if mind is None:
            stamp("Returning data without filtering.")
            return self.data, self.attributes

        stamp("Filtering samples with missing data >", mind)
        stamp("Missing data calculated over", len(self.data), "SNPs")

        mind_prop = self._calculate_mind()

        to_remove = mind_prop[mind_prop > mind].index.tolist()

        # Set copy for constant-time membership tests; the list is tested once
        # per call of every SNP below.
        removed_indices = set(to_remove)

        filtered_data = {}
        for snp, data in self.data.items():
            # NOTE: the per-SNP dict is updated in place, so self.data sees the
            # filtered calls as well.
            data["calls"] = [snp_call for i, snp_call in self._iterate_call_indices(data["calls"])
                             if i not in removed_indices]
            filtered_data[snp] = data

        attributes = self._adjust_attributes(self.attributes, mind, to_remove)

        # NOTE(review): the percentage is relative to "sample_size" as returned by
        # _adjust_attributes - confirm whether that is the size before or after removal.
        percent_removed = format((len(to_remove) / attributes["sample_size"]) * 100, ".2f")

        stamp("Removed {r} samples out of {t} samples ({p}%)"
              .format(r=len(to_remove), t=attributes["sample_size"], p=percent_removed))

        # Recalculating SNP parameters:

        if recalculate:
            stamp("Recalculating MAF, CALL RATE and HWE for SNPs")
            marker = SNPModule(filtered_data, attributes)
            filtered_data, attributes = marker.get_data(threshold=None)

        return filtered_data, attributes
コード例 #27
0
    def filter_data(self, mind=0.2, recalculate=True):

        """
        Remove samples whose missing-data value exceeds `mind`.

        TODO: re-write with Pandas.

        :param mind: threshold; samples whose value from ``_calculate_mind()`` is
                     greater than this are removed. None returns the data
                     unfiltered.
        :param recalculate: when true, the filtered data are passed through
                            ``SNPModule(...).get_data(threshold=None)`` before
                            being returned.
        :return: tuple ``(data, attributes)``
        """

        if mind is None:
            stamp("Returning data without filtering.")
            return self.data, self.attributes

        stamp("Filtering samples with missing data >", mind)
        stamp("Missing data calculated over", len(self.data), "SNPs")

        mind_prop = self._calculate_mind()

        to_remove = mind_prop[mind_prop > mind].index.tolist()

        # Set copy for constant-time membership tests; the list is tested once
        # per call of every SNP below.
        removed_indices = set(to_remove)

        filtered_data = {}
        for snp, data in self.data.items():
            # NOTE: the per-SNP dict is updated in place, so self.data sees the
            # filtered calls as well.
            data["calls"] = [snp_call for i, snp_call in self._iterate_call_indices(data["calls"])
                             if i not in removed_indices]
            filtered_data[snp] = data

        attributes = self._adjust_attributes(self.attributes, mind, to_remove)

        # NOTE(review): the percentage is relative to "sample_size" as returned by
        # _adjust_attributes - confirm whether that is the size before or after removal.
        percent_removed = format((len(to_remove) / attributes["sample_size"]) * 100, ".2f")

        stamp("Removed {r} samples out of {t} samples ({p}%)"
              .format(r=len(to_remove), t=attributes["sample_size"], p=percent_removed))

        # Recalculating SNP parameters:

        if recalculate:
            stamp("Recalculating MAF, CALL RATE and HWE for SNPs")
            marker = SNPModule(filtered_data, attributes)
            filtered_data, attributes = marker.get_data(threshold=None)

        return filtered_data, attributes
コード例 #28
0
ファイル: DartFileValidation.py プロジェクト: esteinig/dartQC
    def write_seq_vals(self, out_file=None):
        """Write the sequence validation table to CSV and report problem entries on stderr.

        Each entry of ``self.seq_vals`` becomes one row: cluster number,
        reference sequence string, then every field of every sequence in the
        cluster. The sequences of each cluster are sorted in place by the rank
        of their first field (GOOD first, UNKNOWN last).

        :param out_file: destination CSV; defaults to ``<project>_seq_vals.csv``
                         inside ``attributes["out_path"]``
        """

        # Sort rank of each label; rank 1 is reported as a warning, higher ranks as errors.
        rank = {
            "GOOD": 0,
            "BAD ID": 1,
            "BAD LOC": 2,
            "BAD ID & LOC": 3,
            "UNKNOWN": 4
        }

        rows = [["Cluster #", "Ref Seq", "Cluster Sequences..."]]

        for cluster in self.seq_vals:
            members = cluster["sequences"]
            members.sort(key=lambda member: rank[member[0]])

            cells = [str(cluster["cluster_num"]), cluster["ref_seq_str"]]
            for member in members:
                cells.extend(str(field) for field in member)

            for member in members:
                level = rank[member[0]]
                if level == 1:
                    sys.stderr.write("".join(("WARNING: [", member[0], ":", member[1], "] ", member[1],
                                              " renamed to ", cluster["ref_seq_str"], "\n")))
                elif level > 1:
                    sys.stderr.write("".join(("ERROR: [", member[0], ":", member[1],
                                              "] Unexpected clone ID!  This needs manual fixing\n")))

            rows.append(cells)

        if out_file is None:
            default_name = self.attributes["project"] + "_seq_vals.csv"
            out_file = os.path.abspath(os.path.join(self.attributes["out_path"], default_name))

        with open(out_file, 'w') as vals_out:
            csv.writer(vals_out, delimiter=",", lineterminator='\n').writerows(rows)

        print("\n")
        stamp("Sequence ID filtering info written to ", out_file)
        stamp("Look at this file for more information on any ERRORS and WARNINGS")
コード例 #29
0
ファイル: DartWriter.py プロジェクト: blanchedanastasi/dartqc
    def write_json(self, file_name, data_indent=0, attribute_indent=4):
        """Write ``self.data`` and ``self.attributes`` to two JSON files.

        The files are ``<file_name>_data.json`` and ``<file_name>_attr.json``
        inside ``attributes["out_path"]``.

        :param file_name: common prefix of both output files
        :param data_indent: ``indent`` passed to ``json.dump`` for the data file
        :param attribute_indent: ``indent`` passed to ``json.dump`` for the attribute file
        """

        out_dir = self.attributes["out_path"]
        data_file = os.path.abspath(os.path.join(out_dir, file_name + "_data.json"))
        attribute_file = os.path.abspath(os.path.join(out_dir, file_name + "_attr.json"))

        stamp("Writing data to JSON")
        stamp("Data file:", data_file)
        stamp("Attribute file:", attribute_file)

        with open(data_file, "w") as handle:
            json.dump(self.data, handle, indent=data_indent)

        with open(attribute_file, "w") as handle:
            json.dump(self.attributes, handle, indent=attribute_indent)
コード例 #30
0
ファイル: DartWriter.py プロジェクト: esteinig/dartQC
    def write_json(self, file_name, data_indent=0, attribute_indent=4):
        """Write ``self.data`` and ``self.attributes`` to two JSON files.

        The files are ``<file_name>_data.json`` and ``<file_name>_attr.json``
        inside ``attributes["out_path"]``.

        :param file_name: common prefix of both output files
        :param data_indent: ``indent`` passed to ``json.dump`` for the data file
        :param attribute_indent: ``indent`` passed to ``json.dump`` for the attribute file
        """

        out_dir = self.attributes["out_path"]
        data_file = os.path.abspath(os.path.join(out_dir, file_name + "_data.json"))
        attribute_file = os.path.abspath(os.path.join(out_dir, file_name + "_attr.json"))

        stamp("Writing data to JSON")
        stamp("Data file:", data_file)
        stamp("Attribute file:", attribute_file)

        with open(data_file, "w") as handle:
            json.dump(self.data, handle, indent=data_indent)

        with open(attribute_file, "w") as handle:
            json.dump(self.attributes, handle, indent=attribute_indent)
コード例 #31
0
ファイル: DartMessages.py プロジェクト: esteinig/dartQC
    def get_filter_message(self, filter, threshold, initial, removed, retained, time=True):
        """Report the outcome of one SNP filter.

        :param filter: name of the filter; shown upper-cased
        :param threshold: threshold shown next to the filter name
        :param initial: value shown as "Initial" (banner only)
        :param removed: value shown as "Removed"
        :param retained: value shown as "Retained"
        :param time: when true, log three lines through ``stamp``; otherwise
                     print the multi-line banner to stdout
        """

        label = filter.upper()

        # Format first, dedent afterwards (same order as the banner was always built).
        banner = """
                SNP Filter
        -------------------------------

        {0} at {1}

        Initial:    {2}
        Removed:    {3}
        Retained:   {4}

        -------------------------------
        """.format(label, threshold, initial, removed, retained)
        banner = textwrap.dedent(banner)

        if not time:
            print(banner)
            return

        stamp("Filtered {0} at {1}".format(label, threshold))
        stamp("Removed {0} SNPs".format(removed))
        stamp("Retained {0} SNPs".format(retained))
コード例 #32
0
ファイル: DartMessages.py プロジェクト: esteinig/dartQC
    def get_redundancy_message(self, type, initial, removed, retained, time=True):
        """Report the outcome of one redundancy step.

        :param type: name of the redundancy step; shown upper-cased
        :param initial: value shown as "Initial" (banner only)
        :param removed: value shown as "Removed"
        :param retained: value shown as "Retained"
        :param time: when true, log three lines through ``stamp``; otherwise
                     print the multi-line banner to stdout
        """

        label = type.upper()

        # Format first, dedent afterwards (same order as the banner was always built).
        banner = """
                  REDUNDANCY
        -------------------------------

        {0}

        Initial:    {1}
        Removed:    {2}
        Retained:   {3}

        -------------------------------
        """.format(label, initial, removed, retained)
        banner = textwrap.dedent(banner)

        if not time:
            print(banner)
            return

        stamp("Redundancy module {0}".format(label))
        stamp("Removed {0} SNPs".format(removed))
        stamp("Retained {0} SNPs".format(retained))
コード例 #33
0
ファイル: DartWriter.py プロジェクト: blanchedanastasi/dartqc
    def write_plink(self, file_name, sep="\t", remove_space=False):
        """Write the calls as PLINK ``.ped`` and ``.map`` files.

        Both files are written to ``attributes["out_path"]``. Each PED row holds
        population, sample name, "0" (paternal), "0" (maternal), "0" (sex),
        "-9" (phenotype) and then the flattened decoded calls of that sample
        for all SNPs in sorted SNP-ID order. Each MAP row is
        ``["0", snp_id, "0", "0"]``.

        :param file_name: output file name without extension
        :param sep: delimiter used in both files
        :param remove_space: when true, whitespace runs in sample and population
                             names are replaced by "_"
        """

        snp_order = sorted(self.data.keys())

        stamp("Decoding calls...")
        snp_rows = [[self.decoding_scheme[encoded] for encoded in self.data[snp_id]["calls"]]
                    for snp_id in snp_order]

        stamp("Transposing calls...")
        # Swap the first two axes: rows per SNP become rows per sample.
        # Assumes every decoded call has the same length (3-D array) - TODO confirm.
        snps_by_sample = numpy.asarray(snp_rows).transpose(1, 0, 2)

        genotypes = [sample.flatten().tolist() for sample in snps_by_sample]

        names = self.attributes["sample_names"]
        pops = [self.attributes["pops"][sample] for sample in names]

        if remove_space:
            names = ["_".join(name.split()) for name in names]
            pops = ["_".join(pop.split()) for pop in pops]

        ped_file = os.path.join(self.attributes["out_path"], file_name + '.ped')
        map_file = os.path.join(self.attributes["out_path"], file_name + '.map')

        paternal = ["0"] * len(names)
        maternal = ["0"] * len(names)
        sex = ["0"] * len(names)
        phenotype = ["-9"] * len(names)

        plink = zip(pops, names, paternal, maternal, sex, phenotype, genotypes)

        stamp("Formatting calls...")

        # Six fixed columns followed by the sample's flattened genotypes.
        ped_data = [list(row[:6]) + list(row[6]) for row in plink]

        stamp("Writing PLINK")
        stamp("PED file:", ped_file)
        stamp("MAP file:", map_file)

        # newline='' lets the csv module control line endings itself; without it
        # an extra "\r" is written on platforms that translate "\n".
        with open(ped_file, 'w', newline='') as ped_out:
            ped_writer = csv.writer(ped_out, delimiter=sep)
            ped_writer.writerows(ped_data)

        # MAP Formatting

        map_data = [["0", snp_id, "0", "0"] for snp_id in snp_order]

        with open(map_file, 'w', newline='') as map_out:
            map_writer = csv.writer(map_out, delimiter=sep)
            map_writer.writerows(map_data)
コード例 #34
0
    def check_concordance(self):
        """Check that the call data and the read count data describe the same samples and SNPs.

        If the sample names differ, the differing names are logged and the
        program exits with status 1. If the number of SNPs differs, both
        data sets are reduced to the SNP IDs they have in common.

        :raises SystemExit: when the two sets of sample names are not equal
        """

        sample_names = set(self.sample_names)
        call_names = set(self.call_names)

        if sample_names != call_names:
            stamp(
                "Sample names from the read count file are not the same as sample names from the data file."
            )
            stamp("Sample difference, present in one but not the other data:")
            # Symmetric difference: list names missing from either side, as the
            # message above says.
            for sample in sample_names.symmetric_difference(call_names):
                stamp(sample)
            # This is a failure, so exit with a non-zero status.
            raise SystemExit(1)
        else:
            stamp(
                "Concordance between sample names in call and count data, all is good."
            )

        if len(self.call_data) != len(self.data):

            call_ids = set(self.call_data.keys())
            data_ids = set(self.data.keys())

            diff = call_ids.difference(data_ids)
            inter = call_ids.intersection(data_ids)

            stamp("Number of SNPs are different, there are:",
                  len(self.call_data), "SNPs in the called set and",
                  len(self.data), "SNPs in the raw set.")

            stamp(len(diff),
                  "SNPs have a different ID. Keeping the intersection of",
                  len(inter), "SNPs...")

            self.data = {k: v for (k, v) in self.data.items() if k in inter}
            self.call_data = {
                k: v
                for (k, v) in self.call_data.items() if k in inter
            }

        # NOTE(review): this can only trigger when both sets have the same number
        # of SNPs but different IDs, in which case no intersection was taken above.
        if set(self.call_data.keys()) != set(self.data.keys()):
            stamp(
                "SNP IDs are not the same, removal not effective, please re-format your data."
            )
コード例 #35
0
    def _read_csv(self):
        """Load the first 30 rows of ``self.file_path``, without a header row, into ``self.top``."""

        path = self.file_path

        stamp("Loading file", path)

        preview = pandas.read_csv(path, header=None, nrows=30)
        self.top = preview
コード例 #36
0
ファイル: DartWriter.py プロジェクト: esteinig/dartQC
    def write_plink(self, file_name, sep="\t", remove_space=False):
        """Write the calls as PLINK ``.ped`` and ``.map`` files.

        Both files are written to ``attributes["out_path"]``. Each PED row holds
        population, sample name, "0" (paternal), "0" (maternal), "0" (sex),
        "-9" (phenotype) and then the flattened decoded calls of that sample
        for all SNPs in sorted SNP-ID order. Each MAP row is
        ``["0", snp_id, "0", "0"]``.

        :param file_name: output file name without extension
        :param sep: delimiter used in both files
        :param remove_space: when true, whitespace runs in sample and population
                             names are replaced by "_"
        """

        snp_order = sorted(self.data.keys())

        stamp("Decoding calls...")
        snp_rows = [[self.decoding_scheme[encoded] for encoded in self.data[snp_id]["calls"]]
                    for snp_id in snp_order]

        # TODO: Update to output actual ACGT values for alleles rather than just A or B -
        # this maintains the most info.

        stamp("Transposing calls...")
        # Swap the first two axes: rows per SNP become rows per sample.
        # Assumes every decoded call has the same length (3-D array) - TODO confirm.
        snps_by_sample = numpy.asarray(snp_rows).transpose(1, 0, 2)

        genotypes = [sample.flatten().tolist() for sample in snps_by_sample]

        names = self.attributes["sample_names"]
        pops = [self.attributes["pops"][sample] for sample in names]

        if remove_space:
            names = ["_".join(name.split()) for name in names]
            pops = ["_".join(pop.split()) for pop in pops]

        ped_file = os.path.join(self.attributes["out_path"], file_name + '.ped')
        map_file = os.path.join(self.attributes["out_path"], file_name + '.map')

        paternal = ["0"] * len(names)
        maternal = ["0"] * len(names)
        sex = ["0"] * len(names)
        phenotype = ["-9"] * len(names)

        plink = zip(pops, names, paternal, maternal, sex, phenotype, genotypes)

        stamp("Formatting calls...")

        # Six fixed columns followed by the sample's flattened genotypes.
        ped_data = [list(row[:6]) + list(row[6]) for row in plink]

        stamp("Writing PLINK")
        stamp("PED file:", ped_file)
        stamp("MAP file:", map_file)

        # newline='' lets the csv module control line endings itself; without it
        # an extra "\r" is written on platforms that translate "\n".
        with open(ped_file, 'w', newline='') as ped_out:
            ped_writer = csv.writer(ped_out, delimiter=sep)
            ped_writer.writerows(ped_data)

        # MAP Formatting

        map_data = [["0", snp_id, "0", "0"] for snp_id in snp_order]

        with open(map_file, 'w', newline='') as map_out:
            map_writer = csv.writer(map_out, delimiter=sep)
            map_writer.writerows(map_data)
コード例 #37
0
ファイル: DartModules.py プロジェクト: esteinig/dartQC
    def get_data(self, mono="all", comparison="=="):
        """Remove SNPs according to the number of populations in which they are monomorphic.

        :param mono: number of populations to compare each SNP's "mono" value
                     against; "all" means the number of populations; None
                     returns the data unfiltered
        :param comparison: one of "==", ">=", "<="; a SNP is removed when
                           ``data["mono"] <comparison> mono`` holds
        :return: tuple ``(filtered_data, attributes)``
        :raises ValueError: if ``comparison`` is not one of the three operators
        """

        stamp("Initialised Population Module")

        if mono is None:
            stamp("No filter specified, returning data.")
            return self.data, self.attributes

        stamp("Indexing monomorphic SNPs in each population")
        self._calculate_monomorphics()

        for pop, indices in self.populations.items():
            stamp("There are", len(indices), "samples in population", pop)

        for pop, monomorphs in self.monomorphics.items():
            stamp("There are", len(monomorphs), "monomorphic SNPs in population", pop)

        stamp("Filtering SNPs that are monomorphic in", mono, "populations.")

        # If threshold is string 'all', set to all populations.
        if mono == "all":
            mono = len(self.populations)

        # Pick the predicate once, then apply it in a single pass.
        if comparison == "==":
            def matches(count):
                return count == mono
        elif comparison == ">=":
            def matches(count):
                return count >= mono
        elif comparison == "<=":
            def matches(count):
                return count <= mono
        else:
            raise ValueError("Comparison must be one of: <=, >=, ==")

        filtered = {snp: data for snp, data in self.data.items() if matches(data["mono"])}

        filtered_data = {snp: data for snp, data in self.data.items() if snp not in filtered}

        stamp("Filtered", len(filtered), "SNPs.")

        attributes = self._log_monomorphic(self.attributes, filtered_data, mono)

        return filtered_data, attributes
コード例 #38
0
ファイル: DartPrepare.py プロジェクト: esteinig/dartQC
    def _read_csv(self):
        """Load the first 30 rows of ``self.file_path``, without a header row, into ``self.top``."""

        path = self.file_path

        stamp("Loading file", path)

        preview = pandas.read_csv(path, header=None, nrows=30)
        self.top = preview
コード例 #39
0
    def get_data(self, mono="all", comparison="=="):
        """Remove SNPs according to the number of populations in which they are monomorphic.

        :param mono: number of populations to compare each SNP's "mono" value
                     against; "all" means the number of populations; None
                     returns the data unfiltered
        :param comparison: one of "==", ">=", "<="; a SNP is removed when
                           ``data["mono"] <comparison> mono`` holds
        :return: tuple ``(filtered_data, attributes)``
        :raises ValueError: if ``comparison`` is not one of the three operators
        """

        stamp("Initialised Population Module")

        if mono is None:
            stamp("No filter specified, returning data.")
            return self.data, self.attributes

        stamp("Indexing monomorphic SNPs in each population")
        self._calculate_monomorphics()

        for pop, indices in self.populations.items():
            stamp("There are", len(indices), "samples in population", pop)

        for pop, monomorphs in self.monomorphics.items():
            stamp("There are", len(monomorphs), "monomorphic SNPs in population", pop)

        stamp("Filtering SNPs that are monomorphic in", mono, "populations.")

        # If threshold is string 'all', set to all populations.
        if mono == "all":
            mono = len(self.populations)

        # Pick the predicate once, then apply it in a single pass.
        if comparison == "==":
            def matches(count):
                return count == mono
        elif comparison == ">=":
            def matches(count):
                return count >= mono
        elif comparison == "<=":
            def matches(count):
                return count <= mono
        else:
            raise ValueError("Comparison must be one of: <=, >=, ==")

        filtered = {snp: data for snp, data in self.data.items() if matches(data["mono"])}

        filtered_data = {snp: data for snp, data in self.data.items() if snp not in filtered}

        stamp("Filtered", len(filtered), "SNPs.")

        attributes = self._log_monomorphic(self.attributes, filtered_data, mono)

        return filtered_data, attributes
コード例 #40
0
ファイル: DartProcessor.py プロジェクト: esteinig/dartQC
    def check_concordance(self):
        """
        Check that the sample names and SNP IDs of the two loaded data sets agree.

        self.sample_names and self.call_names must be equal as sets; otherwise every name that
        occurs in only one of them is logged and a SimpleException is raised.

        If self.call_data and self.data hold a different number of SNPs, both are reduced to
        the SNP IDs they have in common. If the numbers are equal but the IDs are not, nothing
        is removed and only a warning is logged.

        :raises SimpleException: if the two sets of sample names differ
        """

        sample_set = set(self.sample_names)
        call_set = set(self.call_names)

        if sample_set != call_set:
            stamp("Sample names from the read count file are not the same as sample names from the data file.")
            stamp("Sample difference, present in one but not the other data:")
            # Symmetric difference: a one-sided difference prints nothing when the extra
            # names exist only in the call data. Sorted by str for a stable listing.
            for sample in sorted(sample_set ^ call_set, key=str):
                stamp(sample)

            raise SimpleException("Sample names in data & read count files don't match.\n"
                                  + "\t\t- Check if the read_counts (and data) sample row is set correctly")

        stamp("Concordance between sample names in call and count data, all is good.")

        if len(self.call_data) != len(self.data):

            call_ids = set(self.call_data.keys())
            raw_ids = set(self.data.keys())

            diff = call_ids - raw_ids
            inter = call_ids & raw_ids

            stamp("Number of SNPs are different, there are:", len(self.call_data), "SNPs in the called set and",
                  len(self.data), "SNPs in the raw set.")

            stamp(len(diff), "SNPs have a different ID. Keeping the intersection of", len(inter), "SNPs...")

            self.data = {k: v for (k, v) in self.data.items() if k in inter}
            self.call_data = {k: v for (k, v) in self.call_data.items() if k in inter}

        # Only reachable with a mismatch when both sets have the same size but different IDs,
        # because the branch above leaves identical key sets behind.
        if set(self.call_data.keys()) != set(self.data.keys()):
            stamp("SNP IDs are not the same, removal not effective, please re-format your data.")
コード例 #41
0
    def filter_read_counts(self, threshold=7):
        """
        Set calls to missing ("-") wherever the summed read counts do not exceed a threshold.

        1. Transform read count matrix to numpy array, ordered by sorted SNP ID (keys of self.data).
        2. Sum-collapse replicate columns (self.replicates[sample]) for each sample in self.call_names
        3. Regroup the collapsed counts and assign each SNP ID its per-sample counts in a dictionary
        4. For each SNP in the dictionary, construct a boolean vector with True if the sum of the allele
           counts of a sample is greater than the threshold value, otherwise False
        5. Use this vector in the same iteration to assign missing to all calls in self.call_data
           whose entry is False

        Totals and the threshold used are stored under self.call_attributes["modules"][self.name].

        :param threshold: calls whose summed read counts are <= this value are replaced with "-"
        """

        self.check_concordance()
        call_missing = self.get_missing()
        stamp("Number of missing in call data:", call_missing)

        snp_order = sorted(self.data.keys())

        stamp("Finding replicate columns...")

        # presumably populates self.replicates, which is read below - confirm in get_replicates
        self.get_replicates()

        stamp("Ordering count data by SNPs...")

        count_array = numpy.asarray([self.data[snp]["calls"] for snp in snp_order])

        stamp("Sum-collapsing replicates...")

        columns = [
            numpy.sum(count_array[:, self.replicates[sample]], axis=1).tolist()
            for sample in self.call_names
        ]

        # Transpose: one list per sample becomes one tuple per row of count_array
        reduced_array = list(zip(*columns))

        reduced_counts = {snp: reduced_array[i] for i, snp in enumerate(snp_order)}

        replaced = 0
        total = 0

        stamp("Replacing low counts with missing...")

        for snp, sample_counts in reduced_counts.items():
            # True keeps the call, False (sum <= threshold) replaces it with missing
            keep = [
                not (sum(allele_counts) <= threshold)
                for allele_counts in sample_counts
            ]
            self.call_data[snp]["calls"] = [
                call if keep[i] else "-"
                for i, call in enumerate(self.call_data[snp]["calls"])
            ]

            replaced += keep.count(False)
            total += len(keep)

        # Calls that were already missing before filtering are not counted as newly replaced
        replaced -= call_missing

        # Guard the percentage so a run without any calls does not raise ZeroDivisionError
        percent = (replaced / total) * 100 if total else 0.0

        stamp("Pre-processing silenced {r}/{t} calls {p}".format(
            r=replaced, t=total, p=format(percent, ".2f")))

        self.call_attributes["modules"][self.name]["results"] = {
            "total_calls": total,
            "replaced_calls": replaced,
            "before_missing": call_missing,
            "after_missing": call_missing + replaced
        }

        self.call_attributes["modules"][self.name]["settings"] = {
            "read_count_sum_threshold": threshold
        }
コード例 #42
0
ファイル: DartProcessor.py プロジェクト: esteinig/dartQC
    def filter_read_counts(self, threshold=None):

        """
        Set calls to missing ("-") wherever the summed read counts do not exceed a threshold.

        1. Transform read count matrix to numpy array, ordered by sorted SNP ID (keys of self.data).
        2. Sum-collapse replicate columns (self.replicates[sample]) for each sample in self.call_names
        3. Regroup the collapsed counts and assign each SNP ID its per-sample counts in a dictionary
        4. For each SNP in the dictionary, construct a boolean vector with True if the sum of the allele
           counts of a sample is greater than the threshold value, otherwise False
        5. Use this vector in the same iteration to build a copy of the call data in which every call
           whose entry is False is missing

        Steps 4 and 5 run once per threshold value. Only the results of the first threshold are stored
        in self.call_data, self.call_attributes and self.filtered; the results of all thresholds are
        passed to DartGraphs.create_plots when self.graph is set. Without self.graph only the first
        threshold is evaluated.

        :param threshold: list of read count sum thresholds (calls with a sum <= the value are replaced);
                          a single number is also accepted, None means [7]
        :raises SimpleException: if a row of the read count array has fewer than 2 entries
        """

        # None replaces the former mutable default; a bare number is wrapped so it can be indexed
        if threshold is None:
            threshold = [7]
        elif isinstance(threshold, (int, float)):
            threshold = [threshold]

        self.check_concordance()
        call_missing = self.get_missing()
        stamp("Number of missing in call data:", call_missing)

        snp_order = sorted(self.data.keys())

        stamp("Finding replicate columns...")

        # presumably populates self.replicates, which is read below - confirm in get_replicates
        self.get_replicates()

        stamp("Ordering count data by SNPs...")

        counts = [self.data[snp]["calls"] for snp in snp_order]

        count_array = numpy.asarray(counts)

        stamp("Sum-collapsing replicates...")

        for idx, allele_rows in enumerate(count_array):
            if len(allele_rows) < 2:
                raise SimpleException("Invalid read counts data for allele " + snp_order[idx] + " - is there only 1 row?")

        columns = [numpy.sum(count_array[:, self.replicates[sample]], axis=1).tolist() for sample in self.call_names]

        # Transpose: one list per sample becomes one tuple per row of count_array
        reduced_array = list(zip(*columns))

        reduced_counts = {snp: reduced_array[i] for i, snp in enumerate(snp_order)}

        stamp("Replacing low counts with missing...")

        all_call_data = []
        all_call_attrs = []
        all_filtered = []

        # If not graphing there is no point filtering all values given, so just take the first threshold value
        if not self.graph:
            threshold = [threshold[0]]
        else:
            DartGraphs.create_static_plots(self.call_data, self.data, self.out_path, self.project)
            DartGraphs.create_plots(self.call_data, self.data, self.call_attributes, "original", self.out_path, self.project, "red")

        for call_thresh in threshold:
            replaced = 0
            total = 0
            call_data = {}
            # Shallow copy: only the "modules" entry is replaced, all other values are shared
            call_attrs = copy(self.call_attributes)
            call_attrs["modules"] = {self.name: {}}
            filtered = {}

            for snp, sample_counts in reduced_counts.items():
                # True keeps the call, False (sum <= threshold) replaces it with missing
                keep = [not (sum(allele_counts) <= call_thresh) for allele_counts in sample_counts]
                call_data[snp] = copy(self.call_data[snp])
                call_data[snp]["calls"] = [call if keep[i] else "-"
                                           for i, call in enumerate(self.call_data[snp]["calls"])]

                total += len(keep)
                replaced += keep.count(False)

                filtered[snp] = keep

            # Calls that were already missing before filtering are not counted as newly replaced
            replaced -= call_missing

            # Guard the percentage so a run without any calls does not raise ZeroDivisionError
            percent = (replaced / total) * 100 if total else 0.0

            stamp("Pre-processing silenced {r}/{t} calls {p}% using call threshold {c}".format(
                r=replaced, t=total, p=format(percent, ".2f"), c=call_thresh))

            call_attrs["modules"][self.name]["results"] = {
                "total_calls": total,
                "replaced_calls": replaced,
                "before_missing": call_missing,
                "after_missing": call_missing + replaced
            }

            call_attrs["modules"][self.name]["settings"] = {
                "read_count_sum_threshold": call_thresh
            }

            all_call_attrs.append(call_attrs)
            all_call_data.append(call_data)
            all_filtered.append(filtered)

        # The first threshold is the one the rest of the pipeline continues with
        self.call_data = all_call_data[0]
        self.call_attributes = all_call_attrs[0]
        self.filtered = all_filtered[0]

        if self.graph:
            DartGraphs.create_plots(all_call_data, self.data, all_call_attrs, "threshold", self.out_path, self.project,
                                    "orange", legend=[("Threshold " + str(thresh)) for thresh in threshold])